Total coverage: 173167 (15%) of 1212641
48 48 2 6 23 5 6 55 3 39 10 3 15 21 22 40 13 26 9 18 18 7 5 30 6 6 3 3 6 14 14 36 3 7 27 34 7 4 10 14 21 2 2 68 2 66 23 22 39 1 61 48 2 2 4 2 40 39 13 26 36 1 2 39 39 41 50 23 2 17 5 4 26 28 59 11 12 38 50 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 
475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 // SPDX-License-Identifier: GPL-2.0 /* * fs/timerfd.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * * * Thanks to Thomas Gleixner for code reviews and useful comments. * */ #include <linux/alarmtimer.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/time.h> #include <linux/hrtimer.h> #include <linux/anon_inodes.h> #include <linux/timerfd.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/rcupdate.h> #include <linux/time_namespace.h> struct timerfd_ctx { union { struct hrtimer tmr; struct alarm alarm; } t; ktime_t tintv; ktime_t moffs; wait_queue_head_t wqh; u64 ticks; int clockid; short unsigned expired; short unsigned settime_flags; /* to show in fdinfo */ struct rcu_head rcu; struct list_head clist; spinlock_t cancel_lock; bool might_cancel; }; static LIST_HEAD(cancel_list); static DEFINE_SPINLOCK(cancel_lock); static inline bool isalarm(struct timerfd_ctx *ctx) { return ctx->clockid == CLOCK_REALTIME_ALARM || ctx->clockid == CLOCK_BOOTTIME_ALARM; } /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, * tintv != 0) until the timer is accessed. 
*/ static void timerfd_triggered(struct timerfd_ctx *ctx) { unsigned long flags; spin_lock_irqsave(&ctx->wqh.lock, flags); ctx->expired = 1; ctx->ticks++; wake_up_locked_poll(&ctx->wqh, EPOLLIN); spin_unlock_irqrestore(&ctx->wqh.lock, flags); } static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) { struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, t.tmr); timerfd_triggered(ctx); return HRTIMER_NORESTART; } static void timerfd_alarmproc(struct alarm *alarm, ktime_t now) { struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx, t.alarm); timerfd_triggered(ctx); } /* * Called when the clock was set to cancel the timers in the cancel * list. This will wake up processes waiting on these timers. The * wake-up requires ctx->ticks to be non zero, therefore we increment * it before calling wake_up_locked(). */ void timerfd_clock_was_set(void) { ktime_t moffs = ktime_mono_to_real(0); struct timerfd_ctx *ctx; unsigned long flags; rcu_read_lock(); list_for_each_entry_rcu(ctx, &cancel_list, clist) { if (!ctx->might_cancel) continue; spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->moffs != moffs) { ctx->moffs = KTIME_MAX; ctx->ticks++; wake_up_locked_poll(&ctx->wqh, EPOLLIN); } spin_unlock_irqrestore(&ctx->wqh.lock, flags); } rcu_read_unlock(); } static void timerfd_resume_work(struct work_struct *work) { timerfd_clock_was_set(); } static DECLARE_WORK(timerfd_work, timerfd_resume_work); /* * Invoked from timekeeping_resume(). Defer the actual update to work so * timerfd_clock_was_set() runs in task context. 
*/ void timerfd_resume(void) { schedule_work(&timerfd_work); } static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { ctx->might_cancel = false; spin_lock(&cancel_lock); list_del_rcu(&ctx->clist); spin_unlock(&cancel_lock); } } static void timerfd_remove_cancel(struct timerfd_ctx *ctx) { spin_lock(&ctx->cancel_lock); __timerfd_remove_cancel(ctx); spin_unlock(&ctx->cancel_lock); } static bool timerfd_canceled(struct timerfd_ctx *ctx) { if (!ctx->might_cancel || ctx->moffs != KTIME_MAX) return false; ctx->moffs = ktime_mono_to_real(0); return true; } static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) { spin_lock(&ctx->cancel_lock); if ((ctx->clockid == CLOCK_REALTIME || ctx->clockid == CLOCK_REALTIME_ALARM) && (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { if (!ctx->might_cancel) { ctx->might_cancel = true; spin_lock(&cancel_lock); list_add_rcu(&ctx->clist, &cancel_list); spin_unlock(&cancel_lock); } } else { __timerfd_remove_cancel(ctx); } spin_unlock(&ctx->cancel_lock); } static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) { ktime_t remaining; if (isalarm(ctx)) remaining = alarm_expires_remaining(&ctx->t.alarm); else remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr); return remaining < 0 ? 0: remaining; } static int timerfd_setup(struct timerfd_ctx *ctx, int flags, const struct itimerspec64 *ktmr) { enum hrtimer_mode htmode; ktime_t texp; int clockid = ctx->clockid; htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; texp = timespec64_to_ktime(ktmr->it_value); ctx->expired = 0; ctx->ticks = 0; ctx->tintv = timespec64_to_ktime(ktmr->it_interval); if (isalarm(ctx)) { alarm_init(&ctx->t.alarm, ctx->clockid == CLOCK_REALTIME_ALARM ? 
ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); } else { hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); } if (texp != 0) { if (flags & TFD_TIMER_ABSTIME) texp = timens_ktime_to_host(clockid, texp); if (isalarm(ctx)) { if (flags & TFD_TIMER_ABSTIME) alarm_start(&ctx->t.alarm, texp); else alarm_start_relative(&ctx->t.alarm, texp); } else { hrtimer_start(&ctx->t.tmr, texp, htmode); } if (timerfd_canceled(ctx)) return -ECANCELED; } ctx->settime_flags = flags & TFD_SETTIME_FLAGS; return 0; } static int timerfd_release(struct inode *inode, struct file *file) { struct timerfd_ctx *ctx = file->private_data; timerfd_remove_cancel(ctx); if (isalarm(ctx)) alarm_cancel(&ctx->t.alarm); else hrtimer_cancel(&ctx->t.tmr); kfree_rcu(ctx, rcu); return 0; } static __poll_t timerfd_poll(struct file *file, poll_table *wait) { struct timerfd_ctx *ctx = file->private_data; __poll_t events = 0; unsigned long flags; poll_wait(file, &ctx->wqh, wait); spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->ticks) events |= EPOLLIN; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return events; } static ssize_t timerfd_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct timerfd_ctx *ctx = file->private_data; ssize_t res; u64 ticks = 0; if (iov_iter_count(to) < sizeof(ticks)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); if (file->f_flags & O_NONBLOCK || iocb->ki_flags & IOCB_NOWAIT) res = -EAGAIN; else res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); /* * If clock has changed, we do not care about the * ticks and we do not rearm the timer. Userspace must * reevaluate anyway. */ if (timerfd_canceled(ctx)) { ctx->ticks = 0; ctx->expired = 0; res = -ECANCELED; } if (ctx->ticks) { ticks = ctx->ticks; if (ctx->expired && ctx->tintv) { /* * If tintv != 0, this is a periodic timer that * needs to be re-armed. 
We avoid doing it in the timer * callback to avoid DoS attacks specifying a very * short timer period. */ if (isalarm(ctx)) { ticks += alarm_forward_now( &ctx->t.alarm, ctx->tintv) - 1; alarm_restart(&ctx->t.alarm); } else { ticks += hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) - 1; hrtimer_restart(&ctx->t.tmr); } } ctx->expired = 0; ctx->ticks = 0; } spin_unlock_irq(&ctx->wqh.lock); if (ticks) { res = copy_to_iter(&ticks, sizeof(ticks), to); if (!res) res = -EFAULT; } return res; } #ifdef CONFIG_PROC_FS static void timerfd_show(struct seq_file *m, struct file *file) { struct timerfd_ctx *ctx = file->private_data; struct timespec64 value, interval; spin_lock_irq(&ctx->wqh.lock); value = ktime_to_timespec64(timerfd_get_remaining(ctx)); interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, "clockid: %d\n" "ticks: %llu\n" "settime flags: 0%o\n" "it_value: (%llu, %llu)\n" "it_interval: (%llu, %llu)\n", ctx->clockid, (unsigned long long)ctx->ticks, ctx->settime_flags, (unsigned long long)value.tv_sec, (unsigned long long)value.tv_nsec, (unsigned long long)interval.tv_sec, (unsigned long long)interval.tv_nsec); } #else #define timerfd_show NULL #endif #ifdef CONFIG_CHECKPOINT_RESTORE static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct timerfd_ctx *ctx = file->private_data; int ret = 0; switch (cmd) { case TFD_IOC_SET_TICKS: { u64 ticks; if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks))) return -EFAULT; if (!ticks) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); if (!timerfd_canceled(ctx)) { ctx->ticks = ticks; wake_up_locked_poll(&ctx->wqh, EPOLLIN); } else ret = -ECANCELED; spin_unlock_irq(&ctx->wqh.lock); break; } default: ret = -ENOTTY; break; } return ret; } #else #define timerfd_ioctl NULL #endif static const struct file_operations timerfd_fops = { .release = timerfd_release, .poll = timerfd_poll, .read_iter = timerfd_read_iter, .llseek = noop_llseek, .show_fdinfo = timerfd_show, 
.unlocked_ioctl = timerfd_ioctl, }; SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; struct timerfd_ctx *ctx; struct file *file; /* Check the TFD_* constants for consistency. */ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); if ((flags & ~TFD_CREATE_FLAGS) || (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_REALTIME_ALARM && clockid != CLOCK_BOOTTIME && clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; if ((clockid == CLOCK_REALTIME_ALARM || clockid == CLOCK_BOOTTIME_ALARM) && !capable(CAP_WAKE_ALARM)) return -EPERM; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; init_waitqueue_head(&ctx->wqh); spin_lock_init(&ctx->cancel_lock); ctx->clockid = clockid; if (isalarm(ctx)) alarm_init(&ctx->t.alarm, ctx->clockid == CLOCK_REALTIME_ALARM ? ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); else hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS); ctx->moffs = ktime_mono_to_real(0); ufd = get_unused_fd_flags(flags & TFD_SHARED_FCNTL_FLAGS); if (ufd < 0) { kfree(ctx); return ufd; } file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } fd_install(ufd, file); return ufd; } static int do_timerfd_settime(int ufd, int flags, const struct itimerspec64 *new, struct itimerspec64 *old) { struct timerfd_ctx *ctx; int ret; if ((flags & ~TFD_SETTIME_FLAGS) || !itimerspec64_valid(new)) return -EINVAL; CLASS(fd, f)(ufd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &timerfd_fops) return -EINVAL; ctx = fd_file(f)->private_data; if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) return -EPERM; timerfd_setup_cancel(ctx, flags); /* * We need to stop the existing timer before reprogramming * it to the new values. 
*/ for (;;) { spin_lock_irq(&ctx->wqh.lock); if (isalarm(ctx)) { if (alarm_try_to_cancel(&ctx->t.alarm) >= 0) break; } else { if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0) break; } spin_unlock_irq(&ctx->wqh.lock); if (isalarm(ctx)) hrtimer_cancel_wait_running(&ctx->t.alarm.timer); else hrtimer_cancel_wait_running(&ctx->t.tmr); } /* * If the timer is expired and it's periodic, we need to advance it * because the caller may want to know the previous expiration time. * We do not update "ticks" and "expired" since the timer will be * re-programmed again in the following timerfd_setup() call. */ if (ctx->expired && ctx->tintv) { if (isalarm(ctx)) alarm_forward_now(&ctx->t.alarm, ctx->tintv); else hrtimer_forward_now(&ctx->t.tmr, ctx->tintv); } old->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); old->it_interval = ktime_to_timespec64(ctx->tintv); /* * Re-program the timer to the new value ... */ ret = timerfd_setup(ctx, flags, new); spin_unlock_irq(&ctx->wqh.lock); return ret; } static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) { struct timerfd_ctx *ctx; CLASS(fd, f)(ufd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &timerfd_fops) return -EINVAL; ctx = fd_file(f)->private_data; spin_lock_irq(&ctx->wqh.lock); if (ctx->expired && ctx->tintv) { ctx->expired = 0; if (isalarm(ctx)) { ctx->ticks += alarm_forward_now( &ctx->t.alarm, ctx->tintv) - 1; alarm_restart(&ctx->t.alarm); } else { ctx->ticks += hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) - 1; hrtimer_restart(&ctx->t.tmr); } } t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); t->it_interval = ktime_to_timespec64(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); return 0; } SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, const struct __kernel_itimerspec __user *, utmr, struct __kernel_itimerspec __user *, otmr) { struct itimerspec64 new, old; int ret; if (get_itimerspec64(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; 
if (otmr && put_itimerspec64(&old, otmr)) return -EFAULT; return ret; } SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct __kernel_itimerspec __user *, otmr) { struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; return put_itimerspec64(&kotmr, otmr) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(timerfd_settime32, int, ufd, int, flags, const struct old_itimerspec32 __user *, utmr, struct old_itimerspec32 __user *, otmr) { struct itimerspec64 new, old; int ret; if (get_old_itimerspec32(&new, utmr)) return -EFAULT; ret = do_timerfd_settime(ufd, flags, &new, &old); if (ret) return ret; if (otmr && put_old_itimerspec32(&old, otmr)) return -EFAULT; return ret; } SYSCALL_DEFINE2(timerfd_gettime32, int, ufd, struct old_itimerspec32 __user *, otmr) { struct itimerspec64 kotmr; int ret = do_timerfd_gettime(ufd, &kotmr); if (ret) return ret; return put_old_itimerspec32(&kotmr, otmr) ? -EFAULT : 0; } #endif
3 2 1 12 9 3 11 1 1 5 12 4 11 1 12 1 12 1 1 12 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "sqpoll.h" #include "fdinfo.h" #include "cancel.h" #include "rsrc.h" #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m, const char *tracking_strategy) { seq_puts(m, "NAPI:\tenabled\n"); seq_printf(m, "napi tracking:\t%s\n", tracking_strategy); seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); if (ctx->napi_prefer_busy_poll) seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); else seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); } static __cold void napi_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { unsigned int mode = READ_ONCE(ctx->napi_track_mode); switch (mode) { case IO_URING_NAPI_TRACKING_INACTIVE: seq_puts(m, "NAPI:\tdisabled\n"); break; case IO_URING_NAPI_TRACKING_DYNAMIC: common_tracking_show_fdinfo(ctx, m, "dynamic"); break; case 
IO_URING_NAPI_TRACKING_STATIC: common_tracking_show_fdinfo(ctx, m, "static"); break; default: seq_printf(m, "NAPI:\tunknown mode (%u)\n", mode); } } #else static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { } #endif static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; unsigned int sq_head = READ_ONCE(r->sq.head); unsigned int sq_tail = READ_ONCE(r->sq.tail); unsigned int cq_head = READ_ONCE(r->cq.head); unsigned int cq_tail = READ_ONCE(r->cq.tail); unsigned int cq_shift = 0; unsigned int sq_shift = 0; unsigned int sq_entries, cq_entries; int sq_pid = -1, sq_cpu = -1; u64 sq_total_time = 0, sq_work_time = 0; unsigned int i; if (ctx->flags & IORING_SETUP_CQE32) cq_shift = 1; if (ctx->flags & IORING_SETUP_SQE128) sq_shift = 1; /* * we may get imprecise sqe and cqe info if uring is actively running * since we get cached_sq_head and cached_cq_tail without uring_lock * and sq_tail and cq_head are changed by userspace. But it's ok since * we usually use these info when it is stuck. 
*/ seq_printf(m, "SqMask:\t0x%x\n", sq_mask); seq_printf(m, "SqHead:\t%u\n", sq_head); seq_printf(m, "SqTail:\t%u\n", sq_tail); seq_printf(m, "CachedSqHead:\t%u\n", data_race(ctx->cached_sq_head)); seq_printf(m, "CqMask:\t0x%x\n", cq_mask); seq_printf(m, "CqHead:\t%u\n", cq_head); seq_printf(m, "CqTail:\t%u\n", cq_tail); seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail)); seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head); sq_entries = min(sq_tail - sq_head, ctx->sq_entries); for (i = 0; i < sq_entries; i++) { unsigned int entry = i + sq_head; struct io_uring_sqe *sqe; unsigned int sq_idx; if (ctx->flags & IORING_SETUP_NO_SQARRAY) break; sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); if (sq_idx > sq_mask) continue; sqe = &ctx->sq_sqes[sq_idx << sq_shift]; seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " "addr:0x%llx, rw_flags:0x%x, buf_index:%d " "user_data:%llu", sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd, sqe->flags, (unsigned long long) sqe->off, (unsigned long long) sqe->addr, sqe->rw_flags, sqe->buf_index, sqe->user_data); if (sq_shift) { u64 *sqeb = (void *) (sqe + 1); int size = sizeof(struct io_uring_sqe) / sizeof(u64); int j; for (j = 0; j < size; j++) { seq_printf(m, ", e%d:0x%llx", j, (unsigned long long) *sqeb); sqeb++; } } seq_printf(m, "\n"); } seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head; struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", entry & cq_mask, cqe->user_data, cqe->res, cqe->flags); if (cq_shift) seq_printf(m, ", extra1:%llu, extra2:%llu\n", cqe->big_cqe[0], cqe->big_cqe[1]); seq_printf(m, "\n"); } if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; /* * sq->thread might be NULL if we raced with the sqpoll * thread termination. 
*/ if (sq->thread) { sq_pid = sq->task_pid; sq_cpu = sq->sq_cpu; getrusage(sq->thread, RUSAGE_SELF, &sq_usage); sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000 + sq_usage.ru_stime.tv_usec); sq_work_time = sq->work_time; } } seq_printf(m, "SqThread:\t%d\n", sq_pid); seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu); seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); for (i = 0; i < ctx->file_table.data.nr; i++) { struct file *f = NULL; if (ctx->file_table.data.nodes[i]) f = io_slot_file(ctx->file_table.data.nodes[i]); if (f) { seq_printf(m, "%5u: ", i); seq_file_path(m, f, " \t\n\\"); seq_puts(m, "\n"); } } seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); for (i = 0; i < ctx->buf_table.nr; i++) { struct io_mapped_ubuf *buf = NULL; if (ctx->buf_table.nodes[i]) buf = ctx->buf_table.nodes[i]->buf; if (buf) seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len); else seq_printf(m, "%5u: <none>\n", i); } seq_puts(m, "PollList:\n"); for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; hlist_for_each_entry(req, &hb->list, hash_node) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, task_work_pending(req->tctx->task)); } seq_puts(m, "CqOverflowList:\n"); spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { struct io_uring_cqe *cqe = &ocqe->cqe; seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", cqe->user_data, cqe->res, cqe->flags); } spin_unlock(&ctx->completion_lock); napi_show_fdinfo(ctx, m); } /* * Caller holds a reference to the file already, we don't need to do * anything else to get an extra reference. 
*/ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) { struct io_ring_ctx *ctx = file->private_data; /* * Avoid ABBA deadlock between the seq lock and the io_uring mutex, * since fdinfo case grabs it in the opposite direction of normal use * cases. */ if (mutex_trylock(&ctx->uring_lock)) { __io_uring_show_fdinfo(ctx, m); mutex_unlock(&ctx->uring_lock); } }
168 163 163 5 153 153 153 153 153 153 153 153 153 1 1 1 1 1 1 1 6 10 5 19 1 1 4 1 3 1 1 4 3 4 4 4 4 4 3 3 3 2 5 5 3 3 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 
497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 
997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */
#include <net/genetlink.h>
#include <net/sock.h>
#include <trace/events/devlink.h>
#include "devl_internal.h"

/* One queued element of a formatted message (fmsg).
 *
 * Nest markers carry only @attrtype; name and value items additionally carry
 * @nla_type, @len and @len bytes of payload stored in @value.
 */
struct devlink_fmsg_item {
	struct list_head list;
	int attrtype;
	u8 nla_type;
	u16 len;
	int value[];
};

/* A formatted message: an ordered list of items plus a sticky error code. */
struct devlink_fmsg {
	struct list_head item_list;
	int err; /* first error encountered on some devlink_fmsg_XXX() call */
	bool putting_binary; /* This flag forces enclosing of binary data
			      * in an array brackets. It forces using
			      * of designated API:
			      * devlink_fmsg_binary_pair_nest_start()
			      * devlink_fmsg_binary_pair_nest_end()
			      */
};

/* Allocate a zeroed fmsg with an empty item list; returns NULL on allocation
 * failure.
 */
static struct devlink_fmsg *devlink_fmsg_alloc(void)
{
	struct devlink_fmsg *fmsg;

	fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL);
	if (!fmsg)
		return NULL;

	INIT_LIST_HEAD(&fmsg->item_list);

	return fmsg;
}

/* Free every queued item and then the fmsg itself. @fmsg must not be NULL:
 * it is dereferenced unconditionally.
 */
static void devlink_fmsg_free(struct devlink_fmsg *fmsg)
{
	struct devlink_fmsg_item *item, *tmp;

	list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) {
		list_del(&item->list);
		kfree(item);
	}
	kfree(fmsg);
}

/* State kept for one health reporter.
 *
 * @devlink_port is set only for reporters created through
 * devl_port_health_reporter_create(). @dump_fmsg is the stored dump (NULL
 * when none is stored); @dump_ts holds jiffies and @dump_real_ts holds
 * ktime_get_real_ns() sampled when that dump was taken. @last_recovery_ts
 * holds jiffies of the last completed recovery.
 */
struct devlink_health_reporter {
	struct list_head list;
	void *priv;
	const struct devlink_health_reporter_ops *ops;
	struct devlink *devlink;
	struct devlink_port *devlink_port;
	struct devlink_fmsg *dump_fmsg;
	u64 graceful_period;
	bool auto_recover;
	bool auto_dump;
	u8 health_state;
	u64 dump_ts;
	u64 dump_real_ts;
	u64 error_count;
	u64 recovery_count;
	u64 last_recovery_ts;
};

/* Return the driver private pointer supplied when the reporter was created. */
void *
devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
{
	return reporter->priv;
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_priv);

/* Linear search of @reporter_list for a reporter whose ops->name equals
 * @reporter_name; returns NULL when there is no match.
 */
static struct devlink_health_reporter *
__devlink_health_reporter_find_by_name(struct list_head *reporter_list,
				       const char *reporter_name)
{
	struct devlink_health_reporter *reporter;

	list_for_each_entry(reporter, reporter_list, list)
		if (!strcmp(reporter->ops->name, reporter_name))
			return reporter;
	return NULL;
}

/* Look up a reporter by name on the devlink instance's own reporter list. */
static struct devlink_health_reporter *
devlink_health_reporter_find_by_name(struct devlink *devlink,
				     const char *reporter_name)
{
	return __devlink_health_reporter_find_by_name(&devlink->reporter_list,
						      reporter_name);
}

/* Look up a reporter by name on a port's reporter list. */
static struct devlink_health_reporter *
devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port,
					  const char *reporter_name)
{
	return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list,
						      reporter_name);
}

/* Allocate and initialise a reporter without linking it into any list.
 *
 * A non-zero @graceful_period without a recover op triggers a WARN and
 * ERR_PTR(-EINVAL); allocation failure yields ERR_PTR(-ENOMEM).
 * auto_recover / auto_dump default to "on" exactly when the matching op
 * exists.
 */
static struct devlink_health_reporter *
__devlink_health_reporter_create(struct devlink *devlink,
				 const struct devlink_health_reporter_ops *ops,
				 u64 graceful_period, void *priv)
{
	struct devlink_health_reporter *reporter;

	if (WARN_ON(graceful_period && !ops->recover))
		return ERR_PTR(-EINVAL);

	reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
	if (!reporter)
		return ERR_PTR(-ENOMEM);

	reporter->priv = priv;
	reporter->ops = ops;
	reporter->devlink = devlink;
	reporter->graceful_period = graceful_period;
	reporter->auto_recover = !!ops->recover;
	reporter->auto_dump = !!ops->dump;
	return reporter;
}

/**
 * devl_port_health_reporter_create() - create devlink health reporter for
 *                                      specified port instance
 *
 * @port: devlink_port to which health reports will relate
 * @ops: devlink health reporter ops
 * @graceful_period: min time (in msec) between recovery attempts
 * @priv: driver priv pointer
 *
 * The devlink instance lock of @port->devlink must be held. Returns the new
 * reporter, ERR_PTR(-EEXIST) if the port already has a reporter named
 * @ops->name, or the ERR_PTR produced by __devlink_health_reporter_create().
 */
struct devlink_health_reporter *
devl_port_health_reporter_create(struct devlink_port *port,
				 const struct devlink_health_reporter_ops *ops,
				 u64 graceful_period, void *priv)
{
	struct devlink_health_reporter *reporter;

	devl_assert_locked(port->devlink);

	if (__devlink_health_reporter_find_by_name(&port->reporter_list,
						   ops->name))
		return ERR_PTR(-EEXIST);

	reporter = __devlink_health_reporter_create(port->devlink, ops,
						    graceful_period, priv);
	if (IS_ERR(reporter))
		return reporter;

	reporter->devlink_port = port;
	list_add_tail(&reporter->list, &port->reporter_list);
	return reporter;
}
EXPORT_SYMBOL_GPL(devl_port_health_reporter_create);

/* Variant of devl_port_health_reporter_create() that takes and releases the
 * devlink instance lock itself.
 */
struct devlink_health_reporter *
devlink_port_health_reporter_create(struct devlink_port *port,
				    const struct devlink_health_reporter_ops *ops,
				    u64 graceful_period, void *priv)
{
	struct devlink_health_reporter *reporter;
	struct devlink *devlink = port->devlink;

	devl_lock(devlink);
	reporter = devl_port_health_reporter_create(port, ops,
						    graceful_period, priv);
	devl_unlock(devlink);
	return reporter;
}
EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create);

/**
 * devl_health_reporter_create - create devlink health reporter
 *
 * @devlink: devlink instance which the health reports will relate
 * @ops: devlink health reporter ops
 * @graceful_period: min time (in msec) between recovery attempts
 * @priv: driver priv pointer
 *
 * The devlink instance lock must be held. Returns the new reporter,
 * ERR_PTR(-EEXIST) if @devlink already has a reporter named @ops->name, or
 * the ERR_PTR produced by __devlink_health_reporter_create().
 */
struct devlink_health_reporter *
devl_health_reporter_create(struct devlink *devlink,
			    const struct devlink_health_reporter_ops *ops,
			    u64 graceful_period, void *priv)
{
	struct devlink_health_reporter *reporter;

	devl_assert_locked(devlink);

	if (devlink_health_reporter_find_by_name(devlink, ops->name))
		return ERR_PTR(-EEXIST);

	reporter = __devlink_health_reporter_create(devlink, ops,
						    graceful_period, priv);
	if (IS_ERR(reporter))
		return reporter;

	list_add_tail(&reporter->list, &devlink->reporter_list);
	return reporter;
}
EXPORT_SYMBOL_GPL(devl_health_reporter_create);

/* Variant of devl_health_reporter_create() that takes and releases the
 * devlink instance lock itself.
 */
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
			       const struct devlink_health_reporter_ops *ops,
			       u64 graceful_period, void *priv)
{
	struct devlink_health_reporter *reporter;

	devl_lock(devlink);
	reporter = devl_health_reporter_create(devlink, ops,
					       graceful_period, priv);
	devl_unlock(devlink);
	return reporter;
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_create);

/* Release a reporter together with its stored dump, if any. The NULL check
 * is required because devlink_fmsg_free() dereferences its argument.
 */
static void
devlink_health_reporter_free(struct devlink_health_reporter *reporter)
{
	if (reporter->dump_fmsg)
		devlink_fmsg_free(reporter->dump_fmsg);
	kfree(reporter);
}

/**
 * devl_health_reporter_destroy() - destroy devlink health reporter
 *
 * @reporter: devlink health reporter to destroy
 *
 * The devlink instance lock must be held. Unlinks the reporter from its list
 * and frees it.
 */
void
devl_health_reporter_destroy(struct devlink_health_reporter *reporter)
{
	devl_assert_locked(reporter->devlink);

	list_del(&reporter->list);
	devlink_health_reporter_free(reporter);
}
EXPORT_SYMBOL_GPL(devl_health_reporter_destroy);

/* Variant of devl_health_reporter_destroy() that takes the devlink instance
 * lock itself. The devlink pointer is cached first because @reporter is
 * freed before the unlock.
 */
void
devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
{
	struct devlink *devlink = reporter->devlink;

	devl_lock(devlink);
	devl_health_reporter_destroy(reporter);
	devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);

/* Append one generic netlink message describing @reporter to @msg.
 *
 * The port index is emitted only for port reporters; graceful period and
 * auto-recover only when a recover op exists; dump timestamps only while a
 * dump is stored; auto-dump only when a dump op exists.
 *
 * Returns 0 on success or -EMSGSIZE (after cancelling the partially built
 * message) when any attribute could not be added.
 */
static int
devlink_nl_health_reporter_fill(struct sk_buff *msg,
				struct devlink_health_reporter *reporter,
				enum devlink_command cmd, u32 portid,
				u32 seq, int flags)
{
	struct devlink *devlink = reporter->devlink;
	struct nlattr *reporter_attr;
	void *hdr;

	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
	if (!hdr)
		return -EMSGSIZE;

	if (devlink_nl_put_handle(msg, devlink))
		goto genlmsg_cancel;

	if (reporter->devlink_port) {
		if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index))
			goto genlmsg_cancel;
	}
	reporter_attr = nla_nest_start_noflag(msg,
					      DEVLINK_ATTR_HEALTH_REPORTER);
	if (!reporter_attr)
		goto genlmsg_cancel;
	if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
			   reporter->ops->name))
		goto reporter_nest_cancel;
	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
		       reporter->health_state))
		goto reporter_nest_cancel;
	if (devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT,
			       reporter->error_count))
		goto reporter_nest_cancel;
	if (devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT,
			       reporter->recovery_count))
		goto reporter_nest_cancel;
	if (reporter->ops->recover &&
	    devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
			       reporter->graceful_period))
		goto reporter_nest_cancel;
	if (reporter->ops->recover &&
	    nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
		       reporter->auto_recover))
		goto reporter_nest_cancel;
	if (reporter->dump_fmsg &&
	    devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,
			       jiffies_to_msecs(reporter->dump_ts)))
		goto reporter_nest_cancel;
	if (reporter->dump_fmsg &&
	    devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
			       reporter->dump_real_ts))
		goto reporter_nest_cancel;
	if (reporter->ops->dump &&
	    nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
		       reporter->auto_dump))
		goto reporter_nest_cancel;

	nla_nest_end(msg, reporter_attr);
	genlmsg_end(msg, hdr);
	return 0;

reporter_nest_cancel:
	nla_nest_cancel(msg, reporter_attr);
genlmsg_cancel:
	genlmsg_cancel(msg, hdr);
	return -EMSGSIZE;
}

/* Resolve the reporter named by DEVLINK_ATTR_HEALTH_REPORTER_NAME.
 *
 * Returns NULL when the name attribute is absent. When the port lookup
 * yields an error pointer the devlink-level reporter list is searched;
 * otherwise the resolved port's reporter list is searched.
 */
static struct devlink_health_reporter *
devlink_health_reporter_get_from_attrs(struct devlink *devlink,
				       struct nlattr **attrs)
{
	struct devlink_port *devlink_port;
	char *reporter_name;

	if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
		return NULL;

	reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
	devlink_port = devlink_port_get_from_attrs(devlink, attrs);
	if (IS_ERR(devlink_port))
		return devlink_health_reporter_find_by_name(devlink,
							    reporter_name);
	else
		return devlink_port_health_reporter_find_by_name(devlink_port,
								 reporter_name);
}

/* Convenience wrapper: resolve the reporter from a doit request's attrs. */
static struct devlink_health_reporter *
devlink_health_reporter_get_from_info(struct devlink *devlink,
				      struct genl_info *info)
{
	return devlink_health_reporter_get_from_attrs(devlink, info->attrs);
}

/* HEALTH_REPORTER_GET doit handler: reply with a single reporter description.
 * Returns -EINVAL when the reporter cannot be resolved, -ENOMEM when the
 * reply cannot be allocated, or the fill / reply result.
 */
int devlink_nl_health_reporter_get_doit(struct sk_buff *skb,
					struct genl_info *info)
{
	struct devlink *devlink = info->user_ptr[0];
	struct devlink_health_reporter *reporter;
	struct sk_buff *msg;
	int err;

	reporter = devlink_health_reporter_get_from_info(devlink, info);
	if (!reporter)
		return -EINVAL;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	err = devlink_nl_health_reporter_fill(msg, reporter,
					      DEVLINK_CMD_HEALTH_REPORTER_GET,
					      info->snd_portid, info->snd_seq,
					      0);
	if (err) {
		nlmsg_free(msg);
		return err;
	}

	return genlmsg_reply(msg, info);
}

/* Dump the reporters of one devlink instance into @msg.
 *
 * Device-level reporters come first, then the reporters of every port. When
 * the request carries DEVLINK_ATTR_PORT_INDEX the device-level pass is
 * skipped, only that port index is walked, and NLM_F_DUMP_FILTERED is added
 * to the message flags.
 *
 * @idx counts reporters across both passes; entries below state->idx are
 * skipped, and on a fill failure state->idx records the entry that did not
 * fit before the error is returned.
 */
static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg,
						   struct devlink *devlink,
						   struct netlink_callback *cb,
						   int flags)
{
	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
	const struct genl_info *info = genl_info_dump(cb);
	struct devlink_health_reporter *reporter;
	unsigned long port_index_end = ULONG_MAX;
	struct nlattr **attrs = info->attrs;
	unsigned long port_index_start = 0;
	struct devlink_port *port;
	unsigned long port_index;
	int idx = 0;
	int err;

	if (attrs && attrs[DEVLINK_ATTR_PORT_INDEX]) {
		port_index_start = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
		port_index_end = port_index_start;
		flags |= NLM_F_DUMP_FILTERED;
		goto per_port_dump;
	}

	list_for_each_entry(reporter, &devlink->reporter_list, list) {
		if (idx < state->idx) {
			idx++;
			continue;
		}
		err = devlink_nl_health_reporter_fill(msg, reporter,
						      DEVLINK_CMD_HEALTH_REPORTER_GET,
						      NETLINK_CB(cb->skb).portid,
						      cb->nlh->nlmsg_seq,
						      flags);
		if (err) {
			state->idx = idx;
			return err;
		}
		idx++;
	}
per_port_dump:
	xa_for_each_range(&devlink->ports, port_index, port,
			  port_index_start, port_index_end) {
		list_for_each_entry(reporter, &port->reporter_list, list) {
			if (idx < state->idx) {
				idx++;
				continue;
			}
			err = devlink_nl_health_reporter_fill(msg, reporter,
							      DEVLINK_CMD_HEALTH_REPORTER_GET,
							      NETLINK_CB(cb->skb).portid,
							      cb->nlh->nlmsg_seq,
							      flags);
			if (err) {
				state->idx = idx;
				return err;
			}
			idx++;
		}
	}

	return 0;
}

/* HEALTH_REPORTER_GET dumpit handler; delegates the per-instance work to
 * devlink_nl_health_reporter_get_dump_one() via devlink_nl_dumpit().
 */
int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb,
					  struct netlink_callback *cb)
{
	return devlink_nl_dumpit(skb, cb,
				 devlink_nl_health_reporter_get_dump_one);
}

/* HEALTH_REPORTER_SET doit handler.
 *
 * All attributes are validated before anything is written: recovery-related
 * attributes require a recover op and the auto-dump attribute requires a
 * dump op, otherwise -EOPNOTSUPP is returned and the reporter is untouched.
 */
int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
					struct genl_info *info)
{
	struct devlink *devlink = info->user_ptr[0];
	struct devlink_health_reporter *reporter;

	reporter = devlink_health_reporter_get_from_info(devlink, info);
	if (!reporter)
		return -EINVAL;

	if (!reporter->ops->recover &&
	    (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
	     info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
		return -EOPNOTSUPP;

	if (!reporter->ops->dump &&
	    info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
		return -EOPNOTSUPP;

	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
		reporter->graceful_period =
			nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);

	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
		reporter->auto_recover =
			nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);

	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
		reporter->auto_dump =
			nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]);

	return 0;
}

/* Send a notification describing the reporter's current state.
 *
 * Only DEVLINK_CMD_HEALTH_REPORTER_RECOVER is expected as @cmd (anything else
 * WARNs). Best effort: nothing is reported to the caller when no notification
 * is needed, when allocation fails, or when the message cannot be filled.
 */
static void devlink_recover_notify(struct devlink_health_reporter *reporter,
				   enum devlink_command cmd)
{
	struct devlink *devlink = reporter->devlink;
	struct devlink_obj_desc desc;
	struct sk_buff *msg;
	int err;

	WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
	ASSERT_DEVLINK_REGISTERED(devlink);

	if (!devlink_nl_notify_need(devlink))
		return;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return;

	err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0);
	if (err) {
		nlmsg_free(msg);
		return;
	}

	devlink_nl_obj_desc_init(&desc, devlink);
	if (reporter->devlink_port)
		devlink_nl_obj_desc_port_set(&desc, reporter->devlink_port);
	devlink_nl_notify_send_desc(devlink, msg, &desc);
}

/* Account a completed recovery: bump the counter and stamp the time in
 * jiffies.
 */
void
devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter)
{
	reporter->recovery_count++;
	reporter->last_recovery_ts = jiffies;
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done);

/* Run the reporter's recover op.
 *
 * Returns 0 immediately if the reporter is already healthy, -EOPNOTSUPP if
 * there is no recover op, or the op's error. On success the recovery is
 * accounted, the state becomes HEALTHY and a notification is sent.
 */
static int
devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
				void *priv_ctx, struct netlink_ext_ack *extack)
{
	int err;

	if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
		return 0;

	if (!reporter->ops->recover)
		return -EOPNOTSUPP;

	err = reporter->ops->recover(reporter, priv_ctx, extack);
	if (err)
		return err;

	devlink_health_reporter_recovery_done(reporter);
	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
	devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);

	return 0;
}

/* Drop the stored dump, if any, and reset the pointer to NULL. */
static void
devlink_health_dump_clear(struct devlink_health_reporter *reporter)
{
	if (!reporter->dump_fmsg)
		return;
	devlink_fmsg_free(reporter->dump_fmsg);
	reporter->dump_fmsg = NULL;
}

/* Capture a dump into reporter->dump_fmsg.
 *
 * Returns 0 without doing anything when there is no dump op or a dump is
 * already stored. The driver's output is wrapped in an object nest. On any
 * failure (allocation, the op itself, or an error latched in the fmsg) the
 * partial dump is discarded and the error returned; on success both dump
 * timestamps are recorded.
 */
static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
				  void *priv_ctx,
				  struct netlink_ext_ack *extack)
{
	int err;

	if (!reporter->ops->dump)
		return 0;

	if (reporter->dump_fmsg)
		return 0;

	reporter->dump_fmsg = devlink_fmsg_alloc();
	if (!reporter->dump_fmsg)
		return -ENOMEM;

	devlink_fmsg_obj_nest_start(reporter->dump_fmsg);

	err = reporter->ops->dump(reporter, reporter->dump_fmsg,
				  priv_ctx, extack);
	if (err)
		goto dump_err;

	devlink_fmsg_obj_nest_end(reporter->dump_fmsg);
	err = reporter->dump_fmsg->err;
	if (err)
		goto dump_err;

	reporter->dump_ts = jiffies;
	reporter->dump_real_ts = ktime_get_real_ns();

	return 0;

dump_err:
	devlink_health_dump_clear(reporter);
	return err;
}

/* Report an error on @reporter.
 *
 * Always counts the error, moves the reporter to the ERROR state and sends a
 * notification. With auto_recover enabled the call then returns -ECANCELED
 * if the reporter was not healthy beforehand, or if a previous recovery
 * exists and time_is_after_jiffies() holds for last_recovery_ts plus the
 * graceful period. Otherwise an automatic dump (when enabled; its result is
 * ignored) and an automatic recovery (when enabled) are each performed under
 * the devlink instance lock, which this function takes itself.
 *
 * Returns 0, -ECANCELED, or the result of the recovery attempt.
 */
int devlink_health_report(struct devlink_health_reporter *reporter,
			  const char *msg, void *priv_ctx)
{
	enum devlink_health_reporter_state prev_health_state;
	struct devlink *devlink = reporter->devlink;
	unsigned long recover_ts_threshold;
	int ret;

	/* write a log message of the current error */
	WARN_ON(!msg);
	trace_devlink_health_report(devlink, reporter->ops->name, msg);
	reporter->error_count++;
	prev_health_state = reporter->health_state;
	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
	devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);

	/* abort if the previous error wasn't recovered */
	recover_ts_threshold = reporter->last_recovery_ts +
			       msecs_to_jiffies(reporter->graceful_period);
	if (reporter->auto_recover &&
	    (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
	     (reporter->last_recovery_ts && reporter->recovery_count &&
	      time_is_after_jiffies(recover_ts_threshold)))) {
		trace_devlink_health_recover_aborted(devlink,
						     reporter->ops->name,
						     reporter->health_state,
						     jiffies -
						     reporter->last_recovery_ts);
		return -ECANCELED;
	}

	if (reporter->auto_dump) {
		devl_lock(devlink);
		/* store current dump of current error, for later analysis */
		devlink_health_do_dump(reporter, priv_ctx, NULL);
		devl_unlock(devlink);
	}

	if (!reporter->auto_recover)
		return 0;

	devl_lock(devlink);
	ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL);
	devl_unlock(devlink);

	return ret;
}
EXPORT_SYMBOL_GPL(devlink_health_report);

/* Set the reporter's health state directly.
 *
 * Only HEALTHY and ERROR are accepted (anything else WARNs and is ignored).
 * No-op when the state is unchanged; otherwise the change is traced and a
 * notification is sent.
 */
void
devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
				     enum devlink_health_reporter_state state)
{
	if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY &&
		    state != DEVLINK_HEALTH_REPORTER_STATE_ERROR))
		return;

	if (reporter->health_state == state)
		return;

	reporter->health_state = state;
	trace_devlink_health_reporter_state_update(reporter->devlink,
						   reporter->ops->name, state);
	devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update);

/* HEALTH_REPORTER_RECOVER doit handler: user-requested recovery with no
 * private context.
 */
int devlink_nl_health_reporter_recover_doit(struct sk_buff *skb,
					    struct genl_info *info)
{
	struct devlink *devlink = info->user_ptr[0];
	struct devlink_health_reporter *reporter;

	reporter = devlink_health_reporter_get_from_info(devlink, info);
	if (!reporter)
		return -EINVAL;

	return devlink_health_reporter_recover(reporter, NULL, info->extack);
}

/* Latch -EINVAL when a non-binary operation is attempted while a binary pair
 * is open; an error that is already latched is left untouched.
 */
static void devlink_fmsg_err_if_binary(struct devlink_fmsg *fmsg)
{
	if (!fmsg->err && fmsg->putting_binary)
		fmsg->err = -EINVAL;
}

/* Queue a payload-less nest marker of type @attrtype. Does nothing once an
 * error is latched; latches -ENOMEM on allocation failure.
 */
static void devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, int attrtype)
{
	struct devlink_fmsg_item *item;

	if (fmsg->err)
		return;

	item = kzalloc(sizeof(*item), GFP_KERNEL);
	if (!item) {
		fmsg->err = -ENOMEM;
		return;
	}

	item->attrtype = attrtype;
	list_add_tail(&item->list, &fmsg->item_list);
}

/* Open an object nest. */
void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
{
	devlink_fmsg_err_if_binary(fmsg);
	devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start);

/* Queue the generic nest-end marker shared by all nest kinds. */
static void devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
{
	devlink_fmsg_err_if_binary(fmsg);
	devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
}

/* Close an object nest. */
void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
{
	devlink_fmsg_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); #define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN) static void devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) { struct devlink_fmsg_item *item; devlink_fmsg_err_if_binary(fmsg); if (fmsg->err) return; if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) { fmsg->err = -EMSGSIZE; return; } item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL); if (!item) { fmsg->err = -ENOMEM; return; } item->nla_type = DEVLINK_VAR_ATTR_TYPE_NUL_STRING; item->len = strlen(name) + 1; item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; memcpy(&item->value, name, item->len); list_add_tail(&item->list, &fmsg->item_list); } void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); devlink_fmsg_put_name(fmsg, name); } EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START); } EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start); void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) { devlink_fmsg_nest_end(fmsg); devlink_fmsg_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) { devlink_fmsg_arr_pair_nest_start(fmsg, name); fmsg->putting_binary = true; } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start); void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg) { if (fmsg->err) return; if (!fmsg->putting_binary) fmsg->err = -EINVAL; fmsg->putting_binary = false; devlink_fmsg_arr_pair_nest_end(fmsg); } 
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end); static void devlink_fmsg_put_value(struct devlink_fmsg *fmsg, const void *value, u16 value_len, u8 value_nla_type) { struct devlink_fmsg_item *item; if (fmsg->err) return; if (value_len > DEVLINK_FMSG_MAX_SIZE) { fmsg->err = -EMSGSIZE; return; } item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL); if (!item) { fmsg->err = -ENOMEM; return; } item->nla_type = value_nla_type; item->len = value_len; item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; memcpy(&item->value, value, item->len); list_add_tail(&item->list, &fmsg->item_list); } static void devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), DEVLINK_VAR_ATTR_TYPE_FLAG); } static void devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), DEVLINK_VAR_ATTR_TYPE_U8); } void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), DEVLINK_VAR_ATTR_TYPE_U32); } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); static void devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, &value, sizeof(value), DEVLINK_VAR_ATTR_TYPE_U64); } void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) { devlink_fmsg_err_if_binary(fmsg); devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, DEVLINK_VAR_ATTR_TYPE_NUL_STRING); } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, u16 value_len) { if (!fmsg->err && !fmsg->putting_binary) fmsg->err = -EINVAL; devlink_fmsg_put_value(fmsg, value, value_len, DEVLINK_VAR_ATTR_TYPE_BINARY); } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value) { 
devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_bool_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put); void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, u8 value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_u8_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put); void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, u32 value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_u32_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put); void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, u64 value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_u64_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put); void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, const char *value) { devlink_fmsg_pair_nest_start(fmsg, name); devlink_fmsg_string_put(fmsg, value); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, const void *value, u32 value_len) { u32 data_size; u32 offset; devlink_fmsg_binary_pair_nest_start(fmsg, name); for (offset = 0; offset < value_len; offset += data_size) { data_size = value_len - offset; if (data_size > DEVLINK_FMSG_MAX_SIZE) data_size = DEVLINK_FMSG_MAX_SIZE; devlink_fmsg_binary_put(fmsg, value + offset, data_size); } devlink_fmsg_binary_pair_nest_end(fmsg); fmsg->putting_binary = false; } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); static int devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) { int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; u8 tmp; switch (msg->nla_type) { case DEVLINK_VAR_ATTR_TYPE_FLAG: /* Always provide flag data, regardless of its value */ tmp = *(bool *)msg->value; 
return nla_put_u8(skb, attrtype, tmp); case DEVLINK_VAR_ATTR_TYPE_U8: return nla_put_u8(skb, attrtype, *(u8 *)msg->value); case DEVLINK_VAR_ATTR_TYPE_U32: return nla_put_u32(skb, attrtype, *(u32 *)msg->value); case DEVLINK_VAR_ATTR_TYPE_U64: return devlink_nl_put_u64(skb, attrtype, *(u64 *)msg->value); case DEVLINK_VAR_ATTR_TYPE_NUL_STRING: return nla_put_string(skb, attrtype, (char *)&msg->value); case DEVLINK_VAR_ATTR_TYPE_BINARY: return nla_put(skb, attrtype, msg->len, (void *)&msg->value); default: return -EINVAL; } } static int devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, int *start) { struct devlink_fmsg_item *item; struct nlattr *fmsg_nlattr; int err = 0; int i = 0; fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG); if (!fmsg_nlattr) return -EMSGSIZE; list_for_each_entry(item, &fmsg->item_list, list) { if (i < *start) { i++; continue; } switch (item->attrtype) { case DEVLINK_ATTR_FMSG_OBJ_NEST_START: case DEVLINK_ATTR_FMSG_PAIR_NEST_START: case DEVLINK_ATTR_FMSG_ARR_NEST_START: case DEVLINK_ATTR_FMSG_NEST_END: err = nla_put_flag(skb, item->attrtype); break; case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: err = nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, item->nla_type); if (err) break; err = devlink_fmsg_item_fill_data(item, skb); break; case DEVLINK_ATTR_FMSG_OBJ_NAME: err = nla_put_string(skb, item->attrtype, (char *)&item->value); break; default: err = -EINVAL; break; } if (!err) *start = ++i; else break; } nla_nest_end(skb, fmsg_nlattr); return err; } static int devlink_fmsg_snd(struct devlink_fmsg *fmsg, struct genl_info *info, enum devlink_command cmd, int flags) { struct nlmsghdr *nlh; struct sk_buff *skb; bool last = false; int index = 0; void *hdr; int err; if (fmsg->err) return fmsg->err; while (!last) { int tmp_index = index; skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &devlink_nl_family, flags | NLM_F_MULTI, cmd); if 
(!hdr) { err = -EMSGSIZE; goto nla_put_failure; } err = devlink_fmsg_prepare_skb(fmsg, skb, &index); if (!err) last = true; else if (err != -EMSGSIZE || tmp_index == index) goto nla_put_failure; genlmsg_end(skb, hdr); err = genlmsg_reply(skb, info); if (err) return err; } skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = -EMSGSIZE; goto nla_put_failure; } return genlmsg_reply(skb, info); nla_put_failure: nlmsg_free(skb); return err; } static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb, struct netlink_callback *cb, enum devlink_command cmd) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); int index = state->idx; int tmp_index = index; void *hdr; int err; if (fmsg->err) return fmsg->err; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd); if (!hdr) { err = -EMSGSIZE; goto nla_put_failure; } err = devlink_fmsg_prepare_skb(fmsg, skb, &index); if ((err && err != -EMSGSIZE) || tmp_index == index) goto nla_put_failure; state->idx = index; genlmsg_end(skb, hdr); return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); return err; } int devlink_nl_health_reporter_diagnose_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; struct devlink_fmsg *fmsg; int err; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->diagnose) return -EOPNOTSUPP; fmsg = devlink_fmsg_alloc(); if (!fmsg) return -ENOMEM; devlink_fmsg_obj_nest_start(fmsg); err = reporter->ops->diagnose(reporter, fmsg, info->extack); if (err) goto out; devlink_fmsg_obj_nest_end(fmsg); err = devlink_fmsg_snd(fmsg, info, DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); out: devlink_fmsg_free(fmsg); return err; } static struct 
devlink_health_reporter * devlink_health_reporter_get_from_cb_lock(struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); struct devlink_health_reporter *reporter; struct nlattr **attrs = info->attrs; struct devlink *devlink; devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, false); if (IS_ERR(devlink)) return NULL; reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); if (!reporter) { devl_unlock(devlink); devlink_put(devlink); } return reporter; } int devlink_nl_health_reporter_dump_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_health_reporter *reporter; struct devlink *devlink; int err; reporter = devlink_health_reporter_get_from_cb_lock(cb); if (!reporter) return -EINVAL; devlink = reporter->devlink; if (!reporter->ops->dump) { devl_unlock(devlink); devlink_put(devlink); return -EOPNOTSUPP; } if (!state->idx) { err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; state->dump_ts = reporter->dump_ts; } if (!reporter->dump_fmsg || state->dump_ts != reporter->dump_ts) { NL_SET_ERR_MSG(cb->extack, "Dump trampled, please retry"); err = -EAGAIN; goto unlock; } err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb, DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET); unlock: devl_unlock(devlink); devlink_put(devlink); return err; } int devlink_nl_health_reporter_dump_clear_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->dump) return -EOPNOTSUPP; devlink_health_dump_clear(reporter); return 0; } int devlink_nl_health_reporter_test_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_health_reporter *reporter; reporter = 
devlink_health_reporter_get_from_info(devlink, info); if (!reporter) return -EINVAL; if (!reporter->ops->test) return -EOPNOTSUPP; return reporter->ops->test(reporter, info->extack); } /** * devlink_fmsg_dump_skb - Dump sk_buffer structure * @fmsg: devlink formatted message pointer * @skb: pointer to skb * * Dump diagnostic information about sk_buff structure, like headroom, length, * tailroom, MAC, etc. */ void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb) { struct skb_shared_info *sh = skb_shinfo(skb); struct sock *sk = skb->sk; bool has_mac, has_trans; has_mac = skb_mac_header_was_set(skb); has_trans = skb_transport_header_was_set(skb); devlink_fmsg_pair_nest_start(fmsg, "skb"); devlink_fmsg_obj_nest_start(fmsg); devlink_fmsg_put(fmsg, "actual len", skb->len); devlink_fmsg_put(fmsg, "head len", skb_headlen(skb)); devlink_fmsg_put(fmsg, "data len", skb->data_len); devlink_fmsg_put(fmsg, "tail len", skb_tailroom(skb)); devlink_fmsg_put(fmsg, "MAC", has_mac ? skb->mac_header : -1); devlink_fmsg_put(fmsg, "MAC len", has_mac ? skb_mac_header_len(skb) : -1); devlink_fmsg_put(fmsg, "network hdr", skb->network_header); devlink_fmsg_put(fmsg, "network hdr len", has_trans ? skb_network_header_len(skb) : -1); devlink_fmsg_put(fmsg, "transport hdr", has_trans ? 
skb->transport_header : -1); devlink_fmsg_put(fmsg, "csum", (__force u32)skb->csum); devlink_fmsg_put(fmsg, "csum_ip_summed", (u8)skb->ip_summed); devlink_fmsg_put(fmsg, "csum_complete_sw", !!skb->csum_complete_sw); devlink_fmsg_put(fmsg, "csum_valid", !!skb->csum_valid); devlink_fmsg_put(fmsg, "csum_level", (u8)skb->csum_level); devlink_fmsg_put(fmsg, "sw_hash", !!skb->sw_hash); devlink_fmsg_put(fmsg, "l4_hash", !!skb->l4_hash); devlink_fmsg_put(fmsg, "proto", ntohs(skb->protocol)); devlink_fmsg_put(fmsg, "pkt_type", (u8)skb->pkt_type); devlink_fmsg_put(fmsg, "iif", skb->skb_iif); if (sk) { devlink_fmsg_pair_nest_start(fmsg, "sk"); devlink_fmsg_obj_nest_start(fmsg); devlink_fmsg_put(fmsg, "family", sk->sk_type); devlink_fmsg_put(fmsg, "type", sk->sk_type); devlink_fmsg_put(fmsg, "proto", sk->sk_protocol); devlink_fmsg_obj_nest_end(fmsg); devlink_fmsg_pair_nest_end(fmsg); } devlink_fmsg_obj_nest_end(fmsg); devlink_fmsg_pair_nest_end(fmsg); devlink_fmsg_pair_nest_start(fmsg, "shinfo"); devlink_fmsg_obj_nest_start(fmsg); devlink_fmsg_put(fmsg, "tx_flags", sh->tx_flags); devlink_fmsg_put(fmsg, "nr_frags", sh->nr_frags); devlink_fmsg_put(fmsg, "gso_size", sh->gso_size); devlink_fmsg_put(fmsg, "gso_type", sh->gso_type); devlink_fmsg_put(fmsg, "gso_segs", sh->gso_segs); devlink_fmsg_obj_nest_end(fmsg); devlink_fmsg_pair_nest_end(fmsg); } EXPORT_SYMBOL_GPL(devlink_fmsg_dump_skb);
4 2 3 4 3 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk> * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> */ #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ipv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> static unsigned int netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range2 *range = par->targinfo; struct nf_nat_range2 newrange; struct nf_conn *ct; enum ip_conntrack_info ctinfo; union nf_inet_addr new_addr, netmask; unsigned int i; ct = nf_ct_get(skb, &ctinfo); for (i = 0; i < ARRAY_SIZE(range->min_addr.ip6); i++) netmask.ip6[i] = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); if (xt_hooknum(par) == NF_INET_PRE_ROUTING || xt_hooknum(par) == NF_INET_LOCAL_OUT) new_addr.in6 = ipv6_hdr(skb)->daddr; else new_addr.in6 = ipv6_hdr(skb)->saddr; for (i = 0; i < ARRAY_SIZE(new_addr.ip6); i++) { new_addr.ip6[i] &= ~netmask.ip6[i]; new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask.ip6[i]; } newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr = new_addr; newrange.max_addr = new_addr; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; return 
nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par))); } static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) { const struct nf_nat_range2 *range = par->targinfo; if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return -EINVAL; return nf_ct_netns_get(par->net, par->family); } static void netmap_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static unsigned int netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 newrange; WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING && xt_hooknum(par) != NF_INET_POST_ROUTING && xt_hooknum(par) != NF_INET_LOCAL_OUT && xt_hooknum(par) != NF_INET_LOCAL_IN); ct = nf_ct_get(skb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); if (xt_hooknum(par) == NF_INET_PRE_ROUTING || xt_hooknum(par) == NF_INET_LOCAL_OUT) new_ip = ip_hdr(skb)->daddr & ~netmask; else new_ip = ip_hdr(skb)->saddr & ~netmask; new_ip |= mr->range[0].min_ip & netmask; memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = new_ip; newrange.max_addr.ip = new_ip; newrange.min_proto = mr->range[0].min; newrange.max_proto = mr->range[0].max; /* Hand modified range to generic setup. 
*/ return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par))); } static int netmap_tg4_check(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (!(mr->range[0].flags & NF_NAT_RANGE_MAP_IPS)) { pr_debug("bad MAP_IPS.\n"); return -EINVAL; } if (mr->rangesize != 1) { pr_debug("bad rangesize %u.\n", mr->rangesize); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); } static struct xt_target netmap_tg_reg[] __read_mostly = { { .name = "NETMAP", .family = NFPROTO_IPV6, .revision = 0, .target = netmap_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .checkentry = netmap_tg6_checkentry, .destroy = netmap_tg_destroy, .me = THIS_MODULE, }, { .name = "NETMAP", .family = NFPROTO_IPV4, .revision = 0, .target = netmap_tg4, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), .checkentry = netmap_tg4_check, .destroy = netmap_tg_destroy, .me = THIS_MODULE, }, }; static int __init netmap_tg_init(void) { return xt_register_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); } static void netmap_tg_exit(void) { xt_unregister_targets(netmap_tg_reg, ARRAY_SIZE(netmap_tg_reg)); } module_init(netmap_tg_init); module_exit(netmap_tg_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of subnets"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS("ip6t_NETMAP"); MODULE_ALIAS("ipt_NETMAP");
/* NOTE(review): the integer run below looks like coverage-report gutter (line numbers), not C source - confirm before building. */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
// SPDX-License-Identifier: GPL-2.0-or-later
/*  Paravirtualization interfaces
    Copyright (C) 2006 Rusty Russell IBM Corporation

    2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
*/

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
#include <linux/kprobes.h>
#include <linux/pgtable.h>
#include <linux/static_call.h>

#include <asm/bug.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
#include <asm/delay.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/special_insns.h>
#include <asm/tlb.h>
#include <asm/io_bitmap.h>
#include <asm/gsseg.h>
#include <asm/msr.h>

/* stub always returning 0. */
DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text);

/* Prints the boot banner, naming the platform recorded in pv_info.name. */
void __init default_banner(void)
{
	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
	       pv_info.name);
}

#ifdef CONFIG_PARAVIRT_XXL
/* Single-instruction native implementations, defined directly in assembly. */
DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text);
DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif

DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);

/* Enables virt_spin_lock_key when the CPU reports X86_FEATURE_HYPERVISOR. */
void __init native_pv_lock_init(void)
{
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
		static_branch_enable(&virt_spin_lock_key);
}

struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;

/* Default steal-clock implementation: always reports 0; @cpu is unused. */
static u64 native_steal_clock(int cpu)
{
	return 0;
}

DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);

/* Retargets the pv_sched_clock static call to @func. */
void paravirt_set_sched_clock(u64 (*func)(void))
{
	static_call_update(pv_sched_clock, func);
}

/* noinstr wrapper around native_safe_halt(), used as pv_ops.irq.safe_halt. */
static noinstr void pv_native_safe_halt(void)
{
	native_safe_halt();
}

#ifdef CONFIG_PARAVIRT_XXL
/* noinstr wrappers that forward to the native helper of the same name. */
static noinstr void pv_native_write_cr2(unsigned long val)
{
	native_write_cr2(val);
}

static noinstr unsigned long pv_native_read_cr3(void)
{
	return __native_read_cr3();
}

static noinstr void pv_native_write_cr3(unsigned long cr3)
{
	native_write_cr3(cr3);
}

static noinstr unsigned long pv_native_get_debugreg(int regno)
{
	return native_get_debugreg(regno);
}

static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
{
	native_set_debugreg(regno, val);
}
#endif

/* Default platform description; the name is what default_banner() prints. */
struct pv_info pv_info = {
	.name = "bare hardware",
#ifdef CONFIG_PARAVIRT_XXL
	.extra_user_64bit_cs = __USER_CS,
#endif
};

/* 64-bit pagetable entries */
#define PTE_IDENT	__PV_IS_CALLEE_SAVE(_paravirt_ident_64)

/*
 * Default operation table: every hook is initialised to a native_*,
 * pv_native_* or paravirt_nop implementation.  Which groups of hooks
 * exist depends on CONFIG_PARAVIRT_XXL, CONFIG_X86_IOPL_IOPERM,
 * CONFIG_PARAVIRT_SPINLOCKS and CONFIG_SMP, as guarded below.
 */
struct paravirt_patch_template pv_ops = {
	/* Cpu ops. */
	.cpu.io_delay		= native_io_delay,

#ifdef CONFIG_PARAVIRT_XXL
	.cpu.cpuid		= native_cpuid,
	.cpu.get_debugreg	= pv_native_get_debugreg,
	.cpu.set_debugreg	= pv_native_set_debugreg,
	.cpu.read_cr0		= native_read_cr0,
	.cpu.write_cr0		= native_write_cr0,
	.cpu.write_cr4		= native_write_cr4,
	.cpu.read_msr		= native_read_msr,
	.cpu.write_msr		= native_write_msr,
	.cpu.read_msr_safe	= native_read_msr_safe,
	.cpu.write_msr_safe	= native_write_msr_safe,
	.cpu.read_pmc		= native_read_pmc,
	.cpu.load_tr_desc	= native_load_tr_desc,
	.cpu.set_ldt		= native_set_ldt,
	.cpu.load_gdt		= native_load_gdt,
	.cpu.load_idt		= native_load_idt,
	.cpu.store_tr		= native_store_tr,
	.cpu.load_tls		= native_load_tls,
	.cpu.load_gs_index	= native_load_gs_index,
	.cpu.write_ldt_entry	= native_write_ldt_entry,
	.cpu.write_gdt_entry	= native_write_gdt_entry,
	.cpu.write_idt_entry	= native_write_idt_entry,

	.cpu.alloc_ldt		= paravirt_nop,
	.cpu.free_ldt		= paravirt_nop,

	.cpu.load_sp0		= native_load_sp0,

#ifdef CONFIG_X86_IOPL_IOPERM
	.cpu.invalidate_io_bitmap	= native_tss_invalidate_io_bitmap,
	.cpu.update_io_bitmap		= native_tss_update_io_bitmap,
#endif

	.cpu.start_context_switch	= paravirt_nop,
	.cpu.end_context_switch		= paravirt_nop,

	/* Irq ops. */
	.irq.save_fl		= __PV_IS_CALLEE_SAVE(pv_native_save_fl),
	.irq.irq_disable	= __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
	.irq.irq_enable		= __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
#endif /* CONFIG_PARAVIRT_XXL */

	/* Irq HLT ops. */
	.irq.safe_halt		= pv_native_safe_halt,
	.irq.halt		= native_halt,

	/* Mmu ops. */
	.mmu.flush_tlb_user	= native_flush_tlb_local,
	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
	.mmu.flush_tlb_multi	= native_flush_tlb_multi,

	.mmu.exit_mmap		= paravirt_nop,
	.mmu.notify_page_enc_status_changed	= paravirt_nop,

#ifdef CONFIG_PARAVIRT_XXL
	.mmu.read_cr2		= __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
	.mmu.write_cr2		= pv_native_write_cr2,
	.mmu.read_cr3		= pv_native_read_cr3,
	.mmu.write_cr3		= pv_native_write_cr3,

	.mmu.pgd_alloc		= __paravirt_pgd_alloc,
	.mmu.pgd_free		= paravirt_nop,

	.mmu.alloc_pte		= paravirt_nop,
	.mmu.alloc_pmd		= paravirt_nop,
	.mmu.alloc_pud		= paravirt_nop,
	.mmu.alloc_p4d		= paravirt_nop,
	.mmu.release_pte	= paravirt_nop,
	.mmu.release_pmd	= paravirt_nop,
	.mmu.release_pud	= paravirt_nop,
	.mmu.release_p4d	= paravirt_nop,

	.mmu.set_pte		= native_set_pte,
	.mmu.set_pmd		= native_set_pmd,

	.mmu.ptep_modify_prot_start	= __ptep_modify_prot_start,
	.mmu.ptep_modify_prot_commit	= __ptep_modify_prot_commit,

	.mmu.set_pud		= native_set_pud,

	/* The *_val / make_* conversions all use the identity stub. */
	.mmu.pmd_val		= PTE_IDENT,
	.mmu.make_pmd		= PTE_IDENT,

	.mmu.pud_val		= PTE_IDENT,
	.mmu.make_pud		= PTE_IDENT,

	.mmu.set_p4d		= native_set_p4d,

	.mmu.p4d_val		= PTE_IDENT,
	.mmu.make_p4d		= PTE_IDENT,

	.mmu.set_pgd		= native_set_pgd,

	.mmu.pte_val		= PTE_IDENT,
	.mmu.pgd_val		= PTE_IDENT,

	.mmu.make_pte		= PTE_IDENT,
	.mmu.make_pgd		= PTE_IDENT,

	.mmu.enter_mmap		= paravirt_nop,

	.mmu.lazy_mode = {
		.enter		= paravirt_nop,
		.leave		= paravirt_nop,
		.flush		= paravirt_nop,
	},

	.mmu.set_fixmap		= native_set_fixmap,
#endif /* CONFIG_PARAVIRT_XXL */

#if defined(CONFIG_PARAVIRT_SPINLOCKS)
	/* Lock ops. */
#ifdef CONFIG_SMP
	.lock.queued_spin_lock_slowpath	= native_queued_spin_lock_slowpath,
	.lock.queued_spin_unlock	=
				PV_CALLEE_SAVE(__native_queued_spin_unlock),
	.lock.wait			= paravirt_nop,
	.lock.kick			= paravirt_nop,
	.lock.vcpu_is_preempted		=
				PV_CALLEE_SAVE(__native_vcpu_is_preempted),
#endif /* SMP */
#endif
};

#ifdef CONFIG_PARAVIRT_XXL
NOKPROBE_SYMBOL(native_load_idt);
#endif

EXPORT_SYMBOL(pv_ops);
EXPORT_SYMBOL_GPL(pv_info);
/* NOTE(review): the integer run below looks like coverage-report gutter (hit counts, then line numbers), not C source - confirm before building. */
12 9 11 11 10 10 10 5 4 2 2 2 1 10 10 9 9 9 1 3 4 7 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/module.h>

#include <linux/inet_diag.h>
#include <linux/sock_diag.h>

#include <net/inet_sock.h>
#include <net/raw.h>
#include <net/rawv6.h>

#ifdef pr_fmt
# undef pr_fmt
#endif

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

/*
 * Selects the raw-socket hash table for the request's address family.
 *
 * Returns &raw_v4_hashinfo for AF_INET, &raw_v6_hashinfo for AF_INET6
 * (only when CONFIG_IPV6 is enabled), and ERR_PTR(-EINVAL) otherwise.
 */
static struct raw_hashinfo *
raw_get_hashinfo(const struct inet_diag_req_v2 *r)
{
	if (r->sdiag_family == AF_INET) {
		return &raw_v4_hashinfo;
#if IS_ENABLED(CONFIG_IPV6)
	} else if (r->sdiag_family == AF_INET6) {
		return &raw_v6_hashinfo;
#endif
	} else {
		return ERR_PTR(-EINVAL);
	}
}

/*
 * Due to requirement of not breaking user API we can't simply
 * rename @pad field in inet_diag_req_v2 structure, instead
 * use helper to figure it out.
 *
 * Returns the result of raw_v4_match()/raw_v6_match() for @sk against
 * the request; returns false for a non-AF_INET request when IPv6
 * support is compiled out.
 */
static bool raw_lookup(struct net *net, const struct sock *sk,
		       const struct inet_diag_req_v2 *req)
{
	/* Same layout as *req, with the pad byte exposed as sdiag_raw_protocol. */
	struct inet_diag_req_raw *r = (void *)req;

	if (r->sdiag_family == AF_INET)
		return raw_v4_match(net, sk, r->sdiag_raw_protocol,
				    r->id.idiag_dst[0],
				    r->id.idiag_src[0],
				    r->id.idiag_if, 0);
#if IS_ENABLED(CONFIG_IPV6)
	else
		/*
		 * NOTE(review): src/dst are passed in the opposite order to
		 * the v4 call above; the helper prototypes are not visible
		 * here - confirm against their declarations.
		 */
		return raw_v6_match(net, sk, r->sdiag_raw_protocol,
				    (const struct in6_addr *)r->id.idiag_src,
				    (const struct in6_addr *)r->id.idiag_dst,
				    r->id.idiag_if, 0);
#endif
	return false;
}

/*
 * Finds the first raw socket matching the request and takes a reference.
 *
 * Returns the socket with its refcount raised (caller must sock_put()),
 * the error from raw_get_hashinfo(), or ERR_PTR(-ENOENT) when no
 * matching socket could be pinned.
 */
static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
{
	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
	struct hlist_head *hlist;
	struct sock *sk;
	int slot;

	if (IS_ERR(hashinfo))
		return ERR_CAST(hashinfo);

	rcu_read_lock();
	for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
		hlist = &hashinfo->ht[slot];
		sk_for_each_rcu(sk, hlist) {
			if (raw_lookup(net, sk, r)) {
				/*
				 * Grab it and keep until we fill
				 * diag message to be reported, so
				 * caller should call sock_put then.
				 */
				if (refcount_inc_not_zero(&sk->sk_refcnt))
					goto out_unlock;
			}
		}
	}
	sk = ERR_PTR(-ENOENT);
out_unlock:
	rcu_read_unlock();

	return sk;
}

/*
 * Handles a single-socket query: looks the socket up, fills one reply
 * message and unicasts it to the requester.
 *
 * Returns the lookup error, -ENOMEM if the reply cannot be allocated,
 * a negative inet_sk_diag_fill() result, or the result of
 * nlmsg_unicast().
 */
static int raw_diag_dump_one(struct netlink_callback *cb,
			     const struct inet_diag_req_v2 *r)
{
	struct sk_buff *in_skb = cb->skb;
	struct sk_buff *rep;
	struct sock *sk;
	struct net *net;
	int err;

	net = sock_net(in_skb->sk);
	sk = raw_sock_get(net, r);
	if (IS_ERR(sk))
		return PTR_ERR(sk);

	rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
			inet_diag_msg_attrs_size() +
			nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
			GFP_KERNEL);
	if (!rep) {
		sock_put(sk);
		return -ENOMEM;
	}

	err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0,
				netlink_net_capable(in_skb, CAP_NET_ADMIN));
	/* The reference from raw_sock_get() is no longer needed past this point. */
	sock_put(sk);

	if (err < 0) {
		kfree_skb(rep);
		return err;
	}

	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);

	return err;
}

/*
 * Emits one NLM_F_MULTI record for @sk.  Returns 0 without emitting
 * anything when inet_diag_bc_sk() returns false for the socket;
 * otherwise returns the result of inet_sk_diag_fill().
 */
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
			struct netlink_callback *cb,
			const struct inet_diag_req_v2 *r,
			struct nlattr *bc, bool net_admin)
{
	if (!inet_diag_bc_sk(bc, sk))
		return 0;

	return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
}

/*
 * Dump callback: walks the hash table under RCU and emits one record
 * per socket that passes the family and port checks below.
 *
 * The position is read from cb->args[0] (slot) and cb->args[1] (index
 * within the slot) and written back on exit.
 */
static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
			  const struct inet_diag_req_v2 *r)
{
	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
	struct net *net = sock_net(skb->sk);
	struct inet_diag_dump_data *cb_data;
	int num, s_num, slot, s_slot;
	struct hlist_head *hlist;
	struct sock *sk = NULL;
	struct nlattr *bc;

	if (IS_ERR(hashinfo))
		return;

	cb_data = cb->data;
	bc = cb_data->inet_diag_nla_bc;
	s_slot = cb->args[0];
	num = s_num = cb->args[1];

	rcu_read_lock();
	/* s_num only applies to the first slot visited; later slots start at 0. */
	for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
		num = 0;

		hlist = &hashinfo->ht[slot];
		sk_for_each_rcu(sk, hlist) {
			struct inet_sock *inet = inet_sk(sk);

			/* Sockets of other namespaces do not advance num. */
			if (!net_eq(sock_net(sk), net))
				continue;
			if (num < s_num)
				goto next;
			if (sk->sk_family != r->sdiag_family)
				goto next;
			/* A zero port in the request matches any port. */
			if (r->id.idiag_sport != inet->inet_sport &&
			    r->id.idiag_sport)
				goto next;
			if (r->id.idiag_dport != inet->inet_dport &&
			    r->id.idiag_dport)
				goto next;
			/* On failure num is left pointing at this socket. */
			if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
				goto out_unlock;
next:
			num++;
		}
	}

out_unlock:
	rcu_read_unlock();

	cb->args[0] = slot;
	cb->args[1] = num;
}

/* Fills the queue fields of @r from the socket's rmem/wmem accounting. */
static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
			      void *info)
{
	r->idiag_rqueue = sk_rmem_alloc_get(sk);
	r->idiag_wqueue = sk_wmem_alloc_get(sk);
}

#ifdef CONFIG_INET_DIAG_DESTROY
/*
 * Destroy handler: looks up the socket named by the request and calls
 * sock_diag_destroy() on it with ECONNABORTED.
 *
 * Returns the lookup error or the result of sock_diag_destroy().
 */
static int raw_diag_destroy(struct sk_buff *in_skb,
			    const struct inet_diag_req_v2 *r)
{
	struct net *net = sock_net(in_skb->sk);
	struct sock *sk;
	int err;

	sk = raw_sock_get(net, r);
	if (IS_ERR(sk))
		return PTR_ERR(sk);
	err = sock_diag_destroy(sk, ECONNABORTED);
	sock_put(sk);
	return err;
}
#endif

/* inet_diag handler for IPPROTO_RAW, wiring up the callbacks above. */
static const struct inet_diag_handler raw_diag_handler = {
	.owner			= THIS_MODULE,
	.dump			= raw_diag_dump,
	.dump_one		= raw_diag_dump_one,
	.idiag_get_info		= raw_diag_get_info,
	.idiag_type		= IPPROTO_RAW,
	.idiag_info_size	= 0,
#ifdef CONFIG_INET_DIAG_DESTROY
	.destroy		= raw_diag_destroy,
#endif
};

/* Never called; exists only to host the compile-time layout checks below. */
static void __always_unused __check_inet_diag_req_raw(void)
{
	/*
	 * Make sure the two structures are identical,
	 * except the @pad field.
	 */
#define __offset_mismatch(m1, m2)			\
	(offsetof(struct inet_diag_req_v2, m1) !=	\
	 offsetof(struct inet_diag_req_raw, m2))

	BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) !=
		     sizeof(struct inet_diag_req_raw));
	BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family));
	BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol));
	BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext));
	BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol));
	BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states));
	BUILD_BUG_ON(__offset_mismatch(id, id));
#undef __offset_mismatch
}

/* Module entry point: registers the handler with inet_diag. */
static int __init raw_diag_init(void)
{
	return inet_diag_register(&raw_diag_handler);
}

/* Module exit point: unregisters the handler. */
static void __exit raw_diag_exit(void)
{
	inet_diag_unregister(&raw_diag_handler);
}

module_init(raw_diag_init);
module_exit(raw_diag_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
/* NOTE(review): the integer run below looks like coverage-report gutter (hit counts, then line numbers), not C source - confirm before building. */
7 7 4 3 9 4 2 10 4 14 8 7 6 7 11 6 3 11 3 11 14 14 17 17 17 2 2 14 1 1 1 1 20 1 17 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/dcache.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/sock_diag.h>
#include <linux/types.h>
#include <linux/user_namespace.h>
#include <net/af_unix.h>
#include <net/netlink.h>
#include <net/tcp_states.h>
#include <uapi/linux/unix_diag.h>

#include "af_unix.h"

/*
 * Adds UNIX_DIAG_NAME carrying the socket's bound path bytes.
 * Returns 0 when the socket has no address, else the nla_put() result.
 */
static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
{
	/* might or might not have a hash table lock */
	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);

	if (!addr)
		return 0;

	/* Payload length is addr->len minus the offset of sun_path. */
	return nla_put(nlskb, UNIX_DIAG_NAME,
		       addr->len - offsetof(struct sockaddr_un, sun_path),
		       addr->name->sun_path);
}

/*
 * Adds UNIX_DIAG_VFS (inode number and device) when the socket has a
 * dentry.  Returns 0 when it has none, else the nla_put() result.
 */
static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb)
{
	struct dentry *dentry = unix_sk(sk)->path.dentry;

	if (dentry) {
		struct unix_diag_vfs uv = {
			.udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
			.udiag_vfs_dev = dentry->d_sb->s_dev,
		};

		return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), &uv);
	}

	return 0;
}

/*
 * Adds UNIX_DIAG_PEER with the peer socket's inode number.
 * Returns 0 when there is no peer, else the nla_put_u32() result.
 */
static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb)
{
	struct sock *peer;
	int ino;

	peer = unix_peer_get(sk);
	if (peer) {
		ino = sock_i_ino(peer);
		/* Only the inode number is needed; drop the reference right away. */
		sock_put(peer);

		return nla_put_u32(nlskb, UNIX_DIAG_PEER, ino);
	}

	return 0;
}

/*
 * For a socket in TCP_LISTEN state, adds UNIX_DIAG_ICONS: one u32 inode
 * number per skb on the receive queue, taken from the peer of the
 * skb's socket.  Does nothing for sockets in any other state.
 *
 * Returns 0 on success or -EMSGSIZE when the attribute cannot be
 * reserved.
 */
static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb)
{
	struct sk_buff *skb;
	struct nlattr *attr;
	u32 *buf;
	int i;

	if (READ_ONCE(sk->sk_state) == TCP_LISTEN) {
		/* Held across reserve and walk so qlen matches the entries written. */
		spin_lock(&sk->sk_receive_queue.lock);

		attr = nla_reserve(nlskb, UNIX_DIAG_ICONS,
				   sk->sk_receive_queue.qlen * sizeof(u32));
		if (!attr)
			goto errout;

		buf = nla_data(attr);
		i = 0;
		skb_queue_walk(&sk->sk_receive_queue, skb)
			buf[i++] = sock_i_ino(unix_peer(skb->sk));

		spin_unlock(&sk->sk_receive_queue.lock);
	}

	return 0;

errout:
	spin_unlock(&sk->sk_receive_queue.lock);
	return -EMSGSIZE;
}

/*
 * Adds UNIX_DIAG_RQLEN.  For TCP_LISTEN sockets the pair is (receive
 * queue length, sk_max_ack_backlog); otherwise it is
 * (unix_inq_len(), unix_outq_len()).
 */
static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
{
	struct unix_diag_rqlen rql;

	if (READ_ONCE(sk->sk_state) == TCP_LISTEN) {
		rql.udiag_rqueue = skb_queue_len_lockless(&sk->sk_receive_queue);
		rql.udiag_wqueue = sk->sk_max_ack_backlog;
	} else {
		rql.udiag_rqueue = (u32) unix_inq_len(sk);
		rql.udiag_wqueue = (u32) unix_outq_len(sk);
	}

	return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql);
}

/* Adds UNIX_DIAG_UID: the socket's owner uid translated into @user_ns. */
static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb,
			    struct user_namespace *user_ns)
{
	uid_t uid = from_kuid_munged(user_ns, sock_i_uid(sk));
	return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid);
}

/*
 * Builds one SOCK_DIAG_BY_FAMILY message for @sk in @skb.
 *
 * The fixed header is always filled; each optional attribute is added
 * only when its UDIAG_SHOW_* bit is set in req->udiag_show, except
 * UNIX_DIAG_SHUTDOWN, which is added unconditionally.
 *
 * Returns 0 on success.  On any failure the partial message is
 * cancelled and -EMSGSIZE is returned.
 */
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
			struct user_namespace *user_ns,
			u32 portid, u32 seq, u32 flags, int sk_ino)
{
	struct nlmsghdr *nlh;
	struct unix_diag_msg *rep;

	nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep),
			flags);
	if (!nlh)
		return -EMSGSIZE;

	rep = nlmsg_data(nlh);
	rep->udiag_family = AF_UNIX;
	rep->udiag_type = sk->sk_type;
	rep->udiag_state = READ_ONCE(sk->sk_state);
	rep->pad = 0;
	rep->udiag_ino = sk_ino;
	sock_diag_save_cookie(sk, rep->udiag_cookie);

	if ((req->udiag_show & UDIAG_SHOW_NAME) &&
	    sk_diag_dump_name(sk, skb))
		goto out_nlmsg_trim;

	if ((req->udiag_show & UDIAG_SHOW_VFS) &&
	    sk_diag_dump_vfs(sk, skb))
		goto out_nlmsg_trim;

	if ((req->udiag_show & UDIAG_SHOW_PEER) &&
	    sk_diag_dump_peer(sk, skb))
		goto out_nlmsg_trim;

	if ((req->udiag_show & UDIAG_SHOW_ICONS) &&
	    sk_diag_dump_icons(sk, skb))
		goto out_nlmsg_trim;

	if ((req->udiag_show & UDIAG_SHOW_RQLEN) &&
	    sk_diag_show_rqlen(sk, skb))
		goto out_nlmsg_trim;

	if ((req->udiag_show & UDIAG_SHOW_MEMINFO) &&
	    sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO))
		goto out_nlmsg_trim;

	if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, READ_ONCE(sk->sk_shutdown)))
		goto out_nlmsg_trim;

	if ((req->udiag_show & UDIAG_SHOW_UID) &&
	    sk_diag_dump_uid(sk, skb, user_ns))
		goto out_nlmsg_trim;

	nlmsg_end(skb, nlh);
	return 0;

out_nlmsg_trim:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
 * Netlink dump callback: walks every hash bucket of the namespace's
 * unix socket table, emitting one NLM_F_MULTI record per socket whose
 * state bit is set in req->udiag_states and whose inode number is
 * non-zero.
 *
 * The position is read from cb->args[0] (bucket) and cb->args[1]
 * (index within the bucket) and written back on exit.
 * Returns skb->len.
 */
static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int num, s_num, slot, s_slot;
	struct unix_diag_req *req;

	req = nlmsg_data(cb->nlh);

	s_slot = cb->args[0];
	num = s_num = cb->args[1];

	/* s_num only applies to the first bucket visited; later ones start at 0. */
	for (slot = s_slot; slot < UNIX_HASH_SIZE; s_num = 0, slot++) {
		struct sock *sk;

		num = 0;
		spin_lock(&net->unx.table.locks[slot]);
		sk_for_each(sk, &net->unx.table.buckets[slot]) {
			int sk_ino;

			if (num < s_num)
				goto next;

			if (!(req->udiag_states & (1 << READ_ONCE(sk->sk_state))))
				goto next;

			sk_ino = sock_i_ino(sk);
			if (!sk_ino)
				goto next;

			/* On failure num is left pointing at this socket. */
			if (sk_diag_fill(sk, skb, req, sk_user_ns(skb->sk),
					 NETLINK_CB(cb->skb).portid,
					 cb->nlh->nlmsg_seq,
					 NLM_F_MULTI, sk_ino) < 0) {
				spin_unlock(&net->unx.table.locks[slot]);
				goto done;
			}
next:
			num++;
		}
		spin_unlock(&net->unx.table.locks[slot]);
	}
done:
	cb->args[0] = slot;
	cb->args[1] = num;

	return skb->len;
}

/*
 * Linear search of every bucket for a socket whose inode number equals
 * @ino.  Returns the socket with a reference held (sock_hold), or NULL
 * when none matches.
 */
static struct sock *unix_lookup_by_ino(struct net *net, unsigned int ino)
{
	struct sock *sk;
	int i;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock(&net->unx.table.locks[i]);
		sk_for_each(sk, &net->unx.table.buckets[i]) {
			if (ino == sock_i_ino(sk)) {
				sock_hold(sk);
				spin_unlock(&net->unx.table.locks[i]);
				return sk;
			}
		}
		spin_unlock(&net->unx.table.locks[i]);
	}
	return NULL;
}

/*
 * Handles a single-socket query identified by req->udiag_ino.
 *
 * Returns -EINVAL for inode 0, -ENOENT when no socket has that inode,
 * the sock_diag_check_cookie() error on cookie mismatch, -ENOMEM when
 * the reply cannot be allocated, the last sk_diag_fill() error when the
 * reply still does not fit after the retries below, or the result of
 * nlmsg_unicast().
 */
static int unix_diag_get_exact(struct sk_buff *in_skb,
			       const struct nlmsghdr *nlh,
			       struct unix_diag_req *req)
{
	struct net *net = sock_net(in_skb->sk);
	unsigned int extra_len;
	struct sk_buff *rep;
	struct sock *sk;
	int err;

	err = -EINVAL;
	if (req->udiag_ino == 0)
		goto out_nosk;

	sk = unix_lookup_by_ino(net, req->udiag_ino);
	err = -ENOENT;
	if (sk == NULL)
		goto out_nosk;

	err = sock_diag_check_cookie(sk, req->udiag_cookie);
	if (err)
		goto out;

	extra_len = 256;
again:
	err = -ENOMEM;
	rep = nlmsg_new(sizeof(struct unix_diag_msg) + extra_len, GFP_KERNEL);
	if (!rep)
		goto out;

	err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk),
			   NETLINK_CB(in_skb).portid,
			   nlh->nlmsg_seq, 0, req->udiag_ino);
	if (err < 0) {
		/* Retry with 256 more bytes until the extra room reaches PAGE_SIZE. */
		nlmsg_free(rep);
		extra_len += 256;
		if (extra_len >= PAGE_SIZE)
			goto out;

		goto again;
	}
	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);

out:
	if (sk)
		sock_put(sk);
out_nosk:
	return err;
}

/*
 * sock_diag entry point for AF_UNIX.  Returns -EINVAL for messages
 * shorter than struct unix_diag_req; starts a netlink dump when
 * NLM_F_DUMP is set, otherwise answers a single-socket query.
 */
static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{
	int hdrlen = sizeof(struct unix_diag_req);

	if (nlmsg_len(h) < hdrlen)
		return -EINVAL;

	if (h->nlmsg_flags & NLM_F_DUMP) {
		struct netlink_dump_control c = {
			.dump = unix_diag_dump,
		};
		return netlink_dump_start(sock_net(skb->sk)->diag_nlsk, skb, h, &c);
	} else
		return unix_diag_get_exact(skb, h, nlmsg_data(h));
}

/* sock_diag handler registration record for AF_UNIX. */
static const struct sock_diag_handler unix_diag_handler = {
	.owner = THIS_MODULE,
	.family = AF_UNIX,
	.dump = unix_diag_handler_dump,
};

/* Module entry point: registers the AF_UNIX sock_diag handler. */
static int __init unix_diag_init(void)
{
	return sock_diag_register(&unix_diag_handler);
}

/* Module exit point: unregisters the handler. */
static void __exit unix_diag_exit(void)
{
	sock_diag_unregister(&unix_diag_handler);
}

module_init(unix_diag_init);
module_exit(unix_diag_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("UNIX socket monitoring via SOCK_DIAG");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 1 /* AF_LOCAL */);
/* NOTE(review): the integer run below looks like coverage-report gutter (hit counts, then line numbers), not C source - confirm before building. */
10 1 1 1 2 5 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * mpls tunnels	An implementation mpls tunnels using the light weight tunnel
 *		infrastructure
 *
 * Authors:	Roopa Prabhu, <roopa@cumulusnetworks.com>
 */
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/module.h>
#include <linux/mpls.h>
#include <linux/vmalloc.h>
#include <net/ip.h>
#include <net/dst.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/netns/generic.h>
#include <net/ip6_fib.h>
#include <net/route.h>
#include <net/mpls_iptunnel.h>
#include <linux/mpls_iptunnel.h>
#include "internal.h"

/* Netlink attribute policy for the nested MPLS_IPTUNNEL_* attributes. */
static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = {
	[MPLS_IPTUNNEL_DST]	= { .len = sizeof(u32) },
	[MPLS_IPTUNNEL_TTL]	= { .type = NLA_U8 },
};

/* Returns the number of bytes of label stack this encap adds to a packet. */
static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
{
	/* The size of the layer 2.5 labels to be added for this route */
	return en->labels * sizeof(struct mpls_shim_hdr);
}

/*
 * Transmit handler: pushes the route's label stack onto @skb and hands
 * the packet to the neighbour layer on the dst's output device.
 *
 * Returns LWTUNNEL_XMIT_DONE once the packet has been handed to
 * neigh_xmit() (a neigh_xmit() error is only logged), or -EINVAL after
 * freeing the skb on any of the drop conditions.
 */
static int mpls_xmit(struct sk_buff *skb)
{
	struct mpls_iptunnel_encap *tun_encap_info;
	struct mpls_shim_hdr *hdr;
	struct net_device *out_dev;
	unsigned int hh_len;
	unsigned int new_header_size;
	unsigned int mtu;
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = NULL;
	struct rt6_info *rt6 = NULL;
	struct mpls_dev *out_mdev;
	struct net *net;
	int err = 0;
	bool bos;
	int i;
	unsigned int ttl;

	/* Find the output device */
	out_dev = dst->dev;
	net = dev_net(out_dev);

	if (!mpls_output_possible(out_dev) ||
	    !dst->lwtstate || skb_warn_if_lro(skb))
		goto drop;

	skb_forward_csum(skb);

	tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate);

	/* Obtain the ttl using the following set of rules.
	 *
	 * LWT ttl propagation setting:
	 *  - disabled => use default TTL value from LWT
	 *  - enabled  => use TTL value from IPv4/IPv6 header
	 *  - default  =>
	 *   Global ttl propagation setting:
	 *    - disabled => use default TTL value from global setting
	 *    - enabled => use TTL value from IPv4/IPv6 header
	 */
	if (dst->ops->family == AF_INET) {
		if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED)
			ttl = tun_encap_info->default_ttl;
		else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT &&
			 !net->mpls.ip_ttl_propagate)
			ttl = net->mpls.default_ttl;
		else
			ttl = ip_hdr(skb)->ttl;
		rt = dst_rtable(dst);
	} else if (dst->ops->family == AF_INET6) {
		if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED)
			ttl = tun_encap_info->default_ttl;
		else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT &&
			 !net->mpls.ip_ttl_propagate)
			ttl = net->mpls.default_ttl;
		else
			ttl = ipv6_hdr(skb)->hop_limit;
		rt6 = dst_rt6_info(dst);
	} else {
		goto drop;
	}

	/* Verify the destination can hold the packet */
	new_header_size = mpls_encap_size(tun_encap_info);
	mtu = mpls_dev_mtu(out_dev);
	if (mpls_pkt_too_big(skb, mtu - new_header_size))
		goto drop;

	/* Devices without header_ops get no link-layer headroom reserved. */
	hh_len = LL_RESERVED_SPACE(out_dev);
	if (!out_dev->header_ops)
		hh_len = 0;

	/* Ensure there is enough space for the headers in the skb */
	if (skb_cow_head(skb, hh_len + new_header_size))
		goto drop;

	skb_set_inner_protocol(skb, skb->protocol);
	skb_reset_inner_network_header(skb);

	skb_push(skb, new_header_size);

	skb_reset_network_header(skb);

	skb->dev = out_dev;
	skb->protocol = htons(ETH_P_MPLS_UC);

	/* Push the new labels */
	hdr = mpls_hdr(skb);
	bos = true;
	/* Written last-to-first so only the final stack entry is encoded with bos set. */
	for (i = tun_encap_info->labels - 1; i >= 0; i--) {
		hdr[i] = mpls_entry_encode(tun_encap_info->label[i],
					   ttl, 0, bos);
		bos = false;
	}

	mpls_stats_inc_outucastpkts(out_dev, skb);

	/* Pick the neighbour table from the gateway's address family. */
	if (rt) {
		if (rt->rt_gw_family == AF_INET6)
			err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt->rt_gw6,
					 skb);
		else
			err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gw4,
					 skb);
	} else if (rt6) {
		if (ipv6_addr_v4mapped(&rt6->rt6i_gateway)) {
			/* 6PE (RFC 4798) */
			err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt6->rt6i_gateway.s6_addr32[3],
					 skb);
		} else
			err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway,
					 skb);
	}
	if (err)
		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
				    __func__, err);

	return LWTUNNEL_XMIT_DONE;

drop:
	/* Count the drop as a tx error on the output device when it has MPLS state. */
	out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL;
	if (out_mdev)
		MPLS_INC_STATS(out_mdev, tx_errors);
	kfree_skb(skb);
	return -EINVAL;
}

/*
 * Parses the nested MPLS_IPTUNNEL_* attributes in @nla and allocates a
 * new lwtunnel state describing the label stack and TTL policy.
 *
 * On success stores the new state in *ts and returns 0.  Returns the
 * parse error, -EINVAL when MPLS_IPTUNNEL_DST is missing or the label
 * count cannot be determined, -ENOMEM on allocation failure, or the
 * error from the second nla_get_labels() call (with *ts set to NULL).
 */
static int mpls_build_state(struct net *net, struct nlattr *nla,
			    unsigned int family, const void *cfg,
			    struct lwtunnel_state **ts,
			    struct netlink_ext_ack *extack)
{
	struct mpls_iptunnel_encap *tun_encap_info;
	struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
	struct lwtunnel_state *newts;
	u8 n_labels;
	int ret;

	ret = nla_parse_nested_deprecated(tb, MPLS_IPTUNNEL_MAX, nla,
					  mpls_iptunnel_policy, extack);
	if (ret < 0)
		return ret;

	if (!tb[MPLS_IPTUNNEL_DST]) {
		NL_SET_ERR_MSG(extack, "MPLS_IPTUNNEL_DST attribute is missing");
		return -EINVAL;
	}

	/* determine number of labels */
	if (nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
			   &n_labels, NULL, extack))
		return -EINVAL;

	newts = lwtunnel_state_alloc(struct_size(tun_encap_info, label,
						 n_labels));
	if (!newts)
		return -ENOMEM;

	/* Second pass over the same attribute, this time storing the labels. */
	tun_encap_info = mpls_lwtunnel_encap(newts);
	ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], n_labels,
			     &tun_encap_info->labels, tun_encap_info->label,
			     extack);
	if (ret)
		goto errout;

	tun_encap_info->ttl_propagate = MPLS_TTL_PROP_DEFAULT;

	if (tb[MPLS_IPTUNNEL_TTL]) {
		tun_encap_info->default_ttl = nla_get_u8(tb[MPLS_IPTUNNEL_TTL]);
		/* TTL 0 implies propagate from IP header */
		tun_encap_info->ttl_propagate = tun_encap_info->default_ttl ?
			MPLS_TTL_PROP_DISABLED :
			MPLS_TTL_PROP_ENABLED;
	}

	newts->type = LWTUNNEL_ENCAP_MPLS;
	newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
	newts->headroom = mpls_encap_size(tun_encap_info);

	*ts = newts;

	return 0;

errout:
	kfree(newts);
	*ts = NULL;

	return ret;
}

/*
 * Serialises the encap into @skb: always the label stack, plus the TTL
 * attribute when the propagate mode is not MPLS_TTL_PROP_DEFAULT.
 * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
 */
static int mpls_fill_encap_info(struct sk_buff *skb,
				struct lwtunnel_state *lwtstate)
{
	struct mpls_iptunnel_encap *tun_encap_info;

	tun_encap_info = mpls_lwtunnel_encap(lwtstate);

	if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
			   tun_encap_info->label))
		goto nla_put_failure;

	if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT &&
	    nla_put_u8(skb, MPLS_IPTUNNEL_TTL, tun_encap_info->default_ttl))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/*
 * Returns the netlink space needed for this encap: 4 bytes per label,
 * plus a 1-byte TTL attribute when the propagate mode is not
 * MPLS_TTL_PROP_DEFAULT.
 */
static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	struct mpls_iptunnel_encap *tun_encap_info;
	int nlsize;

	tun_encap_info = mpls_lwtunnel_encap(lwtstate);

	nlsize = nla_total_size(tun_encap_info->labels * 4);

	if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT)
		nlsize += nla_total_size(1);

	return nlsize;
}

/*
 * Compares two encaps.  Returns 0 when label count, TTL propagate mode,
 * default TTL and every label are equal, and 1 otherwise.
 */
static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a);
	struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b);
	int l;

	if (a_hdr->labels != b_hdr->labels ||
	    a_hdr->ttl_propagate != b_hdr->ttl_propagate ||
	    a_hdr->default_ttl != b_hdr->default_ttl)
		return 1;

	for (l = 0; l < a_hdr->labels; l++)
		if (a_hdr->label[l] != b_hdr->label[l])
			return 1;
	return 0;
}

/* lwtunnel operations registered for LWTUNNEL_ENCAP_MPLS. */
static const struct lwtunnel_encap_ops mpls_iptun_ops = {
	.build_state = mpls_build_state,
	.xmit = mpls_xmit,
	.fill_encap = mpls_fill_encap_info,
	.get_encap_size = mpls_encap_nlsize,
	.cmp_encap = mpls_encap_cmp,
	.owner = THIS_MODULE,
};

/* Module entry point: registers the MPLS encap ops. */
static int __init mpls_iptunnel_init(void)
{
	return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
}
module_init(mpls_iptunnel_init);

/* Module exit point: removes the MPLS encap ops. */
static void __exit mpls_iptunnel_exit(void)
{
	lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
}
module_exit(mpls_iptunnel_exit);

MODULE_ALIAS_RTNL_LWT(MPLS);
MODULE_SOFTDEP("post: mpls_gso");
MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
MODULE_LICENSE("GPL v2");
7 2 3 1 1 1 1 1 223 222 13 1 2 27 3 2 3 6 22 4 3 3 4 4 4 2 1 1 15 15 21 21 21 21 26 3 3 1 1 15 2 1 21 21 1 1 18 25 18 7 166 16 10 1 3 3 1 6 45 1 26 16 7 9 15 15 15 2 3 9 11 2 28 28 1 3 15 13 25 111 164 1 30 159 165 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 
473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 
973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 // SPDX-License-Identifier: GPL-2.0 /* * XFRM virtual interface * * Copyright (C) 2018 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_link.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include 
<linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/gso.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/dst_metadata.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> static int xfrmi_dev_init(struct net_device *dev); static void xfrmi_dev_setup(struct net_device *dev); static struct rtnl_link_ops xfrmi_link_ops __read_mostly; static unsigned int xfrmi_net_id __read_mostly; static const struct net_device_ops xfrmi_netdev_ops; #define XFRMI_HASH_BITS 8 #define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) struct xfrmi_net { /* lists for storing interfaces in use */ struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; struct xfrm_if __rcu *collect_md_xfrmi; }; static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), }; static void xfrmi_destroy_state(struct lwtunnel_state *lwt) { } static int xfrmi_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_XFRM_MAX + 1]; struct lwtunnel_state *new_state; struct xfrm_md_info *info; int ret; ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); if (ret < 0) return ret; if (!tb[LWT_XFRM_IF_ID]) { NL_SET_ERR_MSG(extack, "if_id must be set"); return -EINVAL; } new_state = lwtunnel_state_alloc(sizeof(*info)); if (!new_state) { NL_SET_ERR_MSG(extack, "failed to create encap info"); return -ENOMEM; } new_state->type = LWTUNNEL_ENCAP_XFRM; info = lwt_xfrm_info(new_state); info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); if (tb[LWT_XFRM_LINK]) info->link = nla_get_u32(tb[LWT_XFRM_LINK]); *ts = new_state; return 0; } static int 
xfrmi_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct xfrm_md_info *info = lwt_xfrm_info(lwt); if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) return -EMSGSIZE; return 0; } static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ } static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct xfrm_md_info *a_info = lwt_xfrm_info(a); struct xfrm_md_info *b_info = lwt_xfrm_info(b); return memcmp(a_info, b_info, sizeof(*a_info)); } static const struct lwtunnel_encap_ops xfrmi_encap_ops = { .build_state = xfrmi_build_state, .destroy_state = xfrmi_destroy_state, .fill_encap = xfrmi_fill_encap_info, .get_encap_size = xfrmi_encap_nlsize, .cmp_encap = xfrmi_encap_cmp, .owner = THIS_MODULE, }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) static u32 xfrmi_hash(u32 if_id) { return hash_32(if_id, XFRMI_HASH_BITS); } static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; } xi = rcu_dereference(xfrmn->collect_md_xfrmi); if (xi && (xi->dev->flags & IFF_UP)) return xi; return NULL; } static bool xfrmi_decode_session(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res) { struct net_device *dev; struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) return false; switch (family) { case AF_INET6: ifindex = inet6_sdif(skb); break; case AF_INET: ifindex = inet_sdif(skb); break; } if (ifindex) { struct net *net = xs_net(xfrm_input_state(skb)); dev = dev_get_by_index_rcu(net, ifindex); } else { dev = skb->dev; } if 
(!dev || !(dev->flags & IFF_UP)) return false; if (dev->netdev_ops != &xfrmi_netdev_ops) return false; xi = netdev_priv(dev); res->net = xi->net; if (xi->p.collect_md) res->if_id = xfrm_input_state(skb)->if_id; else res->if_id = xi->p.if_id; return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); } static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip; struct xfrm_if *iter; for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { rcu_assign_pointer(*xip, xi->next); break; } } } static void xfrmi_dev_free(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); gro_cells_destroy(&xi->gro_cells); } static int xfrmi_create(struct net *net, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; dev->rtnl_link_ops = &xfrmi_link_ops; err = register_netdevice(dev); if (err < 0) goto out; if (xi->p.collect_md) rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); else xfrmi_link(xfrmn, xi); return 0; out: return err; } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) { struct xfrm_if __rcu **xip; struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) return xi; return NULL; } static void xfrmi_dev_uninit(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); if (xi->p.collect_md) RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); else xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { skb_clear_tstamp(skb); 
skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) return; ipvs_reset(skb); secpath_reset(skb); skb_orphan(skb); skb->mark = 0; } static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, unsigned short family) { struct sec_path *sp; sp = skb_sec_path(skb); if (sp && (sp->len || sp->olen) && !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) goto discard; XFRM_SPI_SKB_CB(skb)->family = family; if (family == AF_INET) { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; } else { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; } return xfrm_input(skb, nexthdr, spi, encap_type); discard: kfree_skb(skb); return 0; } static int xfrmi4_rcv(struct sk_buff *skb) { return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); } static int xfrmi6_rcv(struct sk_buff *skb) { return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, 0, AF_INET6); } static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); } static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); } static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; struct net_device *dev; struct xfrm_state *x; struct xfrm_if *xi; bool xnet; int link; if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); xi = xfrmi_lookup(xs_net(x), x); if (!xi) return 1; link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } xnet = !net_eq(xi->net, dev_net(skb->dev)); if (xnet) { inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, 
XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, inner_mode->family)) return -EPERM; } xfrmi_scrub_packet(skb, xnet); if (xi->p.collect_md) { struct metadata_dst *md_dst; md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); if (!md_dst) return -ENOMEM; md_dst->u.xfrm_info.if_id = x->if_id; md_dst->u.xfrm_info.link = link; skb_dst_set(skb, (struct dst_entry *)md_dst); } dev_sw_netstats_rx_add(dev, skb->len); return 0; } static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; struct xfrm_state *x; int err = -1; u32 if_id; int mtu; if (xi->p.collect_md) { struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); if (unlikely(!md_info)) return -EINVAL; if_id = md_info->if_id; fl->flowi_oif = md_info->link; if (md_info->dst_orig) { struct dst_entry *tmp_dst = dst; dst = md_info->dst_orig; skb_dst_set(skb, dst); md_info->dst_orig = NULL; dst_release(tmp_dst); } } else { if_id = xi->p.if_id; } dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } x = dst->xfrm; if (!x) goto tx_err_link_failure; if (x->if_id != if_id) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->len > 1280) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); else goto xmit; } else { if 
(!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } dst_release(dst); return -EMSGSIZE; } xmit: xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = tdev; err = dst_output(xi->net, skb_to_full_sk(skb), skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, dst); } break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); if (!dst) { struct rtable *rt; fl.u.ip4.flowi4_oif = dev->ifindex; fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); } break; default: goto tx_err; } fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int xfrmi4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct net 
*net = dev_net(skb->dev); int protocol = iph->protocol; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->nexthdr; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); 
return -1; } if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) { if (xi->p.link != p->link) return -EINVAL; xi->p.if_id = p->if_id; return 0; } static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) { struct net *net = xi->net; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; xfrmi_unlink(xfrmn, xi); synchronize_net(); err = xfrmi_change(xi, p); xfrmi_link(xfrmn, xi); netdev_state_change(xi->dev); return err; } static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->p.link); } static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_NONE; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); eth_broadcast_addr(dev->broadcast); } #define XFRMI_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; err = gro_cells_init(&xi->gro_cells, dev); if (err) return err; dev->lltx = true; dev->features |= XFRMI_FEATURES; dev->hw_features |= XFRMI_FEATURES; if (phydev) { dev->needed_headroom = phydev->needed_headroom; dev->needed_tailroom = phydev->needed_tailroom; if 
(is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, phydev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); } else { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); } return 0; } static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void xfrmi_netlink_parms(struct nlattr *data[], struct xfrm_if_parms *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_XFRM_LINK]) parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); if (data[IFLA_XFRM_COLLECT_METADATA]) parms->collect_md = true; } static int xfrmi_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct xfrm_if_parms p = {}; struct xfrm_if *xi; struct net *net; int err; net = params->link_net ? : dev_net(dev); xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); if (p.link || p.if_id) { NL_SET_ERR_MSG(extack, "link and if_id must be zero"); return -EINVAL; } if (rtnl_dereference(xfrmn->collect_md_xfrmi)) return -EEXIST; } else { if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; } xi = netdev_priv(dev); xi->p = p; xi->net = net; xi->dev = dev; err = xfrmi_create(net, dev); return err; } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) { unregister_netdevice_queue(dev, head); } static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; struct xfrm_if_parms p = {}; xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } if (p.collect_md) { 
NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; if (xi->p.collect_md) { NL_SET_ERR_MSG(extack, "device can't be changed to collect_md"); return -EINVAL; } } return xfrmi_update(xi, &p); } static size_t xfrmi_get_size(const struct net_device *dev) { return /* IFLA_XFRM_LINK */ nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + /* IFLA_XFRM_COLLECT_METADATA */ nla_total_size(0) + 0; } static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->net); } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, [IFLA_XFRM_LINK] = { .type = NLA_U32 }, [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .kind = "xfrm", .maxtype = IFLA_XFRM_MAX, .policy = xfrmi_policy, .priv_size = sizeof(struct xfrm_if), .setup = xfrmi_dev_setup, .validate = xfrmi_validate, .newlink = xfrmi_newlink, .dellink = xfrmi_dellink, .changelink = xfrmi_changelink, .get_size = xfrmi_get_size, .fill_info = xfrmi_fill_info, .get_link_net = xfrmi_get_link_net, }; static void __net_exit xfrmi_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; int i; for (i = 0; i < XFRMI_HASH_SIZE; i++) { for (xip = 
&xfrmn->xfrmi[i]; (xi = rtnl_net_dereference(net, *xip)) != NULL; xip = &xi->next) unregister_netdevice_queue(xi->dev, dev_to_kill); } xi = rtnl_net_dereference(net, xfrmn->collect_md_xfrmi); if (xi) unregister_netdevice_queue(xi->dev, dev_to_kill); } static struct pernet_operations xfrmi_net_ops = { .exit_rtnl = xfrmi_exit_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrmi6_rcv, .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int xfrmi6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrmi4_rcv, .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, 
}; static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) static int xfrmi4_rcv_tunnel(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 3, }; static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 2, }; #endif static int __init xfrmi4_init(void) { int err; err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm_tunnel_ipip6_failed: xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi4_fini(void) { #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); #endif 
xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); } static int __init xfrmi6_init(void) { int err; err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ip6ip_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm_tunnel_ip6ip_failed: xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); xfrm_tunnel_ipv6_failed: xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi6_fini(void) { #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); } static const struct xfrm_if_cb xfrm_if_cb = { .decode_session = xfrmi_decode_session, }; static int __init xfrmi_init(void) { const char *msg; int err; pr_info("IPsec XFRM device driver\n"); msg = "tunnel device"; err = register_pernet_device(&xfrmi_net_ops); if (err < 0) goto pernet_dev_failed; msg = "xfrm4 protocols"; err = xfrmi4_init(); if (err < 0) goto 
xfrmi4_failed; msg = "xfrm6 protocols"; err = xfrmi6_init(); if (err < 0) goto xfrmi6_failed; msg = "netlink interface"; err = rtnl_link_register(&xfrmi_link_ops); if (err < 0) goto rtnl_link_failed; err = register_xfrm_interface_bpf(); if (err < 0) goto kfunc_failed; lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; kfunc_failed: rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: xfrmi4_fini(); xfrmi4_failed: unregister_pernet_device(&xfrmi_net_ops); pernet_dev_failed: pr_err("xfrmi init: failed to register %s\n", msg); return err; } static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); unregister_pernet_device(&xfrmi_net_ops); } module_init(xfrmi_init); module_exit(xfrmi_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("xfrm"); MODULE_ALIAS_NETDEV("xfrm0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("XFRM virtual interface");
1259 25 10670 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 /* SPDX-License-Identifier: GPL-2.0-only */ /* * include/linux/idr.h * * 2002-10-18 written by Jim Houston jim.houston@ccur.com * Copyright (C) 2002 by Concurrent Computer Corporation * * Small id to pointer translation service avoiding fixed sized * tables. */ #ifndef __IDR_H__ #define __IDR_H__ #include <linux/radix-tree.h> #include <linux/gfp.h> #include <linux/percpu.h> #include <linux/cleanup.h> struct idr { struct radix_tree_root idr_rt; unsigned int idr_base; unsigned int idr_next; }; /* * The IDR API does not expose the tagging functionality of the radix tree * to users. Use tag 0 to track whether a node has free space below it. 
*/ #define IDR_FREE 0 /* Set the IDR flag and the IDR_FREE tag */ #define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ (1 << (ROOT_TAG_SHIFT + IDR_FREE))) #define IDR_INIT_BASE(name, base) { \ .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. * @name: Name of IDR. * * A freshly-initialised IDR contains no IDs. */ #define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** * DEFINE_IDR() - Define a statically-allocated IDR. * @name: Name of IDR. * * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. */ #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator * @idr: idr handle * * The value returned is the value that will be next returned from * idr_alloc_cyclic() if it is free (otherwise the search will start from * this position). */ static inline unsigned int idr_get_cursor(const struct idr *idr) { return READ_ONCE(idr->idr_next); } /** * idr_set_cursor - Set the current position of the cyclic allocator * @idr: idr handle * @val: new position * * The next call to idr_alloc_cyclic() will return @val if it is free * (otherwise the search will start from this position). */ static inline void idr_set_cursor(struct idr *idr, unsigned int val) { WRITE_ONCE(idr->idr_next, val); } /** * DOC: idr sync * idr synchronization (stolen from radix-tree.h) * * idr_find() is able to be called locklessly, using RCU. The caller must * ensure calls to this function are made within rcu_read_lock() regions. * Other readers (lock-free or otherwise) and modifications may be running * concurrently. * * It is still required that the caller manage the synchronization and * lifetimes of the items. 
So if RCU lock-free lookups are used, typically * this would mean that the items have their own locks, or are amenable to * lock-free access; and that the items are freed by RCU (or only freed after * having been deleted from the idr tree *and* a synchronize_rcu() grace * period). */ #define idr_lock(idr) xa_lock(&(idr)->idr_rt) #define idr_unlock(idr) xa_unlock(&(idr)->idr_rt) #define idr_lock_bh(idr) xa_lock_bh(&(idr)->idr_rt) #define idr_unlock_bh(idr) xa_unlock_bh(&(idr)->idr_rt) #define idr_lock_irq(idr) xa_lock_irq(&(idr)->idr_rt) #define idr_unlock_irq(idr) xa_unlock_irq(&(idr)->idr_rt) #define idr_lock_irqsave(idr, flags) \ xa_lock_irqsave(&(idr)->idr_rt, flags) #define idr_unlock_irqrestore(idr, flags) \ xa_unlock_irqrestore(&(idr)->idr_rt, flags) void idr_preload(gfp_t gfp_mask); int idr_alloc(struct idr *, void *ptr, int start, int end, gfp_t); int __must_check idr_alloc_u32(struct idr *, void *ptr, u32 *id, unsigned long max, gfp_t); int idr_alloc_cyclic(struct idr *, void *ptr, int start, int end, gfp_t); void *idr_remove(struct idr *, unsigned long id); void *idr_find(const struct idr *, unsigned long id); int idr_for_each(const struct idr *, int (*fn)(int id, void *p, void *data), void *data); void *idr_get_next(struct idr *, int *nextid); void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); struct __class_idr { struct idr *idr; int id; }; #define idr_null ((struct __class_idr){ NULL, -1 }) #define take_idr_id(id) __get_and_null(id, idr_null) DEFINE_CLASS(idr_alloc, struct __class_idr, if (_T.id >= 0) idr_remove(_T.idr, _T.id), ((struct __class_idr){ .idr = idr, .id = idr_alloc(idr, ptr, start, end, gfp), }), struct idr *idr, void *ptr, int start, int end, gfp_t gfp); /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. * @base: The base value for the IDR. 
* * This variation of idr_init() creates an IDR which will allocate IDs * starting at %base. */ static inline void idr_init_base(struct idr *idr, int base) { INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER); idr->idr_base = base; idr->idr_next = 0; } /** * idr_init() - Initialise an IDR. * @idr: IDR handle. * * Initialise a dynamically allocated IDR. To initialise a * statically allocated IDR, use DEFINE_IDR(). */ static inline void idr_init(struct idr *idr) { idr_init_base(idr, 0); } /** * idr_is_empty() - Are there any IDs allocated? * @idr: IDR handle. * * Return: %true if no IDs are currently allocated from this IDR, * %false otherwise. */ static inline bool idr_is_empty(const struct idr *idr) { return radix_tree_empty(&idr->idr_rt) && radix_tree_tagged(&idr->idr_rt, IDR_FREE); } /** * idr_preload_end - end preload section started with idr_preload() * * Each idr_preload() should be matched with an invocation of this * function. See idr_preload() for details. */ static inline void idr_preload_end(void) { local_unlock(&radix_tree_preloads.lock); } /** * idr_for_each_entry() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor. * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry(idr, entry, id) \ for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U) /** * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_ul(idr, entry, tmp, id) \ for (tmp = 0, id = 0; \ ((entry) = tmp <= id ? 
idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /** * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. */ #define idr_for_each_entry_continue(idr, entry, id) \ for ((entry) = idr_get_next((idr), &(id)); \ entry; \ ++id, (entry) = idr_get_next((idr), &(id))) /** * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. * After normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_continue_ul(idr, entry, tmp, id) \ for (tmp = id; \ ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ tmp = id, ++id) /* * IDA - ID Allocator, use when translation from id to pointer isn't necessary. */ #define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ #define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) #define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8) struct ida_bitmap { unsigned long bitmap[IDA_BITMAP_LONGS]; }; struct ida { struct xarray xa; }; #define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC) #define IDA_INIT(name) { \ .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \ } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max); /** * ida_alloc() - Allocate an unused ID. * @ida: IDA handle. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and %INT_MAX, inclusive. * * Context: Any context. 
It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc(struct ida *ida, gfp_t gfp) { return ida_alloc_range(ida, 0, ~0, gfp); } /** * ida_alloc_min() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) { return ida_alloc_range(ida, min, ~0, gfp); } /** * ida_alloc_max() - Allocate an unused ID. * @ida: IDA handle. * @max: Highest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and @max, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) { return ida_alloc_range(ida, 0, max, gfp); } /* ida_init() - Initialise @ida at run time by setting up its xarray with IDA_INIT_FLAGS. */ static inline void ida_init(struct ida *ida) { xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } /* * ida_simple_get() and ida_simple_remove() are deprecated. Use * ida_alloc() and ida_free() instead respectively. */ #define ida_simple_get(ida, start, end, gfp) \ ida_alloc_range(ida, start, (end) - 1, gfp) #define ida_simple_remove(ida, id) ida_free(ida, id) /* ida_is_empty() - Return true when the xarray backing @ida is empty (xa_empty()). */ static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } /* ida_exists() - Return true when ida_find_first_range() restricted to [@id, @id] reports @id. NOTE(review): the int result is compared with the unsigned @id, so the comparison is performed as unsigned - confirm that an @id above INT_MAX cannot match a negative error return. */ static inline bool ida_exists(struct ida *ida, unsigned int id) { return ida_find_first_range(ida, id, id) == id; } /* ida_find_first() - Return the result of ida_find_first_range() called with min 0 and max ~0; presumably the lowest allocated ID or a negative errno - verify against ida_find_first_range(). */ static inline int ida_find_first(struct ida *ida) { return ida_find_first_range(ida, 0, ~0); } #endif /* __IDR_H__ */
498 498 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * smc_sysctl.c: sysctl interface to SMC subsystem. * * Copyright (c) 2022, Alibaba Inc. * * Author: Tony Lu <tonylu@linux.alibaba.com> * */ #include <linux/init.h> #include <linux/sysctl.h> #include <net/net_namespace.h> #include "smc.h" #include "smc_core.h" #include "smc_llc.h" #include "smc_sysctl.h" static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; static int max_sndbuf = INT_MAX / 2; static int max_rcvbuf = INT_MAX / 2; static const int net_smc_wmem_init = (64 * 1024); static const int net_smc_rmem_init = (64 * 1024); static int links_per_lgr_min = SMC_LINKS_ADD_LNK_MIN; static int links_per_lgr_max = SMC_LINKS_ADD_LNK_MAX; static int conns_per_lgr_min = SMC_CONN_PER_LGR_MIN; static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX; static struct ctl_table smc_table[] = { { .procname = "autocorking_size", .data = &init_net.smc.sysctl_autocorking_size, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec, }, { .procname = "smcr_buf_type", .data = &init_net.smc.sysctl_smcr_buf_type, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "smcr_testlink_time", .data = &init_net.smc.sysctl_smcr_testlink_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "wmem", .data = 
&init_net.smc.sysctl_wmem, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, .extra2 = &max_sndbuf, }, { .procname = "rmem", .data = &init_net.smc.sysctl_rmem, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, .extra2 = &max_rcvbuf, }, { .procname = "smcr_max_links_per_lgr", .data = &init_net.smc.sysctl_max_links_per_lgr, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &links_per_lgr_min, .extra2 = &links_per_lgr_max, }, { .procname = "smcr_max_conns_per_lgr", .data = &init_net.smc.sysctl_max_conns_per_lgr, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &conns_per_lgr_min, .extra2 = &conns_per_lgr_max, }, { .procname = "limit_smc_hs", .data = &init_net.smc.limit_smc_hs, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; /* smc_sysctl_net_init() - register the "net/smc" sysctl table for @net and set that namespace's SMC sysctl defaults. init_net uses smc_table directly; every other namespace gets a kmemdup()'d copy. Returns 0 on success, or -ENOMEM if either the copy or the registration fails. */ int __net_init smc_sysctl_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(smc_table); struct ctl_table *table; table = smc_table; if (!net_eq(net, &init_net)) { int i; table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); if (!table) goto err_alloc; /* rebase each .data pointer from init_net onto the same field of @net */ for (i = 0; i < table_size; i++) table[i].data += (void *)net - (void *)&init_net; } net->smc.smc_hdr = register_net_sysctl_sz(net, "net/smc", table, table_size); if (!net->smc.smc_hdr) goto err_reg; net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init); WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; /* disable handshake limitation by default */ net->smc.limit_smc_hs = 0; return 0; /* only the duplicated table came from kmemdup(); init_net's table is the static smc_table */ err_reg: if (!net_eq(net, &init_net)) kfree(table); 
err_alloc: return -ENOMEM; } /* smc_sysctl_net_exit() - unregister @net's SMC sysctl header and, for namespaces other than init_net, free the table that was attached to it. */ void __net_exit smc_sysctl_net_exit(struct net *net) { const struct ctl_table *table; /* the table pointer is read from the header before that header is unregistered */ table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); if (!net_eq(net, &init_net)) kfree(table); }
71 71 467 469 4 755 535 536 532 537 537 536 7 534 535 537 769 468 469 468 1 4 464 757 769 3 13 25 1 1 1 1 1 11 10 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 771 146 770 716 717 765 45 63 63 737 5 121 767 770 758 758 736 717 47 183 191 191 7 3 3 3 3 771 114 148 3 143 130 139 17 71 115 9 123 772 726 201 1 200 772 772 771 772 4 112 4 112 111 112 111 4 8 112 8 9 111 111 4 3 4 112 112 89 89 111 37 69 86 111 112 23 23 89 4 26 89 4 89 4 89 4 89 4 31 31 112 24 89 89 31 69 33 1 12 93 4 31 34 89 112 4 4 4 4 3 4 3 89 31 4 10 112 9 112 99 23 10 10 112 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 
391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 
891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 
1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 
1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions implement the sctp_outq class. The outqueue handles * bundling and queueing of outgoing SCTP chunks. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. 
Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Perry Melange <pmelange@null.cc.uic.edu> * Xingang Guo <xingang.guo@intel.com> * Hui Huang <hui.huang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Jon Grimm <jgrimm@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/slab.h> #include <net/sock.h> /* For skb_set_owner_w */ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> #include <trace/events/sctp.h> /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn); static void sctp_mark_missing(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, __u32 highest_new_tsn, int count_of_newacks); static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_out_ext *oute; __u16 stream; list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add(&ch->stream_list, &oute->outq); } /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { return q->sched->dequeue(q); } /* Add data chunk to the end of the queue. 
*/ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_out_ext *oute; __u16 stream; list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add_tail(&ch->stream_list, &oute->outq); } /* * SFR-CACC algorithm: * D) If count_of_newacks is greater than or equal to 2 * and t was not sent to the current primary then the * sender MUST NOT increment missing report count for t. */ static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks) { if (count_of_newacks >= 2 && transport != primary) return 1; return 0; } /* * SFR-CACC algorithm: * F) If count_of_newacks is less than 2, let d be the * destination to which t was sent. If cacc_saw_newack * is 0 for destination d, then the sender MUST NOT * increment missing report count for t. */ static inline int sctp_cacc_skip_3_1_f(struct sctp_transport *transport, int count_of_newacks) { if (count_of_newacks < 2 && (transport && !transport->cacc.cacc_saw_newack)) return 1; return 0; } /* * SFR-CACC algorithm: * 3.1) If CYCLING_CHANGEOVER is 0, the sender SHOULD * execute steps C, D, F. * * C has been implemented in sctp_outq_sack */ static inline int sctp_cacc_skip_3_1(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks) { if (!primary->cacc.cycling_changeover) { if (sctp_cacc_skip_3_1_d(primary, transport, count_of_newacks)) return 1; if (sctp_cacc_skip_3_1_f(transport, count_of_newacks)) return 1; return 0; } return 0; } /* * SFR-CACC algorithm: * 3.2) Else if CYCLING_CHANGEOVER is 1, and t is less * than next_tsn_at_change of the current primary, then * the sender MUST NOT increment missing report count * for t. 
*/ static inline int sctp_cacc_skip_3_2(struct sctp_transport *primary, __u32 tsn) { if (primary->cacc.cycling_changeover && TSN_lt(tsn, primary->cacc.next_tsn_at_change)) return 1; return 0; } /* * SFR-CACC algorithm: * 3) If the missing report count for TSN t is to be * incremented according to [RFC2960] and * [SCTP_STEWART-2002], and CHANGEOVER_ACTIVE is set, * then the sender MUST further execute steps 3.1 and * 3.2 to determine if the missing report count for * TSN t SHOULD NOT be incremented. * * 3.3) If 3.1 and 3.2 do not dictate that the missing * report count for t should not be incremented, then * the sender SHOULD increment missing report count for * t (according to [RFC2960] and [SCTP_STEWART_2002]). */ static inline int sctp_cacc_skip(struct sctp_transport *primary, struct sctp_transport *transport, int count_of_newacks, __u32 tsn) { if (primary->cacc.changeover_active && (sctp_cacc_skip_3_1(primary, transport, count_of_newacks) || sctp_cacc_skip_3_2(primary, tsn))) return 1; return 0; } /* Initialize an existing sctp_outq. This does the boring stuff. * You still need to define handlers if you really want to DO * something with this structure... */ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) { memset(q, 0, sizeof(struct sctp_outq)); q->asoc = asoc; INIT_LIST_HEAD(&q->out_chunk_list); INIT_LIST_HEAD(&q->control_chunk_list); INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); sctp_sched_set_sched(asoc, sctp_sk(asoc->base.sk)->default_ss); } /* Free the outqueue structure and any related pending chunks. */ static void __sctp_outq_teardown(struct sctp_outq *q) { struct sctp_transport *transport; struct list_head *lchunk, *temp; struct sctp_chunk *chunk, *tmp; /* Throw away unacknowledged chunks. 
*/ list_for_each_entry(transport, &q->asoc->peer.transport_addr_list, transports) { while ((lchunk = sctp_list_dequeue(&transport->transmitted)) != NULL) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); /* Mark as part of a failed message. */ sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } } /* Throw away chunks that have been gap ACKed. */ list_for_each_safe(lchunk, temp, &q->sacked) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any chunks in the retransmit queue. */ list_for_each_safe(lchunk, temp, &q->retransmit) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any chunks that are in the abandoned queue. */ list_for_each_safe(lchunk, temp, &q->abandoned) { list_del_init(lchunk); chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any leftover data chunks. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { sctp_sched_dequeue_done(q, chunk); /* Mark as send failure. */ sctp_chunk_fail(chunk, q->error); sctp_chunk_free(chunk); } /* Throw away any leftover control chunks. */ list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } } void sctp_outq_teardown(struct sctp_outq *q) { __sctp_outq_teardown(q); sctp_outq_init(q->asoc, q); } /* Free the outqueue structure and any related pending chunks. */ void sctp_outq_free(struct sctp_outq *q) { /* Throw away leftover chunks. */ __sctp_outq_teardown(q); } /* Put a new chunk in an sctp_outq. */ void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = q->asoc->base.net; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? 
sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); /* If it is data, queue it up, otherwise, send it * immediately. */ if (sctp_chunk_is_data(chunk)) { pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk"); sctp_outq_tail_data(q, chunk); if (chunk->asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) chunk->asoc->sent_cnt_removable++; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); else SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS); } else { list_add_tail(&chunk->list, &q->control_chunk_list); SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); } if (!q->cork) sctp_outq_flush(q, 0, gfp); } /* Insert a chunk into the sorted list based on the TSNs. The retransmit list * and the abandoned list are in ascending order. */ static void sctp_insert_list(struct list_head *head, struct list_head *new) { struct list_head *pos; struct sctp_chunk *nchunk, *lchunk; __u32 ntsn, ltsn; int done = 0; nchunk = list_entry(new, struct sctp_chunk, transmitted_list); ntsn = ntohl(nchunk->subh.data_hdr->tsn); list_for_each(pos, head) { lchunk = list_entry(pos, struct sctp_chunk, transmitted_list); ltsn = ntohl(lchunk->subh.data_hdr->tsn); if (TSN_lt(ntsn, ltsn)) { list_add(new, pos->prev); done = 1; break; } } if (!done) list_add_tail(new, head); } static int sctp_prsctp_prune_sent(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct list_head *queue, int msg_len) { struct sctp_chunk *chk, *temp; list_for_each_entry_safe(chk, temp, queue, transmitted_list) { struct sctp_stream_out *streamout; if (!chk->msg->abandoned && (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; chk->msg->abandoned = 1; list_del_init(&chk->transmitted_list); sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); 
streamout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; /* Outside the retransmit queue, a chunk that was not gap acked * is taken out of the flight size / outstanding byte counts. */ if (queue != &asoc->outqueue.retransmit && !chk->tsn_gap_acked) { if (chk->transport) chk->transport->flight_size -= sctp_data_size(chk); asoc->outqueue.outstanding_bytes -= sctp_data_size(chk); } msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); if (msg_len <= 0) break; } return msg_len; } /* Walk q->out_chunk_list and free every not-yet-sent chunk whose message is * already abandoned, or which is the first fragment of a PRIO-policy message * with a sinfo_timetolive greater than that of 'sinfo'. The scheduler's * unsched_all()/sched_all() hooks are called around the walk. Returns the * remaining msg_len. */ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len) { struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; struct sctp_stream_out *sout; q->sched->unsched_all(&asoc->stream); list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!chk->msg->abandoned && (!(chk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) || !SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)) continue; chk->msg->abandoned = 1; sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; sout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); sout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; /* clear out_curr if all frag chunks are pruned */ if (asoc->stream.out_curr == sout && list_is_last(&chk->frag_list, &chk->msg->chunks)) asoc->stream.out_curr = NULL; msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk); sctp_chunk_free(chk); if (msg_len <= 0) break; } q->sched->sched_all(&asoc->stream); return msg_len; } /* Abandon the chunks according to their priorities. * * Nothing is done unless the peer is PR-SCTP capable and sent_cnt_removable * is non-zero. The retransmit queue is pruned first, then each transport's * transmitted queue, and finally the unsent chunks; pruning stops as soon as * msg_len has been brought down to zero or below. */ void sctp_prsctp_prune(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len) { struct sctp_transport *transport; if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable) return; msg_len = sctp_prsctp_prune_sent(asoc, sinfo, &asoc->outqueue.retransmit, msg_len); if (msg_len <= 0) return; list_for_each_entry(transport, 
&asoc->peer.transport_addr_list, transports) { msg_len = sctp_prsctp_prune_sent(asoc, sinfo, &transport->transmitted, msg_len); if (msg_len <= 0) return; } sctp_prsctp_prune_unsent(asoc, sinfo, msg_len); } /* Mark all the eligible packets on a transport for retransmission. */ void sctp_retransmit_mark(struct sctp_outq *q, struct sctp_transport *transport, __u8 reason) { struct list_head *lchunk, *ltemp; struct sctp_chunk *chunk; /* Walk through the specified transmitted queue. */ list_for_each_safe(lchunk, ltemp, &transport->transmitted) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); /* If the chunk is abandoned, move it to abandoned list. */ if (sctp_chunk_abandoned(chunk)) { list_del_init(lchunk); sctp_insert_list(&q->abandoned, lchunk); /* If this chunk has not been previously acked, * stop considering it 'outstanding'. Our peer * will most likely never see it since it will * not be retransmitted */ if (!chunk->tsn_gap_acked) { if (chunk->transport) chunk->transport->flight_size -= sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); q->asoc->peer.rwnd += sctp_data_size(chunk); } continue; } /* If we are doing retransmission due to a timeout or pmtu * discovery, only the chunks that are not yet acked should * be added to the retransmit queue. For a fast retransmit, * only chunks marked SCTP_NEED_FRTX are taken. */ if ((reason == SCTP_RTXR_FAST_RTX && (chunk->fast_retransmit == SCTP_NEED_FRTX)) || (reason != SCTP_RTXR_FAST_RTX && !chunk->tsn_gap_acked)) { /* RFC 2960 6.2.1 Processing a Received SACK * * C) Any time a DATA chunk is marked for * retransmission (via either T3-rtx timer expiration * (Section 6.3.3) or via fast retransmit * (Section 7.2.4)), add the data size of those * chunks to the rwnd. */ q->asoc->peer.rwnd += sctp_data_size(chunk); q->outstanding_bytes -= sctp_data_size(chunk); if (chunk->transport) transport->flight_size -= sctp_data_size(chunk); /* sctpimpguide-05 Section 2.8.2 * M5) If a T3-rtx timer expires, the * 'TSN.Missing.Report' of all affected TSNs is set * to 0. 
*/ chunk->tsn_missing_report = 0; /* If a chunk that is being used for RTT measurement * has to be retransmitted, we cannot use this chunk * anymore for RTT measurements. Reset rto_pending so * that a new RTT measurement is started when a new * data chunk is sent. */ if (chunk->rtt_in_progress) { chunk->rtt_in_progress = 0; transport->rto_pending = 0; } /* Move the chunk to the retransmit queue. The chunks * on the retransmit queue are always kept in order. */ list_del_init(lchunk); sctp_insert_list(&q->retransmit, lchunk); } } pr_debug("%s: transport:%p, reason:%d, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, reason, transport->cwnd, transport->ssthresh, transport->flight_size, transport->partial_bytes_acked); } /* Mark all the eligible packets on a transport for retransmission and force * one packet out. * * 'reason' selects the statistics counter that is bumped and whether cwnd is * lowered (T3-rtx and fast retransmit only). Any reason not listed in the * switch hits BUG(). */ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason) { struct net *net = q->asoc->base.net; switch (reason) { case SCTP_RTXR_T3_RTX: SCTP_INC_STATS(net, SCTP_MIB_T3_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX); /* Update the retran path if the T3-rtx timer has expired for * the current retran path. 
*/ if (transport == transport->asoc->peer.retran_path) sctp_assoc_update_retran_path(transport->asoc); transport->asoc->rtx_data_chunks += transport->asoc->unack_data; if (transport->pl.state == SCTP_PL_COMPLETE && transport->asoc->unack_data) sctp_transport_reset_probe_timer(transport); break; case SCTP_RTXR_FAST_RTX: SCTP_INC_STATS(net, SCTP_MIB_FAST_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX); /* Tell the flush code that this is a fast retransmit. */ q->fast_rtx = 1; break; /* PMTUD and T1 retransmissions leave cwnd untouched here. */ case SCTP_RTXR_PMTUD: SCTP_INC_STATS(net, SCTP_MIB_PMTUD_RETRANSMITS); break; case SCTP_RTXR_T1_RTX: SCTP_INC_STATS(net, SCTP_MIB_T1_RETRANSMITS); transport->asoc->init_retries++; break; default: /* No other retransmit reason is handled. */ BUG(); } sctp_retransmit_mark(q, transport, reason); /* PR-SCTP A5) Any time the T3-rtx timer expires, on any destination, * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by * following the procedures outlined in C1 - C5. */ if (reason == SCTP_RTXR_T3_RTX) q->asoc->stream.si->generate_ftsn(q, q->asoc->ctsn_ack_point); /* Flush the queues for every reason except fast retransmit, since * fast_rtx is only triggered during sack processing and the queue * will be flushed at the end. */ if (reason != SCTP_RTXR_FAST_RTX) sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); } /* * Transmit DATA chunks on the retransmit queue. Upon return from * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which * need to be transmitted by the caller. * We assume that pkt->transport has already been set. * * The return value is a normal kernel error return value. 
*/ static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, int rtx_timeout, int *start_timer, gfp_t gfp) { struct sctp_transport *transport = pkt->transport; struct sctp_chunk *chunk, *chunk1; struct list_head *lqueue; enum sctp_xmit status; int error = 0; int timer = 0; int done = 0; int fast_rtx; lqueue = &q->retransmit; fast_rtx = q->fast_rtx; /* This loop handles time-out retransmissions, fast retransmissions, * and retransmissions due to opening of window. * * RFC 2960 6.3.3 Handle T3-rtx Expiration * * E3) Determine how many of the earliest (i.e., lowest TSN) * outstanding DATA chunks for the address for which the * T3-rtx has expired will fit into a single packet, subject * to the MTU constraint for the path corresponding to the * destination transport address to which the retransmission * is being sent (this may be different from the address for * which the timer expires [see Section 6.4]). Call this value * K. Bundle and retransmit those K DATA chunks in a single * packet to the destination endpoint. * * [Just to be painfully clear, if we are retransmitting * because a timeout just happened, we should send only ONE * packet of retransmitted data.] * * For fast retransmissions we also send only ONE packet. However, * if we are just flushing the queue due to open window, we'll * try to send as much as possible. */ list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) { /* If the chunk is abandoned, move it to abandoned list. */ if (sctp_chunk_abandoned(chunk)) { list_del_init(&chunk->transmitted_list); sctp_insert_list(&q->abandoned, &chunk->transmitted_list); continue; } /* Make sure that Gap Acked TSNs are not retransmitted. A * simple approach is just to move such TSNs out of the * way and into a 'transmitted' queue and skip to the * next chunk. 
*/ if (chunk->tsn_gap_acked) { list_move_tail(&chunk->transmitted_list, &transport->transmitted); continue; } /* If we are doing fast retransmit, ignore non-fast_retransmit * chunks */ if (fast_rtx && !chunk->fast_retransmit) continue; redo: /* Attempt to append this chunk to the packet. */ status = sctp_packet_append_chunk(pkt, chunk); switch (status) { case SCTP_XMIT_PMTU_FULL: if (!pkt->has_data && !pkt->has_cookie_echo) { /* If this packet did not contain DATA then * retransmission did not happen, so do it * again. We'll ignore the error here since * control chunks are already freed so there * is nothing we can do. */ sctp_packet_transmit(pkt, gfp); goto redo; } /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* If we are retransmitting, we should only * send a single packet. * Otherwise, try appending this chunk again. */ if (rtx_timeout || fast_rtx) done = 1; else goto redo; /* Bundle next chunk in the next round. */ break; case SCTP_XMIT_RWND_FULL: /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA as there is no more room * at the receiver. */ done = 1; break; case SCTP_XMIT_DELAY: /* Send this packet. */ error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA because of nagle delay. */ done = 1; break; default: /* The append was successful, so add this chunk to * the transmitted list. */ list_move_tail(&chunk->transmitted_list, &transport->transmitted); /* Mark the chunk as ineligible for fast retransmit * after it is retransmitted. */ if (chunk->fast_retransmit == SCTP_NEED_FRTX) chunk->fast_retransmit = SCTP_DONT_FRTX; q->asoc->stats.rtxchunks++; break; } /* Set the timer if there were no errors */ if (!error && !timer) timer = 1; if (done) break; } /* If we are here due to a retransmit timeout or a fast * retransmit and if there are any chunks left in the retransmit * queue that could not fit in the PMTU sized packet, they need * to be marked as ineligible for a subsequent fast retransmit. 
*/ if (rtx_timeout || fast_rtx) { list_for_each_entry(chunk1, lqueue, transmitted_list) { if (chunk1->fast_retransmit == SCTP_NEED_FRTX) chunk1->fast_retransmit = SCTP_DONT_FRTX; } } /* Report to the caller whether the timer should be (re)started. */ *start_timer = timer; /* Clear fast retransmit hint */ if (fast_rtx) q->fast_rtx = 0; return error; } /* Uncork the outqueue: clear the cork flag if it is set, then flush * whatever has been queued. */ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; sctp_outq_flush(q, 0, gfp); } /* Send 'chunk' alone in a packet built on the stack for 'transport', using * the association's bound port, peer port and peer init tag. If the chunk * cannot be appended it is unlinked and freed and -ENOMEM is returned; * otherwise the result of sctp_packet_transmit() is returned. */ static int sctp_packet_singleton(struct sctp_transport *transport, struct sctp_chunk *chunk, gfp_t gfp) { const struct sctp_association *asoc = transport->asoc; const __u16 sport = asoc->base.bind_addr.port; const __u16 dport = asoc->peer.port; const __u32 vtag = asoc->peer.i.init_tag; struct sctp_packet singleton; sctp_packet_init(&singleton, transport, sport, dport); sctp_packet_config(&singleton, vtag, 0); if (sctp_packet_append_chunk(&singleton, chunk) != SCTP_XMIT_OK) { list_del_init(&chunk->list); sctp_chunk_free(chunk); return -ENOMEM; } return sctp_packet_transmit(&singleton, gfp); } /* Struct to hold the context during sctp outq flush */ struct sctp_flush_ctx { struct sctp_outq *q; /* Current transport being used. It's NOT the same as curr active one */ struct sctp_transport *transport; /* These transports have chunks to send. */ struct list_head transport_list; struct sctp_association *asoc; /* Packet on the current transport above */ struct sctp_packet *packet; gfp_t gfp; }; /* Choose the transport 'chunk' should go out on and, when it differs from * ctx->transport, switch ctx->transport and ctx->packet over to it. */ static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx, struct sctp_chunk *chunk) { struct sctp_transport *new_transport = chunk->transport; if (!new_transport) { if (!sctp_chunk_is_data(chunk)) { /* If we have a prior transport pointer, see if * the destination address of the chunk * matches the destination address of the * current transport. If not a match, then * try to look up the transport with a given * destination address. 
We do this because * after processing ASCONFs, we may have new * transports created. */ if (ctx->transport && sctp_cmp_addr_exact(&chunk->dest, &ctx->transport->ipaddr)) new_transport = ctx->transport; else new_transport = sctp_assoc_lookup_paddr(ctx->asoc, &chunk->dest); } /* if we still don't have a new transport, then * use the current active path. */ if (!new_transport) new_transport = ctx->asoc->peer.active_path; } else { __u8 type; switch (new_transport->state) { case SCTP_INACTIVE: case SCTP_UNCONFIRMED: case SCTP_PF: /* If the chunk is Heartbeat or Heartbeat Ack, * send it to chunk->transport, even if it's * inactive. * * 3.3.6 Heartbeat Acknowledgement: * ... * A HEARTBEAT ACK is always sent to the source IP * address of the IP datagram containing the * HEARTBEAT chunk to which this ack is responding. * ... * * ASCONF_ACKs also must be sent to the source. * * Every other chunk type is redirected to the * active path. */ type = chunk->chunk_hdr->type; if (type != SCTP_CID_HEARTBEAT && type != SCTP_CID_HEARTBEAT_ACK && type != SCTP_CID_ASCONF_ACK) new_transport = ctx->asoc->peer.active_path; break; default: break; } } /* Are we switching transports? Take care of transport locks. */ if (new_transport != ctx->transport) { ctx->transport = new_transport; ctx->packet = &ctx->transport->packet; /* Queue the transport for the final flush if it is not * already on a send_ready list. */ if (list_empty(&ctx->transport->send_ready)) list_add_tail(&ctx->transport->send_ready, &ctx->transport_list); sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag, ctx->asoc->peer.ecn_capable); /* We've switched transports, so apply the * Burst limit to the new transport. 
*/ sctp_transport_burst_limited(ctx->transport); } } /* Walk ctx->q->control_chunk_list, pick a transport for each chunk and hand * the chunk to the packet code according to its chunk type. */ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx) { struct sctp_chunk *chunk, *tmp; enum sctp_xmit status; int one_packet, error; list_for_each_entry_safe(chunk, tmp, &ctx->q->control_chunk_list, list) { one_packet = 0; /* RFC 5061, 5.3 * F1) This means that until such time as the ASCONF * containing the add is acknowledged, the sender MUST * NOT use the new IP address as a source for ANY SCTP * packet except on carrying an ASCONF Chunk. */ if (ctx->asoc->src_out_of_asoc_ok && chunk->chunk_hdr->type != SCTP_CID_ASCONF) continue; list_del_init(&chunk->list); /* Pick the right transport to use. Should always be true for * the first chunk as we don't have a transport by then. */ sctp_outq_select_transport(ctx, chunk); switch (chunk->chunk_hdr->type) { /* 6.10 Bundling * ... * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN * COMPLETE with any other chunks. [Send them immediately.] */ case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: error = sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); /* On failure, record the error on the socket and stop * processing the control queue. */ if (error < 0) { ctx->asoc->base.sk->sk_err = -error; return; } ctx->asoc->stats.octrlchunks++; break; case SCTP_CID_ABORT: if (sctp_test_T_bit(chunk)) ctx->packet->vtag = ctx->asoc->c.my_vtag; fallthrough; /* The following chunks are "response" chunks, i.e. * they are generated in response to something we * received. If we are sending these, then we can * send only 1 packet containing these chunks. 
*/ case SCTP_CID_HEARTBEAT_ACK: case SCTP_CID_SHUTDOWN_ACK: case SCTP_CID_COOKIE_ACK: case SCTP_CID_COOKIE_ECHO: case SCTP_CID_ERROR: case SCTP_CID_ECN_CWR: case SCTP_CID_ASCONF_ACK: one_packet = 1; fallthrough; case SCTP_CID_HEARTBEAT: /* A chunk flagged as a PMTU probe is sent in a packet of * its own. */ if (chunk->pmtu_probe) { error = sctp_packet_singleton(ctx->transport, chunk, ctx->gfp); if (!error) ctx->asoc->stats.octrlchunks++; break; } fallthrough; case SCTP_CID_SACK: case SCTP_CID_SHUTDOWN: case SCTP_CID_ECN_ECNE: case SCTP_CID_ASCONF: case SCTP_CID_FWD_TSN: case SCTP_CID_I_FWD_TSN: case SCTP_CID_RECONF: status = sctp_packet_transmit_chunk(ctx->packet, chunk, one_packet, ctx->gfp); if (status != SCTP_XMIT_OK) { /* put the chunk back */ list_add(&chunk->list, &ctx->q->control_chunk_list); break; } ctx->asoc->stats.octrlchunks++; /* PR-SCTP C5) If a FORWARD TSN is sent, the * sender MUST assure that at least one T3-rtx * timer is running. */ if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN || chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) { sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; } /* Sending the association's stream-reset chunk restarts * the reconf timer on this transport. */ if (chunk == ctx->asoc->strreset_chunk) sctp_transport_reset_reconf_timer(ctx->transport); break; default: /* We built a chunk with an illegal type! */ BUG(); } } } /* Retransmit from the retransmit queue over the association's retran path. * Returns false if new data shouldn't be sent */ static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx, int rtx_timeout) { int error, start_timer = 0; /* Nothing is retransmitted while the retran path is unconfirmed. */ if (ctx->asoc->peer.retran_path->state == SCTP_UNCONFIRMED) return false; if (ctx->transport != ctx->asoc->peer.retran_path) { /* Switch transports & prepare the packet. 
*/ ctx->transport = ctx->asoc->peer.retran_path; ctx->packet = &ctx->transport->packet; if (list_empty(&ctx->transport->send_ready)) list_add_tail(&ctx->transport->send_ready, &ctx->transport_list); sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag, ctx->asoc->peer.ecn_capable); } error = __sctp_outq_flush_rtx(ctx->q, ctx->packet, rtx_timeout, &start_timer, ctx->gfp); if (error < 0) ctx->asoc->base.sk->sk_err = -error; if (start_timer) { sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; } /* This can happen on COOKIE-ECHO resend. Only * one chunk can get bundled with a COOKIE-ECHO. */ if (ctx->packet->has_cookie_echo) return false; /* Don't send new data if there is still data * waiting to retransmit. */ if (!list_empty(&ctx->q->retransmit)) return false; return true; } /* Send DATA chunks: return early unless the association state allows it, * retransmit anything on the retransmit queue first, then dequeue new chunks * with sctp_outq_dequeue_data() and add them to the selected transport's * packet. */ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, int rtx_timeout) { struct sctp_chunk *chunk; enum sctp_xmit status; /* Is it OK to send data chunks? */ switch (ctx->asoc->state) { case SCTP_STATE_COOKIE_ECHOED: /* Only allow bundling when this packet has a COOKIE-ECHO * chunk. */ if (!ctx->packet || !ctx->packet->has_cookie_echo) return; fallthrough; case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: break; default: /* Do nothing. */ return; } /* RFC 2960 6.1 Transmission of DATA Chunks * * C) When the time comes for the sender to transmit, * before sending new DATA chunks, the sender MUST * first transmit any outstanding DATA chunks which * are marked for retransmission (limited by the * current cwnd). */ if (!list_empty(&ctx->q->retransmit) && !sctp_outq_flush_rtx(ctx, rtx_timeout)) return; /* Apply Max.Burst limitation to the current transport in * case it will be used for new data. We are going to * reset it before we return, but we want to apply the limit * to the currently queued data. */ if (ctx->transport) sctp_transport_burst_limited(ctx->transport); /* Finally, transmit new packets. 
*/ while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); __u8 stream_state = SCTP_SO(&ctx->asoc->stream, sid)->state; /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { sctp_sched_dequeue_done(ctx->q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; } /* The chunk's stream is closed: put the chunk back at the head * of the queue and stop sending new data. */ if (stream_state == SCTP_STREAM_CLOSED) { sctp_outq_head_data(ctx->q, chunk); break; } sctp_outq_select_transport(ctx, chunk); pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p skb->users:%d\n", __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ? sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : "illegal chunk", ntohl(chunk->subh.data_hdr->tsn), chunk->skb ? chunk->skb->head : NULL, chunk->skb ? refcount_read(&chunk->skb->users) : -1); /* Add the chunk to the packet. */ status = sctp_packet_transmit_chunk(ctx->packet, chunk, 0, ctx->gfp); if (status != SCTP_XMIT_OK) { /* We could not append this chunk, so put * the chunk back on the output queue. */ pr_debug("%s: could not transmit tsn:0x%x, status:%d\n", __func__, ntohl(chunk->subh.data_hdr->tsn), status); sctp_outq_head_data(ctx->q, chunk); break; } /* The sender is in the SHUTDOWN-PENDING state, * The sender MAY set the I-bit in the DATA * chunk header. */ if (ctx->asoc->state == SCTP_STATE_SHUTDOWN_PENDING) chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) ctx->asoc->stats.ouodchunks++; else ctx->asoc->stats.oodchunks++; /* Only now it's safe to consider this * chunk as sent, sched-wise. */ sctp_sched_dequeue_done(ctx->q, chunk); list_add_tail(&chunk->transmitted_list, &ctx->transport->transmitted); sctp_transport_reset_t3_rtx(ctx->transport); ctx->transport->last_time_sent = jiffies; /* Only let one DATA chunk get bundled with a * COOKIE-ECHO chunk. 
*/ if (ctx->packet->has_cookie_echo) break; } } /* Drain ctx->transport_list: for every transport on it, transmit its packet * if the packet is not empty (recording a negative result in sk_err) and * clear the transport's burst-limited state. */ static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx) { struct sock *sk = ctx->asoc->base.sk; struct list_head *ltransport; struct sctp_packet *packet; struct sctp_transport *t; int error = 0; while ((ltransport = sctp_list_dequeue(&ctx->transport_list)) != NULL) { t = list_entry(ltransport, struct sctp_transport, send_ready); packet = &t->packet; if (!sctp_packet_empty(packet)) { rcu_read_lock(); /* If the socket's dst differs from this transport's, take a * reference on t->dst and set up the socket caps from it. */ if (t->dst && __sk_dst_get(sk) != t->dst) { dst_hold(t->dst); sk_setup_caps(sk, t->dst); } rcu_read_unlock(); error = sctp_packet_transmit(packet, ctx->gfp); if (error < 0) ctx->q->asoc->base.sk->sk_err = -error; } /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); } } /* Try to flush an outqueue. * * Description: Send everything in q which we legally can, subject to * congestion limitations. * * Note: This function can be called from multiple contexts so appropriate * locking concerns must be made. Today we use the sock lock to protect * this function. * * Control chunks are flushed first, then DATA (skipped while * src_out_of_asoc_ok is set), and finally the per-transport packets are * transmitted. */ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_flush_ctx ctx = { .q = q, .transport = NULL, .transport_list = LIST_HEAD_INIT(ctx.transport_list), .asoc = q->asoc, .packet = NULL, .gfp = gfp, }; /* 6.10 Bundling * ... * When bundling control chunks with DATA chunks, an * endpoint MUST place control chunks first in the outbound * SCTP packet. The transmitter MUST transmit DATA chunks * within a SCTP packet in increasing order of TSN. * ... 
*/ sctp_outq_flush_ctrl(&ctx); if (q->asoc->src_out_of_asoc_ok) goto sctp_flush_out; sctp_outq_flush_data(&ctx, rtx_timeout); sctp_flush_out: sctp_outq_flush_transports(&ctx); } /* Update unack_data based on the incoming SACK chunk */ static void sctp_sack_update_unack_data(struct sctp_association *assoc, struct sctp_sackhdr *sack) { union sctp_sack_variable *frags; __u16 unack_data; int i; unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1; frags = (union sctp_sack_variable *)(sack + 1); for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) { unack_data -= ((ntohs(frags[i].gab.end) - ntohs(frags[i].gab.start) + 1)); } assoc->unack_data = unack_data; } /* This is where we REALLY process a SACK. * * Process the SACK against the outqueue. Mostly, this just frees * things off the transmitted queue. */ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) { struct sctp_association *asoc = q->asoc; struct sctp_sackhdr *sack = chunk->subh.sack_hdr; struct sctp_transport *transport; struct sctp_chunk *tchunk = NULL; struct list_head *lchunk, *transport_list, *temp; __u32 sack_ctsn, ctsn, tsn; __u32 highest_tsn, highest_new_tsn; __u32 sack_a_rwnd; unsigned int outstanding; struct sctp_transport *primary = asoc->peer.primary_path; int count_of_newacks = 0; int gap_ack_blocks; u8 accum_moved = 0; /* Grab the association's destination address list. */ transport_list = &asoc->peer.transport_addr_list; /* SCTP path tracepoint for congestion control debugging. */ if (trace_sctp_probe_path_enabled()) { list_for_each_entry(transport, transport_list, transports) trace_sctp_probe_path(transport, asoc); } sack_ctsn = ntohl(sack->cum_tsn_ack); gap_ack_blocks = ntohs(sack->num_gap_ack_blocks); asoc->stats.gapcnt += gap_ack_blocks; /* * SFR-CACC algorithm: * On receipt of a SACK the sender SHOULD execute the * following statements. 
* * 1) If the cumulative ack in the SACK passes next tsn_at_change * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for * all destinations. * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE * is set the receiver of the SACK MUST take the following actions: * * A) Initialize the cacc_saw_newack to 0 for all destination * addresses. * * Only bother if changeover_active is set. Otherwise, this is * totally suboptimal to do on every SACK. */ if (primary->cacc.changeover_active) { u8 clear_cycling = 0; if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { primary->cacc.changeover_active = 0; clear_cycling = 1; } if (clear_cycling || gap_ack_blocks) { list_for_each_entry(transport, transport_list, transports) { if (clear_cycling) transport->cacc.cycling_changeover = 0; if (gap_ack_blocks) transport->cacc.cacc_saw_newack = 0; } } } /* Get the highest TSN in the sack. */ highest_tsn = sack_ctsn; if (gap_ack_blocks) { union sctp_sack_variable *frags = (union sctp_sack_variable *)(sack + 1); highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end); } if (TSN_lt(asoc->highest_sacked, highest_tsn)) asoc->highest_sacked = highest_tsn; highest_new_tsn = sack_ctsn; /* Run through the retransmit queue. Credit bytes received * and free those chunks that we can. */ sctp_check_transmitted(q, &q->retransmit, NULL, NULL, sack, &highest_new_tsn); /* Run through the transmitted queue. * Credit bytes received and free those chunks which we can. * * This is a MASSIVE candidate for optimization. */ list_for_each_entry(transport, transport_list, transports) { sctp_check_transmitted(q, &transport->transmitted, transport, &chunk->source, sack, &highest_new_tsn); /* * SFR-CACC algorithm: * C) Let count_of_newacks be the number of * destinations for which cacc_saw_newack is set. */ if (transport->cacc.cacc_saw_newack) count_of_newacks++; } /* Move the Cumulative TSN Ack Point if appropriate. 
*/ if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) { asoc->ctsn_ack_point = sack_ctsn; accum_moved = 1; } if (gap_ack_blocks) { if (asoc->fast_recovery && accum_moved) highest_new_tsn = highest_tsn; list_for_each_entry(transport, transport_list, transports) sctp_mark_missing(q, &transport->transmitted, transport, highest_new_tsn, count_of_newacks); } /* Update unack_data field in the assoc. */ sctp_sack_update_unack_data(asoc, sack); ctsn = asoc->ctsn_ack_point; /* Throw away stuff rotting on the sack queue. */ list_for_each_safe(lchunk, temp, &q->sacked) { tchunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(tchunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(&tchunk->transmitted_list); if (asoc->peer.prsctp_capable && SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) asoc->sent_cnt_removable--; sctp_chunk_free(tchunk); } } /* ii) Set rwnd equal to the newly received a_rwnd minus the * number of bytes still outstanding after processing the * Cumulative TSN Ack and the Gap Ack Blocks. */ sack_a_rwnd = ntohl(sack->a_rwnd); asoc->peer.zero_window_announced = !sack_a_rwnd; outstanding = q->outstanding_bytes; if (outstanding < sack_a_rwnd) sack_a_rwnd -= outstanding; else sack_a_rwnd = 0; asoc->peer.rwnd = sack_a_rwnd; asoc->stream.si->generate_ftsn(q, sack_ctsn); pr_debug("%s: sack cumulative tsn ack:0x%x\n", __func__, sack_ctsn); pr_debug("%s: cumulative tsn ack of assoc:%p is 0x%x, " "advertised peer ack point:0x%x\n", __func__, asoc, ctsn, asoc->adv_peer_ack_point); return sctp_outq_is_empty(q); } /* Is the outqueue empty? * The queue is empty when we have not pending data, no in-flight data * and nothing pending retransmissions. 
*/ int sctp_outq_is_empty(const struct sctp_outq *q) { return q->out_qlen == 0 && q->outstanding_bytes == 0 && list_empty(&q->retransmit); } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Go through a transport's transmitted list or the association's retransmit * list and move chunks that are acked by the Cumulative TSN Ack to q->sacked. * The retransmit list will not have an associated transport. * * I added coherent debug information output. --xguo * * Instead of printing 'sacked' or 'kept' for each TSN on the * transmitted_queue, we print a range: SACKED: TSN1-TSN2, TSN3, TSN4-TSN5. * KEPT TSN6-TSN7, etc. */ static void sctp_check_transmitted(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, union sctp_addr *saddr, struct sctp_sackhdr *sack, __u32 *highest_new_tsn_in_sack) { struct list_head *lchunk; struct sctp_chunk *tchunk; struct list_head tlist; __u32 tsn; __u32 sack_ctsn; __u32 rtt; __u8 restart_timer = 0; int bytes_acked = 0; int migrate_bytes = 0; bool forward_progress = false; sack_ctsn = ntohl(sack->cum_tsn_ack); /* tlist collects the chunks that stay on this queue. */ INIT_LIST_HEAD(&tlist); /* The while loop will skip empty transmitted queues. */ while (NULL != (lchunk = sctp_list_dequeue(transmitted_queue))) { tchunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); if (sctp_chunk_abandoned(tchunk)) { /* Move the chunk to abandoned list. */ sctp_insert_list(&q->abandoned, lchunk); /* If this chunk has not been acked, stop * considering it as 'outstanding'. 
*/ if (transmitted_queue != &q->retransmit && !tchunk->tsn_gap_acked) { if (tchunk->transport) tchunk->transport->flight_size -= sctp_data_size(tchunk); q->outstanding_bytes -= sctp_data_size(tchunk); } continue; } tsn = ntohl(tchunk->subh.data_hdr->tsn); if (sctp_acked(sack, tsn)) { /* If this queue is the retransmit queue, the * retransmit timer has already reclaimed * the outstanding bytes for this chunk, so only * count bytes associated with a transport. */ if (transport && !tchunk->tsn_gap_acked) { /* If this chunk is being used for RTT * measurement, calculate the RTT and update * the RTO using this value. * * 6.3.1 C5) Karn's algorithm: RTT measurements * MUST NOT be made using packets that were * retransmitted (and thus for which it is * ambiguous whether the reply was for the * first instance of the packet or a later * instance). */ if (!sctp_chunk_retransmitted(tchunk) && tchunk->rtt_in_progress) { tchunk->rtt_in_progress = 0; rtt = jiffies - tchunk->sent_at; sctp_transport_update_rto(transport, rtt); } if (TSN_lte(tsn, sack_ctsn)) { /* * SFR-CACC algorithm: * 2) If the SACK contains gap acks * and the flag CHANGEOVER_ACTIVE is * set the receiver of the SACK MUST * take the following action: * * B) For each TSN t being acked that * has not been acked in any SACK so * far, set cacc_saw_newack to 1 for * the destination that the TSN was * sent to. */ if (sack->num_gap_ack_blocks && q->asoc->peer.primary_path->cacc. changeover_active) transport->cacc.cacc_saw_newack = 1; } } /* If the chunk hasn't been marked as ACKED, * mark it and account bytes_acked if the * chunk had a valid transport (it will not * have a transport if ASCONF had deleted it * while DATA was outstanding). 
*/ if (!tchunk->tsn_gap_acked) { tchunk->tsn_gap_acked = 1; if (TSN_lt(*highest_new_tsn_in_sack, tsn)) *highest_new_tsn_in_sack = tsn; bytes_acked += sctp_data_size(tchunk); if (!tchunk->transport) migrate_bytes += sctp_data_size(tchunk); forward_progress = true; } if (TSN_lte(tsn, sack_ctsn)) { /* RFC 2960 6.3.2 Retransmission Timer Rules * * R3) Whenever a SACK is received * that acknowledges the DATA chunk * with the earliest outstanding TSN * for that address, restart T3-rtx * timer for that address with its * current RTO. */ restart_timer = 1; forward_progress = true; list_add_tail(&tchunk->transmitted_list, &q->sacked); } else { /* RFC2960 7.2.4, sctpimpguide-05 2.8.2 * M2) Each time a SACK arrives reporting * 'Stray DATA chunk(s)' record the highest TSN * reported as newly acknowledged, call this * value 'HighestTSNinSack'. A newly * acknowledged DATA chunk is one not * previously acknowledged in a SACK. * * When the SCTP sender of data receives a SACK * chunk that acknowledges, for the first time, * the receipt of a DATA chunk, all the still * unacknowledged DATA chunks whose TSN is * older than that newly acknowledged DATA * chunk, are qualified as 'Stray DATA chunks'. */ list_add_tail(lchunk, &tlist); } } else { if (tchunk->tsn_gap_acked) { pr_debug("%s: receiver reneged on data TSN:0x%x\n", __func__, tsn); tchunk->tsn_gap_acked = 0; if (tchunk->transport) bytes_acked -= sctp_data_size(tchunk); /* RFC 2960 6.3.2 Retransmission Timer Rules * * R4) Whenever a SACK is received missing a * TSN that was previously acknowledged via a * Gap Ack Block, start T3-rtx for the * destination address to which the DATA * chunk was originally * transmitted if it is not already running. */ restart_timer = 1; } list_add_tail(lchunk, &tlist); } } if (transport) { if (bytes_acked) { struct sctp_association *asoc = transport->asoc; /* We may have counted DATA that was migrated * to this transport due to DEL-IP operation. 
* Subtract those bytes, since they were never * sent on this transport and shouldn't be * credited to this transport. */ bytes_acked -= migrate_bytes; /* 8.2. When an outstanding TSN is acknowledged, * the endpoint shall clear the error counter of * the destination transport address to which the * DATA chunk was last sent. * The association's overall error counter is * also cleared. */ transport->error_count = 0; transport->asoc->overall_error_count = 0; forward_progress = true; /* * While in SHUTDOWN PENDING, we may have started * the T5 shutdown guard timer after reaching the * retransmission limit. Stop that timer as soon * as the receiver acknowledged any data. */ if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING && timer_delete(&asoc->timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD])) sctp_association_put(asoc); /* Mark the destination transport address as * active if it is not so marked. */ if ((transport->state == SCTP_INACTIVE || transport->state == SCTP_UNCONFIRMED) && sctp_cmp_addr_exact(&transport->ipaddr, saddr)) { sctp_assoc_control_transport( transport->asoc, transport, SCTP_TRANSPORT_UP, SCTP_RECEIVED_SACK); } sctp_transport_raise_cwnd(transport, sack_ctsn, bytes_acked); transport->flight_size -= bytes_acked; if (transport->flight_size == 0) transport->partial_bytes_acked = 0; /* The migrated bytes are still removed from the * queue-wide outstanding count. */ q->outstanding_bytes -= bytes_acked + migrate_bytes; } else { /* RFC 2960 6.1, sctpimpguide-06 2.15.2 * When a sender is doing zero window probing, it * should not timeout the association if it continues * to receive new packets from the receiver. The * reason is that the receiver MAY keep its window * closed for an indefinite time. * A sender is doing zero window probing when the * receiver's advertised window is zero, and there is * only one data chunk in flight to the receiver. * * Allow the association to timeout while in SHUTDOWN * PENDING or SHUTDOWN RECEIVED in case the receiver * stays in zero window mode forever. 
*/ if (!q->asoc->peer.rwnd && !list_empty(&tlist) && (sack_ctsn+2 == q->asoc->next_tsn) && q->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) { pr_debug("%s: sack received for zero window " "probe:%u\n", __func__, sack_ctsn); q->asoc->overall_error_count = 0; transport->error_count = 0; } } /* RFC 2960 6.3.2 Retransmission Timer Rules * * R2) Whenever all outstanding data sent to an address have * been acknowledged, turn off the T3-rtx timer of that * address. */ if (!transport->flight_size) { if (timer_delete(&transport->T3_rtx_timer)) sctp_transport_put(transport); } else if (restart_timer) { if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); } if (forward_progress) { if (transport->dst) sctp_transport_dst_confirm(transport); } } /* Put the chunks that were kept back on the queue they came from. */ list_splice(&tlist, transmitted_queue); } /* Mark chunks as missing and consequently may get retransmitted. */ static void sctp_mark_missing(struct sctp_outq *q, struct list_head *transmitted_queue, struct sctp_transport *transport, __u32 highest_new_tsn_in_sack, int count_of_newacks) { struct sctp_chunk *chunk; __u32 tsn; char do_fast_retransmit = 0; struct sctp_association *asoc = q->asoc; struct sctp_transport *primary = asoc->peer.primary_path; list_for_each_entry(chunk, transmitted_queue, transmitted_list) { tsn = ntohl(chunk->subh.data_hdr->tsn); /* RFC 2960 7.2.4, sctpimpguide-05 2.8.2 M3) Examine all * 'Unacknowledged TSN's', if the TSN number of an * 'Unacknowledged TSN' is smaller than the 'HighestTSNinSack' * value, increment the 'TSN.Missing.Report' count on that * chunk if it has NOT been fast retransmitted or marked for * fast retransmit already. */ if (chunk->fast_retransmit == SCTP_CAN_FRTX && !chunk->tsn_gap_acked && TSN_lt(tsn, highest_new_tsn_in_sack)) { /* SFR-CACC may require us to skip marking * this chunk as missing. 
*/ if (!transport || !sctp_cacc_skip(primary, chunk->transport, count_of_newacks, tsn)) { chunk->tsn_missing_report++; pr_debug("%s: tsn:0x%x missing counter:%d\n", __func__, tsn, chunk->tsn_missing_report); } } /* * M4) If any DATA chunk is found to have a * 'TSN.Missing.Report' * value larger than or equal to 3, mark that chunk for * retransmission and start the fast retransmit procedure. */ if (chunk->tsn_missing_report >= 3) { chunk->fast_retransmit = SCTP_NEED_FRTX; do_fast_retransmit = 1; } } if (transport) { if (do_fast_retransmit) sctp_retransmit(q, transport, SCTP_RTXR_FAST_RTX); pr_debug("%s: transport:%p, cwnd:%d, ssthresh:%d, " "flight_size:%d, pba:%d\n", __func__, transport, transport->cwnd, transport->ssthresh, transport->flight_size, transport->partial_bytes_acked); } } /* Is the given TSN acked by this packet? */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) { __u32 ctsn = ntohl(sack->cum_tsn_ack); union sctp_sack_variable *frags; __u16 tsn_offset, blocks; int i; if (TSN_lte(tsn, ctsn)) goto pass; /* 3.3.4 Selective Acknowledgment (SACK) (3): * * Gap Ack Blocks: * These fields contain the Gap Ack Blocks. They are repeated * for each Gap Ack Block up to the number of Gap Ack Blocks * defined in the Number of Gap Ack Blocks field. All DATA * chunks with TSNs greater than or equal to (Cumulative TSN * Ack + Gap Ack Block Start) and less than or equal to * (Cumulative TSN Ack + Gap Ack Block End) of each Gap Ack * Block are assumed to have been received correctly. 
*/ frags = (union sctp_sack_variable *)(sack + 1); blocks = ntohs(sack->num_gap_ack_blocks); tsn_offset = tsn - ctsn; for (i = 0; i < blocks; ++i) { if (tsn_offset >= ntohs(frags[i].gab.start) && tsn_offset <= ntohs(frags[i].gab.end)) goto pass; } return 0; pass: return 1; } static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist, int nskips, __be16 stream) { int i; for (i = 0; i < nskips; i++) { if (skiplist[i].stream == stream) return i; } return i; } /* Create and add a fwdtsn chunk to the outq's control queue if needed. */ void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct sctp_fwdtsn_skip ftsn_skip_arr[10]; int nskips = 0; int skip_pos = 0; __u32 tsn; struct sctp_chunk *chunk; struct list_head *lchunk, *temp; if (!asoc->peer.prsctp_capable) return; /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the * received SACK. * * If (Advanced.Peer.Ack.Point < SackCumAck), then update * Advanced.Peer.Ack.Point to be equal to SackCumAck. */ if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; /* PR-SCTP C2) Try to further advance the "Advanced.Peer.Ack.Point" * locally, that is, to move "Advanced.Peer.Ack.Point" up as long as * the chunk next in the out-queue space is marked as "abandoned" as * shown in the following example: * * Assuming that a SACK arrived with the Cumulative TSN ACK 102 * and the Advanced.Peer.Ack.Point is updated to this value: * * out-queue at the end of ==> out-queue after Adv.Ack.Point * normal SACK processing local advancement * ... ... * Adv.Ack.Pt-> 102 acked 102 acked * 103 abandoned 103 abandoned * 104 abandoned Adv.Ack.P-> 104 abandoned * 105 105 * 106 acked 106 acked * ... ... * * In this example, the data sender successfully advanced the * "Advanced.Peer.Ack.Point" from 102 to 104 locally. 
*/ list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); /* Remove any chunks in the abandoned queue that are acked by * the ctsn. */ if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else { if (TSN_lte(tsn, asoc->adv_peer_ack_point+1)) { asoc->adv_peer_ack_point = tsn; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) continue; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, chunk->subh.data_hdr->stream); ftsn_skip_arr[skip_pos].stream = chunk->subh.data_hdr->stream; ftsn_skip_arr[skip_pos].ssn = chunk->subh.data_hdr->ssn; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else break; } } /* PR-SCTP C3) If, after step C1 and C2, the "Advanced.Peer.Ack.Point" * is greater than the Cumulative TSN ACK carried in the received * SACK, the data sender MUST send the data receiver a FORWARD TSN * chunk containing the latest value of the * "Advanced.Peer.Ack.Point". * * C4) For each "abandoned" TSN the sender of the FORWARD TSN SHOULD * list each stream and sequence number in the forwarded TSN. This * information will enable the receiver to easily find any * stranded TSN's waiting on stream reorder queues. Each stream * SHOULD only be reported once; this means that if multiple * abandoned messages occur in the same stream then only the * highest abandoned stream sequence number is reported. If the * total size of the FORWARD TSN does NOT fit in a single MTU then * the sender of the FORWARD TSN SHOULD lower the * Advanced.Peer.Ack.Point to the last TSN that will fit in a * single MTU. */ if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_fwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } }
1 1 1 4 4 1 1 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct privflags_req_info { struct ethnl_req_info base; }; struct privflags_reply_data { struct ethnl_reply_data base; const char (*priv_flag_names)[ETH_GSTRING_LEN]; unsigned int n_priv_flags; u32 priv_flags; }; #define PRIVFLAGS_REPDATA(__reply_base) \ container_of(__reply_base, struct privflags_reply_data, base) const struct nla_policy ethnl_privflags_get_policy[] = { [ETHTOOL_A_PRIVFLAGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int ethnl_get_priv_flags_info(struct net_device *dev, unsigned int *count, const char (**names)[ETH_GSTRING_LEN]) { const struct ethtool_ops *ops = dev->ethtool_ops; int nflags; nflags = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (nflags < 0) return nflags; if (names) { *names = kcalloc(nflags, ETH_GSTRING_LEN, GFP_KERNEL); if (!*names) return -ENOMEM; ops->get_strings(dev, ETH_SS_PRIV_FLAGS, (u8 *)*names); } /* We can pass more than 32 private flags to userspace via netlink but * we cannot get more with ethtool_ops::get_priv_flags(). Note that we * must not adjust nflags before allocating the space for flag names * as the buffer must be large enough for all flags. 
*/ if (WARN_ONCE(nflags > 32, "device %s reports more than 32 private flags (%d)\n", netdev_name(dev), nflags)) nflags = 32; *count = nflags; return 0; } static int privflags_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const char (*names)[ETH_GSTRING_LEN]; const struct ethtool_ops *ops; unsigned int nflags; int ret; ops = dev->ethtool_ops; if (!ops->get_priv_flags || !ops->get_sset_count || !ops->get_strings) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = ethnl_get_priv_flags_info(dev, &nflags, &names); if (ret < 0) goto out_ops; data->priv_flags = ops->get_priv_flags(dev); data->priv_flag_names = names; data->n_priv_flags = nflags; out_ops: ethnl_ops_complete(dev); return ret; } static int privflags_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags); return ethnl_bitset32_size(&data->priv_flags, &all_flags, data->n_priv_flags, data->priv_flag_names, compact); } static int privflags_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags); return ethnl_put_bitset32(skb, ETHTOOL_A_PRIVFLAGS_FLAGS, &data->priv_flags, &all_flags, data->n_priv_flags, data->priv_flag_names, compact); } static void privflags_cleanup_data(struct ethnl_reply_data *reply_data) { struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_data); kfree(data->priv_flag_names); } /* PRIVFLAGS_SET 
*/ const struct nla_policy ethnl_privflags_set_policy[] = { [ETHTOOL_A_PRIVFLAGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED }, }; static int ethnl_set_privflags_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; if (!info->attrs[ETHTOOL_A_PRIVFLAGS_FLAGS]) return -EINVAL; if (!ops->get_priv_flags || !ops->set_priv_flags || !ops->get_sset_count || !ops->get_strings) return -EOPNOTSUPP; return 1; } static int ethnl_set_privflags(struct ethnl_req_info *req_info, struct genl_info *info) { const char (*names)[ETH_GSTRING_LEN] = NULL; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; unsigned int nflags; bool mod = false; bool compact; u32 flags; int ret; ret = ethnl_bitset_is_compact(tb[ETHTOOL_A_PRIVFLAGS_FLAGS], &compact); if (ret < 0) return ret; ret = ethnl_get_priv_flags_info(dev, &nflags, compact ? NULL : &names); if (ret < 0) return ret; flags = dev->ethtool_ops->get_priv_flags(dev); ret = ethnl_update_bitset32(&flags, nflags, tb[ETHTOOL_A_PRIVFLAGS_FLAGS], names, info->extack, &mod); if (ret < 0 || !mod) goto out_free; ret = dev->ethtool_ops->set_priv_flags(dev, flags); if (ret < 0) goto out_free; ret = 1; out_free: kfree(names); return ret; } const struct ethnl_request_ops ethnl_privflags_request_ops = { .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET, .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER, .req_info_size = sizeof(struct privflags_req_info), .reply_data_size = sizeof(struct privflags_reply_data), .prepare_data = privflags_prepare_data, .reply_size = privflags_reply_size, .fill_reply = privflags_fill_reply, .cleanup_data = privflags_cleanup_data, .set_validate = ethnl_set_privflags_validate, .set = ethnl_set_privflags, .set_ntf_cmd = ETHTOOL_MSG_PRIVFLAGS_NTF, };
30 30 430 426 29 372 429 30 30 30 30 430 430 30 30 29 28 2 2 29 29 29 29 29 2 2 29 29 29 29 29 29 29 29 29 29 2 2 29 29 2 2 2 30 30 29 51 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 
992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 
1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. 
*/ #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/uverbs_ioctl.h> #include "rxe.h" #include "rxe_queue.h" #include "rxe_hw_counters.h" static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr); /* dev */ static int rxe_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (udata->inlen || udata->outlen) { rxe_dbg_dev(rxe, "malformed udata\n"); err = -EINVAL; goto err_out; } memcpy(attr, &rxe->attr, sizeof(*attr)); return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(ibdev); struct net_device *ndev; int err, ret; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } ndev = rxe_ib_device_get_netdev(ibdev); if (!ndev) { err = -ENODEV; goto err_out; } memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, &attr->active_width); attr->state = ib_get_curr_port_state(ndev); if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; else if (dev_get_flags(ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; mutex_unlock(&rxe->usdev_lock); dev_put(ndev); return ret; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_query_gid(struct ib_device *ibdev, u32 port, int idx, union ib_gid *gid) { struct rxe_dev *rxe = to_rdev(ibdev); /* subnet_prefix == interface_id == 0; */ memset(gid, 0, sizeof(*gid)); memcpy(gid->raw, rxe->raw_gid, ETH_ALEN); return 0; } static int rxe_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index, u16 *pkey) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (index != 0) { err = -EINVAL; rxe_dbg_dev(rxe, "bad pkey index = 
%d\n", index); goto err_out; } *pkey = IB_DEFAULT_PKEY_FULL; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *attr) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | IB_DEVICE_MODIFY_NODE_DESC)) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported mask = 0x%x\n", mask); goto err_out; } if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); if (mask & IB_DEVICE_MODIFY_NODE_DESC) { memcpy(rxe->ib_dev.node_desc, attr->node_desc, sizeof(rxe->ib_dev.node_desc)); } return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_port(struct ib_device *ibdev, u32 port_num, int mask, struct ib_port_modify *attr) { struct rxe_dev *rxe = to_rdev(ibdev); struct rxe_port *port; int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } //TODO is shutdown useful if (mask & ~(IB_PORT_RESET_QKEY_CNTR)) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported mask = 0x%x\n", mask); goto err_out; } port = &rxe->port; port->attr.port_cap_flags |= attr->set_port_cap_mask; port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; if (mask & IB_PORT_RESET_QKEY_CNTR) port->attr.qkey_viol_cntr = 0; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static enum rdma_link_layer rxe_get_link_layer(struct ib_device *ibdev, u32 port_num) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } return IB_LINK_LAYER_ETHERNET; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { struct rxe_dev *rxe = to_rdev(ibdev); struct ib_port_attr attr = {}; int err; if 
(port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } err = ib_query_port(ibdev, port_num, &attr); if (err) goto err_out; immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } /* uc */ static int rxe_alloc_ucontext(struct ib_ucontext *ibuc, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibuc->device); struct rxe_ucontext *uc = to_ruc(ibuc); int err; err = rxe_add_to_pool(&rxe->uc_pool, uc); if (err) rxe_err_dev(rxe, "unable to create uc\n"); return err; } static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) { struct rxe_ucontext *uc = to_ruc(ibuc); int err; err = rxe_cleanup(uc); if (err) rxe_err_uc(uc, "cleanup failed, err = %d\n", err); } /* pd */ static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); int err; err = rxe_add_to_pool(&rxe->pd_pool, pd); if (err) { rxe_dbg_dev(rxe, "unable to alloc pd\n"); goto err_out; } return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_pd *pd = to_rpd(ibpd); int err; err = rxe_cleanup(pd); if (err) rxe_err_pd(pd, "cleanup failed, err = %d\n", err); return 0; } /* ah */ static int rxe_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); struct rxe_create_ah_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { /* test if new user provider */ if (udata->outlen >= sizeof(*uresp)) uresp = udata->outbuf; ah->is_user = true; } else { ah->is_user = false; } err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, init_attr->flags & 
RDMA_CREATE_AH_SLEEPABLE); if (err) { rxe_dbg_dev(rxe, "unable to create ah\n"); goto err_out; } /* create index > 0 */ ah->ah_num = ah->elem.index; err = rxe_ah_chk_attr(ah, init_attr->ah_attr); if (err) { rxe_dbg_ah(ah, "bad attr\n"); goto err_cleanup; } if (uresp) { /* only if new user provider */ err = copy_to_user(&uresp->ah_num, &ah->ah_num, sizeof(uresp->ah_num)); if (err) { err = -EFAULT; rxe_dbg_ah(ah, "unable to copy to user\n"); goto err_cleanup; } } else if (ah->is_user) { /* only if old user provider */ ah->ah_num = 0; } rxe_init_av(init_attr->ah_attr, &ah->av); rxe_finalize(ah); return 0; err_cleanup: cleanup_err = rxe_cleanup(ah); if (cleanup_err) rxe_err_ah(ah, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_ah(ah, "returned err = %d\n", err); return err; } static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { struct rxe_ah *ah = to_rah(ibah); int err; err = rxe_ah_chk_attr(ah, attr); if (err) { rxe_dbg_ah(ah, "bad attr\n"); goto err_out; } rxe_init_av(attr, &ah->av); return 0; err_out: rxe_err_ah(ah, "returned err = %d\n", err); return err; } static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { struct rxe_ah *ah = to_rah(ibah); memset(attr, 0, sizeof(*attr)); attr->type = ibah->type; rxe_av_to_attr(&ah->av, attr); return 0; } static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); int err; err = rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); if (err) rxe_err_ah(ah, "cleanup failed, err = %d\n", err); return 0; } /* srq */ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibsrq->device); struct rxe_pd *pd = to_rpd(ibsrq->pd); struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_create_srq_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_err_dev(rxe, "malformed udata\n"); goto err_out; } uresp = 
udata->outbuf; } if (init->srq_type != IB_SRQT_BASIC) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "srq type = %d, not supported\n", init->srq_type); goto err_out; } err = rxe_srq_chk_init(rxe, init); if (err) { rxe_dbg_dev(rxe, "invalid init attributes\n"); goto err_out; } err = rxe_add_to_pool(&rxe->srq_pool, srq); if (err) { rxe_dbg_dev(rxe, "unable to create srq, err = %d\n", err); goto err_out; } rxe_get(pd); srq->pd = pd; err = rxe_srq_from_init(rxe, srq, init, udata, uresp); if (err) { rxe_dbg_srq(srq, "create srq failed, err = %d\n", err); goto err_cleanup; } return 0; err_cleanup: cleanup_err = rxe_cleanup(srq); if (cleanup_err) rxe_err_srq(srq, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_dev *rxe = to_rdev(ibsrq->device); struct rxe_modify_srq_cmd cmd = {}; int err; if (udata) { if (udata->inlen < sizeof(cmd)) { err = -EINVAL; rxe_dbg_srq(srq, "malformed udata\n"); goto err_out; } err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); if (err) { err = -EFAULT; rxe_dbg_srq(srq, "unable to read udata\n"); goto err_out; } } err = rxe_srq_chk_attr(rxe, srq, attr, mask); if (err) { rxe_dbg_srq(srq, "bad init attributes\n"); goto err_out; } err = rxe_srq_from_attr(rxe, srq, attr, mask, &cmd, udata); if (err) { rxe_dbg_srq(srq, "bad attr\n"); goto err_out; } return 0; err_out: rxe_err_srq(srq, "returned err = %d\n", err); return err; } static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) { struct rxe_srq *srq = to_rsrq(ibsrq); int err; if (srq->error) { err = -EINVAL; rxe_dbg_srq(srq, "srq in error state\n"); goto err_out; } attr->max_wr = srq->rq.queue->buf->index_mask; attr->max_sge = srq->rq.max_sge; attr->srq_limit = srq->limit; return 0; err_out: rxe_err_srq(srq, "returned err = %d\n", err); return err; } 
static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { int err = 0; struct rxe_srq *srq = to_rsrq(ibsrq); unsigned long flags; spin_lock_irqsave(&srq->rq.producer_lock, flags); while (wr) { err = post_one_recv(&srq->rq, wr); if (unlikely(err)) break; wr = wr->next; } spin_unlock_irqrestore(&srq->rq.producer_lock, flags); if (err) { *bad_wr = wr; rxe_err_srq(srq, "returned err = %d\n", err); } return err; } static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); int err; err = rxe_cleanup(srq); if (err) rxe_err_srq(srq, "cleanup failed, err = %d\n", err); return 0; } /* qp */ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_pd *pd = to_rpd(ibqp->pd); struct rxe_qp *qp = to_rqp(ibqp); struct rxe_create_qp_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->inlen) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } qp->is_user = true; uresp = udata->outbuf; } else { qp->is_user = false; } if (init->create_flags) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported create_flags, err = %d\n", err); goto err_out; } err = rxe_qp_chk_init(rxe, init); if (err) { rxe_dbg_dev(rxe, "bad init attr, err = %d\n", err); goto err_out; } err = rxe_add_to_pool(&rxe->qp_pool, qp); if (err) { rxe_dbg_dev(rxe, "unable to create qp, err = %d\n", err); goto err_out; } err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibqp->pd, udata); if (err) { rxe_dbg_qp(qp, "create qp failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(qp); return 0; err_cleanup: cleanup_err = rxe_cleanup(qp); if (cleanup_err) rxe_err_qp(qp, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, 
"returned err = %d\n", err); return err; } static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_qp *qp = to_rqp(ibqp); int err; if (mask & ~IB_QP_ATTR_STANDARD_BITS) { err = -EOPNOTSUPP; rxe_dbg_qp(qp, "unsupported mask = 0x%x, err = %d\n", mask, err); goto err_out; } err = rxe_qp_chk_attr(rxe, qp, attr, mask); if (err) { rxe_dbg_qp(qp, "bad mask/attr, err = %d\n", err); goto err_out; } err = rxe_qp_from_attr(qp, attr, mask, udata); if (err) { rxe_dbg_qp(qp, "modify qp failed, err = %d\n", err); goto err_out; } if ((mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH)) qp->src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, qp->ibqp.qp_num, qp->attr.dest_qp_num); return 0; err_out: rxe_err_qp(qp, "returned err = %d\n", err); return err; } static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_qp_init_attr *init) { struct rxe_qp *qp = to_rqp(ibqp); rxe_qp_to_init(qp, init); rxe_qp_to_attr(qp, attr, mask); return 0; } static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct rxe_qp *qp = to_rqp(ibqp); int err; err = rxe_qp_chk_destroy(qp); if (err) { rxe_dbg_qp(qp, "unable to destroy qp, err = %d\n", err); goto err_out; } err = rxe_cleanup(qp); if (err) rxe_err_qp(qp, "cleanup failed, err = %d\n", err); return 0; err_out: rxe_err_qp(qp, "returned err = %d\n", err); return err; } /* send wr */ /* sanity check incoming send work request */ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int *maskp, unsigned int *lengthp) { int num_sge = ibwr->num_sge; struct rxe_sq *sq = &qp->sq; unsigned int mask = 0; unsigned long length = 0; int err = -EINVAL; int i; do { mask = wr_opcode_mask(ibwr->opcode, qp); if (!mask) { rxe_err_qp(qp, "bad wr opcode for qp type\n"); break; } if (num_sge > sq->max_sge) { rxe_err_qp(qp, "num_sge > max_sge\n"); break; } length = 0; for 
(i = 0; i < ibwr->num_sge; i++) length += ibwr->sg_list[i].length; if (length > RXE_PORT_MAX_MSG_SZ) { rxe_err_qp(qp, "message length too long\n"); break; } if (mask & WR_ATOMIC_MASK) { if (length != 8) { rxe_err_qp(qp, "atomic length != 8\n"); break; } if (atomic_wr(ibwr)->remote_addr & 0x7) { rxe_err_qp(qp, "misaligned atomic address\n"); break; } } if (ibwr->send_flags & IB_SEND_INLINE) { if (!(mask & WR_INLINE_MASK)) { rxe_err_qp(qp, "opcode doesn't support inline data\n"); break; } if (length > sq->max_inline) { rxe_err_qp(qp, "inline length too big\n"); break; } } err = 0; } while (0); *maskp = mask; *lengthp = (int)length; return err; } static int init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, const struct ib_send_wr *ibwr) { wr->wr_id = ibwr->wr_id; wr->opcode = ibwr->opcode; wr->send_flags = ibwr->send_flags; if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) { struct ib_ah *ibah = ud_wr(ibwr)->ah; wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; wr->wr.ud.ah_num = to_rah(ibah)->ah_num; if (qp_type(qp) == IB_QPT_GSI) wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; break; case IB_WR_SEND: break; default: rxe_err_qp(qp, "bad wr opcode %d for UD/GSI QP\n", wr->opcode); return -EINVAL; } } else { switch (wr->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; fallthrough; case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; break; case IB_WR_SEND_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; break; case IB_WR_SEND_WITH_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; break; case IB_WR_RDMA_READ_WITH_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; break; case IB_WR_ATOMIC_CMP_AND_SWP: case 
IB_WR_ATOMIC_FETCH_AND_ADD: wr->wr.atomic.remote_addr = atomic_wr(ibwr)->remote_addr; wr->wr.atomic.compare_add = atomic_wr(ibwr)->compare_add; wr->wr.atomic.swap = atomic_wr(ibwr)->swap; wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey; break; case IB_WR_LOCAL_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; break; case IB_WR_REG_MR: wr->wr.reg.mr = reg_wr(ibwr)->mr; wr->wr.reg.key = reg_wr(ibwr)->key; wr->wr.reg.access = reg_wr(ibwr)->access; break; case IB_WR_SEND: case IB_WR_BIND_MW: case IB_WR_FLUSH: case IB_WR_ATOMIC_WRITE: break; default: rxe_err_qp(qp, "unsupported wr opcode %d\n", wr->opcode); return -EINVAL; } } return 0; } static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, const struct ib_send_wr *ibwr) { struct ib_sge *sge = ibwr->sg_list; u8 *p = wqe->dma.inline_data; int i; for (i = 0; i < ibwr->num_sge; i++, sge++) { memcpy(p, ib_virt_dma_to_ptr(sge->addr), sge->length); p += sge->length; } } static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int mask, unsigned int length, struct rxe_send_wqe *wqe) { int num_sge = ibwr->num_sge; int err; err = init_send_wr(qp, &wqe->wr, ibwr); if (err) return err; /* local operation */ if (unlikely(mask & WR_LOCAL_OP_MASK)) { wqe->mask = mask; wqe->state = wqe_state_posted; return 0; } if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) copy_inline_data_to_wqe(wqe, ibwr); else memcpy(wqe->dma.sge, ibwr->sg_list, num_sge * sizeof(struct ib_sge)); wqe->iova = mask & WR_ATOMIC_MASK ? atomic_wr(ibwr)->remote_addr : mask & WR_READ_OR_WRITE_MASK ? 
rdma_wr(ibwr)->remote_addr : 0; wqe->mask = mask; wqe->dma.length = length; wqe->dma.resid = length; wqe->dma.num_sge = num_sge; wqe->dma.cur_sge = 0; wqe->dma.sge_offset = 0; wqe->state = wqe_state_posted; wqe->ssn = atomic_add_return(1, &qp->ssn); return 0; } static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr) { int err; struct rxe_sq *sq = &qp->sq; struct rxe_send_wqe *send_wqe; unsigned int mask; unsigned int length; int full; err = validate_send_wr(qp, ibwr, &mask, &length); if (err) return err; full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { rxe_err_qp(qp, "send queue full\n"); return -ENOMEM; } send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_FROM_ULP); err = init_send_wqe(qp, ibwr, mask, length, send_wqe); if (!err) queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP); return err; } static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *ibwr, const struct ib_send_wr **bad_wr) { int err = 0; unsigned long flags; int good = 0; spin_lock_irqsave(&qp->sq.sq_lock, flags); while (ibwr) { err = post_one_send(qp, ibwr); if (err) { *bad_wr = ibwr; break; } else { good++; } ibwr = ibwr->next; } spin_unlock_irqrestore(&qp->sq.sq_lock, flags); /* kickoff processing of any posted wqes */ if (good) rxe_sched_task(&qp->send_task); return err; } static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { struct rxe_qp *qp = to_rqp(ibqp); int err; unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed\n"); return -EINVAL; } if (unlikely(qp_state(qp) < IB_QPS_RTS)) { spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_err_qp(qp, "qp not ready to send\n"); return -EINVAL; } spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->is_user) { /* Utilize process context to do 
protocol processing */ rxe_sched_task(&qp->send_task); } else { err = rxe_post_send_kernel(qp, wr, bad_wr); if (err) return err; } return 0; } /* recv wr */ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) { int i; unsigned long length; struct rxe_recv_wqe *recv_wqe; int num_sge = ibwr->num_sge; int full; int err; full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { err = -ENOMEM; rxe_dbg("queue full\n"); goto err_out; } if (unlikely(num_sge > rq->max_sge)) { err = -EINVAL; rxe_dbg("bad num_sge > max_sge\n"); goto err_out; } length = 0; for (i = 0; i < num_sge; i++) length += ibwr->sg_list[i].length; if (length > RXE_PORT_MAX_MSG_SZ) { err = -EINVAL; rxe_dbg("message length too long\n"); goto err_out; } recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP); recv_wqe->wr_id = ibwr->wr_id; recv_wqe->dma.length = length; recv_wqe->dma.resid = length; recv_wqe->dma.num_sge = num_sge; recv_wqe->dma.cur_sge = 0; recv_wqe->dma.sge_offset = 0; memcpy(recv_wqe->dma.sge, ibwr->sg_list, num_sge * sizeof(struct ib_sge)); queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP); return 0; err_out: rxe_dbg("returned err = %d\n", err); return err; } static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { int err = 0; struct rxe_qp *qp = to_rqp(ibqp); struct rxe_rq *rq = &qp->rq; unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed\n"); return -EINVAL; } /* see C10-97.2.1 */ if (unlikely((qp_state(qp) < IB_QPS_INIT))) { spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_dbg_qp(qp, "qp not ready to post recv\n"); return -EINVAL; } spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(qp->srq)) { *bad_wr = wr; rxe_dbg_qp(qp, "qp has srq, use post_srq_recv instead\n"); return -EINVAL; } 
spin_lock_irqsave(&rq->producer_lock, flags); while (wr) { err = post_one_recv(rq, wr); if (unlikely(err)) { *bad_wr = wr; break; } wr = wr->next; } spin_unlock_irqrestore(&rq->producer_lock, flags); spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->recv_task); spin_unlock_irqrestore(&qp->state_lock, flags); return err; } /* cq */ static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs) { struct ib_udata *udata = &attrs->driver_udata; struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); struct rxe_cq *cq = to_rcq(ibcq); struct rxe_create_cq_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } uresp = udata->outbuf; } if (attr->flags) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "bad attr->flags, err = %d\n", err); goto err_out; } err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); if (err) { rxe_dbg_dev(rxe, "bad init attributes, err = %d\n", err); goto err_out; } err = rxe_add_to_pool(&rxe->cq_pool, cq); if (err) { rxe_dbg_dev(rxe, "unable to create cq, err = %d\n", err); goto err_out; } err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, uresp); if (err) { rxe_dbg_cq(cq, "create cq failed, err = %d\n", err); goto err_cleanup; } return 0; err_cleanup: cleanup_err = rxe_cleanup(cq); if (cleanup_err) rxe_err_cq(cq, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); struct rxe_dev *rxe = to_rdev(ibcq->device); struct rxe_resize_cq_resp __user *uresp = NULL; int err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_cq(cq, "malformed udata\n"); goto err_out; } uresp = udata->outbuf; } err = 
rxe_cq_chk_attr(rxe, cq, cqe, 0); if (err) { rxe_dbg_cq(cq, "bad attr, err = %d\n", err); goto err_out; } err = rxe_cq_resize_queue(cq, cqe, uresp, udata); if (err) { rxe_dbg_cq(cq, "resize cq failed, err = %d\n", err); goto err_out; } return 0; err_out: rxe_err_cq(cq, "returned err = %d\n", err); return err; } static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { int i; struct rxe_cq *cq = to_rcq(ibcq); struct rxe_cqe *cqe; unsigned long flags; spin_lock_irqsave(&cq->cq_lock, flags); for (i = 0; i < num_entries; i++) { cqe = queue_head(cq->queue, QUEUE_TYPE_TO_ULP); if (!cqe) break; /* queue empty */ memcpy(wc++, &cqe->ibwc, sizeof(*wc)); queue_advance_consumer(cq->queue, QUEUE_TYPE_TO_ULP); } spin_unlock_irqrestore(&cq->cq_lock, flags); return i; } static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) { struct rxe_cq *cq = to_rcq(ibcq); int count; count = queue_count(cq->queue, QUEUE_TYPE_TO_ULP); return (count > wc_cnt) ? wc_cnt : count; } static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct rxe_cq *cq = to_rcq(ibcq); int ret = 0; int empty; unsigned long irq_flags; spin_lock_irqsave(&cq->cq_lock, irq_flags); cq->notify |= flags & IB_CQ_SOLICITED_MASK; empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP); if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) ret = 1; spin_unlock_irqrestore(&cq->cq_lock, irq_flags); return ret; } static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); int err; /* See IBA C11-17: The CI shall return an error if this Verb is * invoked while a Work Queue is still associated with the CQ. 
*/ if (atomic_read(&cq->num_wq)) { err = -EINVAL; rxe_dbg_cq(cq, "still in use\n"); goto err_out; } err = rxe_cleanup(cq); if (err) rxe_err_cq(cq, "cleanup failed, err = %d\n", err); return 0; err_out: rxe_err_cq(cq, "returned err = %d\n", err); return err; } /* mr */ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { rxe_dbg_dev(rxe, "unable to create mr\n"); goto err_free; } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; rxe_mr_init_dma(access, mr); rxe_finalize(mr); return &mr->ibmr; err_free: kfree(mr); rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova, int access, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err, cleanup_err; if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_pd(pd, "access = %#x not supported (%#x)\n", access, RXE_ACCESS_SUPPORTED_MR); return ERR_PTR(-EOPNOTSUPP); } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { rxe_dbg_pd(pd, "unable to create mr\n"); goto err_free; } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; if (access & IB_ACCESS_ON_DEMAND) err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr); else err = rxe_mr_init_user(rxe, start, length, access, mr); if (err) { rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(mr); return &mr->ibmr; err_cleanup: cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", cleanup_err); err_free: kfree(mr); rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } 
static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length, u64 iova, int access, struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_mr *mr = to_rmr(ibmr); struct rxe_pd *old_pd = to_rpd(ibmr->pd); struct rxe_pd *pd = to_rpd(ibpd); /* for now only support the two easy cases: * rereg_pd and rereg_access */ if (flags & ~RXE_MR_REREG_SUPPORTED) { rxe_err_mr(mr, "flags = %#x not supported\n", flags); return ERR_PTR(-EOPNOTSUPP); } if (flags & IB_MR_REREG_PD) { rxe_put(old_pd); rxe_get(pd); mr->ibmr.pd = ibpd; } if (flags & IB_MR_REREG_ACCESS) { if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_mr(mr, "access = %#x not supported\n", access); return ERR_PTR(-EOPNOTSUPP); } mr->access = access; } return NULL; } static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err, cleanup_err; if (mr_type != IB_MR_TYPE_MEM_REG) { err = -EINVAL; rxe_dbg_pd(pd, "mr type %d not supported, err = %d\n", mr_type, err); goto err_out; } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; err = rxe_mr_init_fast(max_num_sg, mr); if (err) { rxe_dbg_mr(mr, "alloc_mr failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(mr); return &mr->ibmr; err_cleanup: cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", err); err_free: kfree(mr); err_out: rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct rxe_mr *mr = to_rmr(ibmr); int err, cleanup_err; /* See IBA 10.6.7.2.6 */ if (atomic_read(&mr->num_mw) > 0) { err = -EINVAL; rxe_dbg_mr(mr, "mr has mw's bound\n"); goto err_out; } cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, 
"cleanup failed, err = %d\n", cleanup_err); kfree_rcu_mightsleep(mr); return 0; err_out: rxe_err_mr(mr, "returned err = %d\n", err); return err; } static ssize_t parent_show(struct device *device, struct device_attribute *attr, char *buf) { struct rxe_dev *rxe = rdma_device_to_drv_device(device, struct rxe_dev, ib_dev); return sysfs_emit(buf, "%s\n", rxe_parent_name(rxe, 1)); } static DEVICE_ATTR_RO(parent); static struct attribute *rxe_dev_attributes[] = { &dev_attr_parent.attr, NULL }; static const struct attribute_group rxe_attr_group = { .attrs = rxe_dev_attributes, }; static int rxe_enable_driver(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); struct net_device *ndev; ndev = rxe_ib_device_get_netdev(ib_dev); if (!ndev) return -ENODEV; rxe_set_port_state(rxe); dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev)); dev_put(ndev); return 0; } static const struct ib_device_ops rxe_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_RXE, .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, .alloc_hw_port_stats = rxe_ib_alloc_hw_port_stats, .alloc_mr = rxe_alloc_mr, .alloc_mw = rxe_alloc_mw, .alloc_pd = rxe_alloc_pd, .alloc_ucontext = rxe_alloc_ucontext, .attach_mcast = rxe_attach_mcast, .create_ah = rxe_create_ah, .create_cq = rxe_create_cq, .create_qp = rxe_create_qp, .create_srq = rxe_create_srq, .create_user_ah = rxe_create_ah, .dealloc_driver = rxe_dealloc, .dealloc_mw = rxe_dealloc_mw, .dealloc_pd = rxe_dealloc_pd, .dealloc_ucontext = rxe_dealloc_ucontext, .dereg_mr = rxe_dereg_mr, .destroy_ah = rxe_destroy_ah, .destroy_cq = rxe_destroy_cq, .destroy_qp = rxe_destroy_qp, .destroy_srq = rxe_destroy_srq, .detach_mcast = rxe_detach_mcast, .device_group = &rxe_attr_group, .enable_driver = rxe_enable_driver, .get_dma_mr = rxe_get_dma_mr, .get_hw_stats = rxe_ib_get_hw_stats, .get_link_layer = rxe_get_link_layer, .get_port_immutable = rxe_port_immutable, .map_mr_sg = rxe_map_mr_sg, .mmap = rxe_mmap, .modify_ah = 
rxe_modify_ah, .modify_device = rxe_modify_device, .modify_port = rxe_modify_port, .modify_qp = rxe_modify_qp, .modify_srq = rxe_modify_srq, .peek_cq = rxe_peek_cq, .poll_cq = rxe_poll_cq, .post_recv = rxe_post_recv, .post_send = rxe_post_send, .post_srq_recv = rxe_post_srq_recv, .query_ah = rxe_query_ah, .query_device = rxe_query_device, .query_pkey = rxe_query_pkey, .query_gid = rxe_query_gid, .query_port = rxe_query_port, .query_qp = rxe_query_qp, .query_srq = rxe_query_srq, .reg_user_mr = rxe_reg_user_mr, .req_notify_cq = rxe_req_notify_cq, .rereg_user_mr = rxe_rereg_user_mr, .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_qp, rxe_qp, ibqp), INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), }; int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, struct net_device *ndev) { int err; struct ib_device *dev = &rxe->ib_dev; strscpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); dev->node_type = RDMA_NODE_IB_CA; dev->phys_port_cnt = 1; dev->num_comp_vectors = num_possible_cpus(); dev->local_dma_lkey = 0; addrconf_addr_eui48((unsigned char *)&dev->node_guid, rxe->raw_gid); dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); ib_set_device_ops(dev, &rxe_dev_ops); err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1); if (err) return err; err = ib_register_device(dev, ibdev_name, NULL); if (err) rxe_dbg_dev(rxe, "failed with error %d\n", err); /* * Note that rxe may be invalid at this point if another thread * unregistered it. */ return err; }
11255 12468 1139 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Sleepable Read-Copy Update mechanism for mutual exclusion * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * * Author: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU/ *.txt * */ #ifndef _LINUX_SRCU_H #define _LINUX_SRCU_H #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/rcu_segcblist.h> struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); #define init_srcu_struct(ssp) \ ({ \ static struct lock_class_key __srcu_key; \ \ __init_srcu_struct((ssp), #ssp, &__srcu_key); \ }) #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *ssp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */ #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). #define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). #define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // All of the above. #define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // Flavors requiring synchronize_rcu() // instead of smp_mb(). 
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); #ifdef CONFIG_TINY_SRCU #include <linux/srcutiny.h> #elif defined(CONFIG_TREE_SRCU) #include <linux/srcutree.h> #else #error "Unknown SRCU implementation specified to kernel configuration" #endif void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); void cleanup_srcu_struct(struct srcu_struct *ssp); void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 /** * get_completed_synchronize_srcu - Return a pre-completed polled state cookie * * Returns a value that poll_state_synchronize_srcu() will always treat * as a cookie whose grace period has already completed. */ static inline unsigned long get_completed_synchronize_srcu(void) { return SRCU_GET_STATE_COMPLETED; } unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); // Maximum number of unsigned long values corresponding to // not-yet-completed SRCU grace periods. #define NUM_ACTIVE_SRCU_POLL_OLDSTATE 2 /** * same_state_synchronize_srcu - Are two old-state values identical? * @oldstate1: First old-state value. * @oldstate2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_srcu(), start_poll_synchronize_srcu(), or * get_completed_synchronize_srcu(). Returns @true if the two values are * identical and @false otherwise. This allows structures whose lifetimes * are tracked by old-state values to push these values to a list header, * allowing those structures to be slightly smaller. 
*/ static inline bool same_state_synchronize_srcu(unsigned long oldstate1, unsigned long oldstate2) { return oldstate1 == oldstate2; } #ifdef CONFIG_NEED_SRCU_NMI_SAFE int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); #else static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { return __srcu_read_lock(ssp); } static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { __srcu_read_unlock(ssp, idx); } #endif /* CONFIG_NEED_SRCU_NMI_SAFE */ void srcu_init(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * srcu_read_lock_held - might we be in SRCU read-side critical section? * @ssp: The srcu_struct structure to check * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. * * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. * * Note that SRCU is based on its own statemachine and it doesn't * relies on normal RCU, it can be called from the CPU which * is in the idle loop from an RCU point of view or offline. */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { if (!debug_lockdep_rcu_enabled()) return 1; return lock_is_held(&ssp->dep_map); } /* * Annotations provide deadlock detection for SRCU. * * Similar to other lockdep annotations, except there is an additional * srcu_lock_sync(), which is basically an empty *write*-side critical section, * see lock_sync() for more information. 
*/ /* Annotates a srcu_read_lock() */ static inline void srcu_lock_acquire(struct lockdep_map *map) { lock_map_acquire_read(map); } /* Annotates a srcu_read_lock() */ static inline void srcu_lock_release(struct lockdep_map *map) { lock_map_release(map); } /* Annotates a synchronize_srcu() */ static inline void srcu_lock_sync(struct lockdep_map *map) { lock_map_sync(map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { return 1; } #define srcu_lock_acquire(m) do { } while (0) #define srcu_lock_release(m) do { } while (0) #define srcu_lock_sync(m) do { } while (0) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * @c: condition to check for update-side use * * If PROVE_RCU is enabled, invoking this outside of an RCU read-side * critical section will result in an RCU-lockdep splat, unless @c evaluates * to 1. The @c argument will normally be a logical expression containing * lockdep_is_held() calls. */ #define srcu_dereference_check(p, ssp, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || srcu_read_lock_held(ssp), __rcu) /** * srcu_dereference - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU * is enabled, invoking this outside of an RCU read-side critical * section will result in an RCU-lockdep splat. 
*/ #define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0) /** * srcu_dereference_notrace - no tracing and no lockdep calls from here * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. */ #define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1) /** * srcu_read_lock - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section. Note that SRCU read-side * critical sections may be nested. However, it is illegal to * call anything that waits on an SRCU grace period for the same * srcu_struct, whether directly or indirectly. Please note that * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * * The return value from srcu_read_lock() is guaranteed to be * non-negative. This value must be passed unaltered to the matching * srcu_read_unlock(). Note that srcu_read_lock() and the matching * srcu_read_unlock() must occur in the same context, for example, it is * illegal to invoke srcu_read_unlock() in an irq handler if the matching * srcu_read_lock() was invoked in process context. Or, for that matter to * invoke srcu_read_unlock() from one task and the matching srcu_read_lock() * from another. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; } /** * srcu_read_lock_fast - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but for a light-weight * smp_mb()-free reader. See srcu_read_lock() for more information. 
*
 * If srcu_read_lock_fast() is ever used on an srcu_struct structure,
 * then none of the other flavors may be used, whether before, during,
 * or after.  Note that grace-period auto-expediting is disabled for _fast
 * srcu_struct structures because auto-expedited grace periods invoke
 * synchronize_rcu_expedited(), IPIs and all.
 *
 * Note that srcu_read_lock_fast() can be invoked only from those contexts
 * where RCU is watching, that is, from contexts where it would be legal
 * to invoke rcu_read_lock().  Otherwise, lockdep will complain.
 */
static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp)
{
	struct srcu_ctr __percpu *retval;

	srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
	retval = __srcu_read_lock_fast(ssp);
	/* Lockdep annotation; the value handed back to the caller is unaffected. */
	rcu_try_lock_acquire(&ssp->dep_map);
	return retval;
}

/**
 * srcu_down_read_fast - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter a semaphore-like SRCU read-side critical section, but for
 * a light-weight smp_mb()-free reader.  See srcu_read_lock_fast() and
 * srcu_down_read() for more information.
 *
 * The same srcu_struct may be used concurrently by srcu_down_read_fast()
 * and srcu_read_lock_fast().
 *
 * The returned pointer is whatever __srcu_read_lock_fast() produced.
 */
static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp)
{
	/* The NMI check is only active when CONFIG_PROVE_RCU is enabled. */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
	srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
	/* Unlike srcu_read_lock_fast(), no lockdep acquire annotation is made. */
	return __srcu_read_lock_fast(ssp);
}

/**
 * srcu_read_lock_lite - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section, but for a light-weight
 * smp_mb()-free reader.  See srcu_read_lock() for more information.
 *
 * If srcu_read_lock_lite() is ever used on an srcu_struct structure,
 * then none of the other flavors may be used, whether before, during,
 * or after.
Note that grace-period auto-expediting is disabled for _lite
 * srcu_struct structures because auto-expedited grace periods invoke
 * synchronize_rcu_expedited(), IPIs and all.
 *
 * Note that srcu_read_lock_lite() can be invoked only from those contexts
 * where RCU is watching, that is, from contexts where it would be legal
 * to invoke rcu_read_lock().  Otherwise, lockdep will complain.
 */
static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp)
{
	int retval;

	srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_LITE);
	retval = __srcu_read_lock_lite(ssp);
	/* Lockdep annotation; the index handed back to the caller is unaffected. */
	rcu_try_lock_acquire(&ssp->dep_map);
	return retval;
}

/**
 * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section, but in an NMI-safe manner.
 * See srcu_read_lock() for more information.
 *
 * If srcu_read_lock_nmisafe() is ever used on an srcu_struct structure,
 * then none of the other flavors may be used, whether before, during,
 * or after.
 */
static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp)
{
	int retval;

	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI);
	retval = __srcu_read_lock_nmisafe(ssp);
	rcu_try_lock_acquire(&ssp->dep_map);
	return retval;
}

/*
 * Used by tracing, cannot be traced and cannot invoke lockdep.
 * Same flavor check and __srcu_read_lock() call as srcu_read_lock(), but
 * without the srcu_lock_acquire() annotation.
 */
static inline notrace int
srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
{
	int retval;

	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
	retval = __srcu_read_lock(ssp);
	return retval;
}

/**
 * srcu_down_read - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter a semaphore-like SRCU read-side critical section.  Note that
 * SRCU read-side critical sections may be nested.  However, it is
 * illegal to call anything that waits on an SRCU grace period for the
 * same srcu_struct, whether directly or indirectly.
Please note that
 * one way to indirectly wait on an SRCU grace period is to acquire
 * a mutex that is held elsewhere while calling synchronize_srcu() or
 * synchronize_srcu_expedited().  But if you want lockdep to help you
 * keep this stuff straight, you should instead use srcu_read_lock().
 *
 * The semaphore-like nature of srcu_down_read() means that the matching
 * srcu_up_read() can be invoked from some other context, for example,
 * from some other task or from an irq handler.  However, neither
 * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler.
 *
 * Calls to srcu_down_read() may be nested, similar to the manner in
 * which calls to down_read() may be nested.  The same srcu_struct may be
 * used concurrently by srcu_down_read() and srcu_read_lock().
 */
static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp)
{
	/* Warn (once) on use from NMI context, which the contract above forbids. */
	WARN_ON_ONCE(in_nmi());
	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
	/* No lockdep acquire annotation is made on this path. */
	return __srcu_read_lock(ssp);
}

/**
 * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock().
 *
 * Exit an SRCU read-side critical section.
 */
static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
	__releases(ssp)
{
	/* Warn (once) if @idx has any bit other than bit 0 set. */
	WARN_ON_ONCE(idx & ~0x1);
	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
	/* Lockdep release annotation precedes the actual unlock. */
	srcu_lock_release(&ssp->dep_map);
	__srcu_read_unlock(ssp, idx);
}

/**
 * srcu_read_unlock_fast - unregister a old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @scp: return value from corresponding srcu_read_lock_fast().
 *
 * Exit a light-weight SRCU read-side critical section.
*/
static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
	__releases(ssp)
{
	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
	/* Lockdep release annotation precedes the actual unlock. */
	srcu_lock_release(&ssp->dep_map);
	__srcu_read_unlock_fast(ssp, scp);
}

/**
 * srcu_up_read_fast - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @scp: return value from corresponding srcu_down_read_fast().
 *
 * Exit an SRCU read-side critical section, but not necessarily from
 * the same context as the matching srcu_down_read_fast().
 */
static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
	__releases(ssp)
{
	/* The NMI check is only active when CONFIG_PROVE_RCU is enabled. */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
	/* No lockdep release annotation is made on this path. */
	__srcu_read_unlock_fast(ssp, scp);
}

/**
 * srcu_read_unlock_lite - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock_lite().
 *
 * Exit a light-weight SRCU read-side critical section.
 */
static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
	__releases(ssp)
{
	/* Warn (once) if @idx has any bit other than bit 0 set. */
	WARN_ON_ONCE(idx & ~0x1);
	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE);
	srcu_lock_release(&ssp->dep_map);
	__srcu_read_unlock_lite(ssp, idx);
}

/**
 * srcu_read_unlock_nmisafe - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock_nmisafe().
 *
 * Exit an SRCU read-side critical section, but in an NMI-safe manner.
 */
static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
	__releases(ssp)
{
	/* Warn (once) if @idx has any bit other than bit 0 set. */
	WARN_ON_ONCE(idx & ~0x1);
	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI);
	/* NOTE(review): uses rcu_lock_release() where the other flavors use srcu_lock_release(). */
	rcu_lock_release(&ssp->dep_map);
	__srcu_read_unlock_nmisafe(ssp, idx);
}

/* Used by tracing, cannot be traced and cannot call lockdep.
*/
static inline notrace void
srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
{
	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
	__srcu_read_unlock(ssp, idx);
}

/**
 * srcu_up_read - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_down_read().
 *
 * Exit an SRCU read-side critical section, but not necessarily from
 * the same context as the matching srcu_down_read().
 */
static inline void srcu_up_read(struct srcu_struct *ssp, int idx)
	__releases(ssp)
{
	/* Warn (once) if @idx has any bit other than bit 0 set. */
	WARN_ON_ONCE(idx & ~0x1);
	/* Warn (once) on use from NMI context. */
	WARN_ON_ONCE(in_nmi());
	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
	/* No lockdep release annotation is made on this path. */
	__srcu_read_unlock(ssp, idx);
}

/**
 * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
 *
 * Converts the preceding srcu_read_unlock into a two-way memory barrier.
 *
 * Call this after srcu_read_unlock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_unlock will appear to happen after
 * the preceding srcu_read_unlock.
 */
static inline void smp_mb__after_srcu_read_unlock(void)
{
	/* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}

/**
 * smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock
 *
 * Converts the preceding srcu_read_lock into a two-way memory barrier.
 *
 * Call this after srcu_read_lock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_lock will appear to happen after
 * the preceding srcu_read_lock.
 */
static inline void smp_mb__after_srcu_read_lock(void)
{
	/* __srcu_read_lock has smp_mb() internally so nothing to do here. */
}

/*
 * Scope-based guard named "srcu": the constructor expression stores the
 * srcu_read_lock() return value in the guard's idx field and the destructor
 * expression passes it back to srcu_read_unlock().
 */
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
		    _T->idx = srcu_read_lock(_T->lock),
		    srcu_read_unlock(_T->lock, _T->idx),
		    int idx)

#endif
74 8 16 123 45 218 219 25 34 447 42 46 46 109 109 12 12 2 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 
510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the UDP module. * * Version: @(#)udp.h 1.0.2 05/07/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Alan Cox : Turned on udp checksums. I don't want to * chase 'memory corruption' bugs that aren't! 
*/
#ifndef _UDP_H
#define _UDP_H

#include <linux/list.h>
#include <linux/bug.h>
#include <net/inet_sock.h>
#include <net/gso.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/indirect_call_wrapper.h>

/**
 * struct udp_skb_cb - UDP(-Lite) private variables
 *
 * @header: private variables used by IPv4/IPv6
 * @cscov: checksum coverage length (UDP-Lite only)
 * @partial_cov: if set indicates partial csum coverage
 */
struct udp_skb_cb {
	union {
		struct inet_skb_parm	h4;
#if IS_ENABLED(CONFIG_IPV6)
		struct inet6_skb_parm	h6;
#endif
	} header;
	__u16		cscov;
	__u8		partial_cov;
};
/* View the skb control buffer (skb->cb) as a struct udp_skb_cb. */
#define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))

/**
 * struct udp_hslot - UDP hash slot used by udp_table.hash/hash4
 *
 * @head: head of list of sockets
 * @nulls_head: head of list of sockets, only used by hash4
 * @count: number of sockets in 'head' list
 * @lock: spinlock protecting changes to head/count
 */
struct udp_hslot {
	union {
		struct hlist_head	head;
		/* hash4 uses hlist_nulls to avoid moving wrongly onto another
		 * hlist, because rehash() can happen with lookup().
		 */
		struct hlist_nulls_head	nulls_head;
	};
	int			count;
	spinlock_t		lock;
} __aligned(2 * sizeof(long));

/**
 * struct udp_hslot_main - UDP hash slot used by udp_table.hash2
 *
 * @hslot: basic hash slot
 * @hash4_cnt: number of sockets in hslot4 of the same
 *	(local port, local address)
 */
struct udp_hslot_main {
	struct udp_hslot	hslot; /* must be the first member */
#if !IS_ENABLED(CONFIG_BASE_SMALL)
	u32			hash4_cnt;
#endif
} __aligned(2 * sizeof(long));
/* Cast relies on hslot being the first member of struct udp_hslot_main. */
#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot))

/**
 * struct udp_table - UDP table
 *
 * @hash: hash table, sockets are hashed on (local port)
 * @hash2: hash table, sockets are hashed on (local port, local address)
 * @hash4: hash table, connected sockets are hashed on
 *	(local port, local address, remote port, remote address)
 * @mask: number of slots in hash tables, minus 1
 * @log: log2(number of slots in hash table)
 */
struct udp_table {
	struct udp_hslot	*hash;
	struct udp_hslot_main	*hash2;
#if !IS_ENABLED(CONFIG_BASE_SMALL)
	struct udp_hslot	*hash4;
#endif
	unsigned int		mask;
	unsigned int		log;
};
extern struct udp_table udp_table;
void udp_table_init(struct udp_table *, const char *);

/* Slot of the primary table for port @num in @net; index comes from udp_hashfn(). */
static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
					     const struct net *net,
					     unsigned int num)
{
	return &table->hash[udp_hashfn(net, num, table->mask)];
}

/*
 * For secondary hash, net_hash_mix() is performed before calling
 * udp_hashslot2(), this explains difference with udp_hashslot()
 */
static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
					      unsigned int hash)
{
	return &table->hash2[hash & table->mask].hslot;
}

#if IS_ENABLED(CONFIG_BASE_SMALL)
/* hash4 is compiled out with CONFIG_BASE_SMALL: the helpers below are stubs. */
static inline void udp_table_hash4_init(struct udp_table *table)
{
}

/* Any use that survives to code generation is a build error (BUILD_BUG()). */
static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
					      unsigned int hash)
{
	BUILD_BUG();
	return NULL;
}

static inline bool udp_hashed4(const struct sock *sk)
{
	return false;
}

static inline unsigned int udp_hash4_slot_size(void)
{
	return 0;
}

static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
{
	return false;
}

static inline void udp_hash4_inc(struct udp_hslot *hslot2)
{
}

static inline void udp_hash4_dec(struct udp_hslot *hslot2)
{
}
#else /* !CONFIG_BASE_SMALL */

/* Must be called with table->hash2 initialized */
static inline void udp_table_hash4_init(struct udp_table *table)
{
	/* hash4 begins right after the (mask + 1) entries of hash2. */
	table->hash4 = (void *)(table->hash2 + (table->mask + 1));
	for (int i = 0; i <= table->mask; i++) {
		table->hash2[i].hash4_cnt = 0;

		/* Each nulls list is initialised with its own slot index. */
		INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i);
		table->hash4[i].count = 0;
		spin_lock_init(&table->hash4[i].lock);
	}
}

static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
					      unsigned int hash)
{
	return &table->hash4[hash & table->mask];
}

/* True when the socket's udp_lrpa_node is currently on a nulls list. */
static inline bool udp_hashed4(const struct sock *sk)
{
	return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node);
}

static inline unsigned int udp_hash4_slot_size(void)
{
	return sizeof(struct udp_hslot);
}

/* True when the enclosing udp_hslot_main has a non-zero hash4_cnt. */
static inline bool udp_has_hash4(const struct udp_hslot *hslot2)
{
	return UDP_HSLOT_MAIN(hslot2)->hash4_cnt;
}

static inline void udp_hash4_inc(struct udp_hslot *hslot2)
{
	UDP_HSLOT_MAIN(hslot2)->hash4_cnt++;
}

static inline void udp_hash4_dec(struct udp_hslot *hslot2)
{
	UDP_HSLOT_MAIN(hslot2)->hash4_cnt--;
}
#endif /* CONFIG_BASE_SMALL */

extern struct proto udp_prot;

extern atomic_long_t udp_memory_allocated;
DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);

/* sysctl variables for udp */
extern long sysctl_udp_mem[3];
extern int sysctl_udp_rmem_min;
extern int sysctl_udp_wmem_min;

struct sk_buff;

/*
 *	Generic checksumming routines for UDP(-Lite) v4 and v6
 */
/*
 * Uses __skb_checksum_complete() when the coverage equals skb->len, and
 * __skb_checksum_complete_head() with the coverage length otherwise.
 */
static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb)
{
	return (UDP_SKB_CB(skb)->cscov == skb->len ?
		__skb_checksum_complete(skb) :
		__skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov));
}

/* Skips the computation entirely when skb_csum_unnecessary() is true. */
static inline int udp_lib_checksum_complete(struct sk_buff *skb)
{
	return !skb_csum_unnecessary(skb) &&
		__udp_lib_checksum_complete(skb);
}

/**
 * udp_csum_outgoing - compute UDPv4/v6 checksum over fragments
 * @sk: socket we are writing to
 * @skb: sk_buff containing the filled-in UDP header
 *	(checksum field must be zeroed out)
 */
static inline __wsum udp_csum_outgoing(struct sock *sk, struct sk_buff *skb)
{
	__wsum csum = csum_partial(skb_transport_header(skb),
				   sizeof(struct udphdr), 0);
	/* @skb is reused as the cursor over the socket's write queue. */
	skb_queue_walk(&sk->sk_write_queue, skb) {
		csum = csum_add(csum, skb->csum);
	}
	return csum;
}

/* UDP header checksum seeded with skb->csum, plus the csum of every frag_list skb. */
static inline __wsum udp_csum(struct sk_buff *skb)
{
	__wsum csum = csum_partial(skb_transport_header(skb),
				   sizeof(struct udphdr), skb->csum);

	for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) {
		csum = csum_add(csum, skb->csum);
	}
	return csum;
}

static inline __sum16 udp_v4_check(int len, __be32 saddr,
				   __be32 daddr, __wsum base)
{
	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base);
}

void udp_set_csum(bool nocheck, struct sk_buff *skb,
		  __be32 saddr, __be32 daddr, int len);

/*
 * Pull the UDP header off the skb and shrink the recorded checksum coverage
 * by the same amount.  For CHECKSUM_NONE skbs without csum_valid, the header
 * bytes are first added into skb->csum.
 */
static inline void udp_csum_pull_header(struct sk_buff *skb)
{
	if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE)
		skb->csum = csum_partial(skb->data, sizeof(struct udphdr),
					 skb->csum);
	skb_pull_rcsum(skb, sizeof(struct udphdr));
	UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
}

typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport,
				     __be16 dport);

void udp_v6_early_demux(struct sk_buff *skb);
INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));

struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
				  netdev_features_t features, bool is_ipv6);

/* Common per-socket initialisation of the UDP-private fields. */
static inline void udp_lib_init_sock(struct sock *sk)
{
	struct udp_sock *up = udp_sk(sk);

	skb_queue_head_init(&up->reader_queue);
	INIT_HLIST_NODE(&up->tunnel_list);
	/* Threshold is a quarter of the receive buffer size. */
	up->forward_threshold = sk->sk_rcvbuf >> 2;
	set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
}

/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
/* Calls BUG() unconditionally; the return statement only satisfies the signature. */
static inline int udp_lib_hash(struct sock *sk)
{
	BUG();
	return 0;
}

void udp_lib_unhash(struct sock *sk);
void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4);
u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
		const __be32 faddr, const __be16 fport);

/* @timeout is not used by this implementation. */
static inline void udp_lib_close(struct sock *sk, long timeout)
{
	sk_common_release(sk);
}

/* hash4 routines shared between UDPv4/6 */
#if IS_ENABLED(CONFIG_BASE_SMALL)
static inline void udp_lib_hash4(struct sock *sk, u16 hash)
{
}

static inline void udp4_hash4(struct sock *sk)
{
}
#else /* !CONFIG_BASE_SMALL */
void udp_lib_hash4(struct sock *sk, u16 hash);
void udp4_hash4(struct sock *sk);
#endif /* CONFIG_BASE_SMALL */

int udp_lib_get_port(struct sock *sk, unsigned short snum,
		     unsigned int hash2_nulladdr);

u32 udp_flow_hashrnd(void);

/*
 * Derive a source port in network byte order from the skb's flow hash,
 * scaled into [min, max).  When @min >= @max the range is taken from
 * inet_get_local_port_range() instead.
 */
static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
				       int min, int max, bool use_eth)
{
	u32 hash;

	if (min >= max) {
		/* Use default range */
		inet_get_local_port_range(net, &min, &max);
	}

	hash = skb_get_hash(skb);
	if (unlikely(!hash)) {
		if (use_eth) {
			/* Can't find a normal hash, caller has indicated an
			 * Ethernet packet so use that to compute a hash.
			 */
			hash = jhash(skb->data, 2 * ETH_ALEN,
				     (__force u32) skb->protocol);
		} else {
			/* Can't derive any sort of hash for the packet, set
			 * to some consistent random value.
			 */
			hash = udp_flow_hashrnd();
		}
	}

	/* Since this is being sent on the wire obfuscate hash a bit
	 * to minimize possibility that any useful information to an
	 * attacker is leaked. Only upper 16 bits are relevant in the
	 * computation for 16 bit port value.
	 */
	hash ^= hash << 16;

	/* Multiply-and-shift maps the 32-bit hash onto the (max - min) range. */
	return htons((((u64) hash * (max - min)) >> 32) + min);
}

/* Receive memory accounted to the socket minus the current forward deficit. */
static inline int udp_rqueue_get(struct sock *sk)
{
	return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit);
}

/*
 * With CONFIG_NET_L3_MASTER_DEV the first argument to inet_bound_dev_eq()
 * follows the per-netns udp_l3mdev_accept sysctl; otherwise it is always true.
 */
static inline bool udp_sk_bound_dev_eq(const struct net *net, int bound_dev_if,
				       int dif, int sdif)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept),
				 bound_dev_if, dif, sdif);
#else
	return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
#endif
}

/* net/ipv4/udp.c */
void udp_destruct_common(struct sock *sk);
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off,
			       int *err);

/* Wrapper around __skb_recv_udp() that supplies a zero offset. */
static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
					   int *err)
{
	int off = 0;

	return __skb_recv_udp(sk, flags, &off, err);
}

int udp_v4_early_demux(struct sk_buff *skb);
bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
int udp_err(struct sk_buff *, u32);
int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
void udp_splice_eof(struct socket *sock);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, int *karg);
int udp_init_sock(struct sock *sk);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int __udp_disconnect(struct sock *sk, int flags);
int udp_disconnect(struct sock *sk, int flags);
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
				       netdev_features_t features,
				       bool is_ipv6);
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
		       char __user *optval, int __user *optlen);
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
		       sockptr_t optval, unsigned int optlen,
		       int (*push_pending_frames)(struct sock *));
struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
			     __be32 daddr, __be16 dport, int dif);
struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
			       __be16 sport,
			       __be32 daddr, __be16 dport, int dif, int sdif,
			       struct udp_table *tbl, struct sk_buff *skb);
struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
				 __be16 sport, __be16 dport);
struct sock *udp6_lib_lookup(const struct net *net,
			     const struct in6_addr *saddr, __be16 sport,
			     const struct in6_addr *daddr, __be16 dport,
			     int dif);
struct sock *__udp6_lib_lookup(const struct net *net,
			       const struct in6_addr *saddr, __be16 sport,
			       const struct in6_addr *daddr, __be16 dport,
			       int dif, int sdif, struct udp_table *tbl,
			       struct sk_buff *skb);
struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
				 __be16 sport, __be16 dport);
int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);

/* UDP uses skb->dev_scratch to cache as much information as possible and avoid
 * possibly multiple cache miss on dequeue()
 */
struct udp_dev_scratch {
	/* skb->truesize and the stateless bit are embedded in a single field;
	 * do not use a bitfield since the compiler emits better/smaller code
	 * this way
	 */
	u32 _tsize_state;

#if BITS_PER_LONG == 64
	/* len and the bit needed to compute skb_csum_unnecessary
	 * will be on cold cache lines at recvmsg time.
	 * skb->len can be stored on 16 bits since the udp header has been
	 * already validated and pulled.
	 */
	u16 len;
	bool is_linear;
	bool csum_unnecessary;
#endif
};

/* Reinterpret the storage of skb->dev_scratch as a struct udp_dev_scratch. */
static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb)
{
	return (struct udp_dev_scratch *)&skb->dev_scratch;
}

#if BITS_PER_LONG == 64
/* 64-bit: the three accessors read the values cached in the scratch area. */
static inline unsigned int udp_skb_len(struct sk_buff *skb)
{
	return udp_skb_scratch(skb)->len;
}

static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
{
	return udp_skb_scratch(skb)->csum_unnecessary;
}

static inline bool udp_skb_is_linear(struct sk_buff *skb)
{
	return udp_skb_scratch(skb)->is_linear;
}

#else
/* Otherwise the scratch area has no such fields; query the skb directly. */
static inline unsigned int udp_skb_len(struct sk_buff *skb)
{
	return skb->len;
}

static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb)
{
	return skb_csum_unnecessary(skb);
}

static inline bool udp_skb_is_linear(struct sk_buff *skb)
{
	return !skb_is_nonlinear(skb);
}
#endif

/* Returns 0 when copy_to_iter_full() succeeds, -EFAULT otherwise. */
static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
				  struct iov_iter *to)
{
	return copy_to_iter_full(skb->data + off, len, to) ? 0 : -EFAULT;
}

/*
 * 	SNMP statistics for UDP and UDP-Lite
 */
#define UDP_INC_STATS(net, field, is_udplite)		      do { \
	if (is_udplite) SNMP_INC_STATS((net)->mib.udplite_statistics, field);       \
	else		SNMP_INC_STATS((net)->mib.udp_statistics, field);  }  while(0)
#define __UDP_INC_STATS(net, field, is_udplite) 	      do { \
	if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_statistics, field);         \
	else		__SNMP_INC_STATS((net)->mib.udp_statistics, field);    }  while(0)

#define __UDP6_INC_STATS(net, field, is_udplite)	    do { \
	if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);\
	else		__SNMP_INC_STATS((net)->mib.udp_stats_in6, field);  \
} while(0)
#define UDP6_INC_STATS(net, field, __lite)		    do { \
	if (__lite) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);  \
	else	    SNMP_INC_STATS((net)->mib.udp_stats_in6, field);      \
} while(0)

/*
 * Select the MIB for a socket: UDP vs UDP-Lite via IS_UDPLITE(), and, when
 * IPv6 is enabled, the IPv4 vs IPv6 counters via @ipv4.
 */
#if IS_ENABLED(CONFIG_IPV6)
#define __UDPX_MIB(sk, ipv4)						\
({									\
	ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics :	\
				 sock_net(sk)->mib.udp_statistics) :	\
		(IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 :	\
				 sock_net(sk)->mib.udp_stats_in6);	\
})
#else
#define __UDPX_MIB(sk, ipv4)						\
({									\
	IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics :		\
			 sock_net(sk)->mib.udp_statistics;		\
})
#endif

#define __UDPX_INC_STATS(sk, field) \
	__SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET), field)

#ifdef CONFIG_PROC_FS
struct udp_seq_afinfo {
	sa_family_t			family;
	struct udp_table		*udp_table;
};

struct udp_iter_state {
	struct seq_net_private  p;
	int			bucket;
};

void *udp_seq_start(struct seq_file *seq, loff_t *pos);
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
void udp_seq_stop(struct seq_file *seq, void *v);

extern const struct seq_operations udp_seq_ops;
extern const struct seq_operations udp6_seq_ops;

int udp4_proc_init(void);
void udp4_proc_exit(void);
#endif /* CONFIG_PROC_FS */

int udpv4_offload_init(void);

void udp_init(void);

DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void);
void udp_encap_disable(void);
#if IS_ENABLED(CONFIG_IPV6)
DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
void udpv6_encap_enable(void);
#endif

/*
 * Segment a GSO skb on the receive path.  On success the original skb is
 * consumed and the segment list is returned.  On failure the drop and
 * UDP_MIB_INERRORS counters are bumped by gso_segs, the skb is freed and
 * NULL is returned.
 */
static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
					      struct sk_buff *skb, bool ipv4)
{
	netdev_features_t features = NETIF_F_SG;
	struct sk_buff *segs;

	/* Avoid csum recalculation by skb_segment unless userspace explicitly
	 * asks for the final checksum values
	 */
	if (!inet_get_convert_csum(sk))
		features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;

	/* UDP segmentation expects packets of type CHECKSUM_PARTIAL or
	 * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial
	 * packets in udp_gro_complete_segment. As does UDP GSO, verified by
	 * udp_send_skb. But when those packets are looped in dev_loopback_xmit
	 * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY.
	 * Reset in this specific case, where PARTIAL is both correct and
	 * required.
	 */
	if (skb->pkt_type == PACKET_LOOPBACK)
		skb->ip_summed = CHECKSUM_PARTIAL;

	/* the GSO CB lies after the UDP one, no need to save and restore any
	 * CB fragment
	 */
	segs = __skb_gso_segment(skb, features, false);
	if (IS_ERR_OR_NULL(segs)) {
		int segs_nr = skb_shinfo(skb)->gso_segs;

		atomic_add(segs_nr, &sk->sk_drops);
		SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr);
		kfree_skb(skb);
		return NULL;
	}

	consume_skb(skb);
	return segs;
}

static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
{
	/* UDP-lite can't land here - no GRO */
	WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov);

	/* UDP packets generated with UDP_SEGMENT and traversing:
	 *
	 * UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx)
	 *
	 * can reach an UDP socket with CHECKSUM_NONE, because
	 * __iptunnel_pull_header() converts CHECKSUM_PARTIAL into NONE.
	 * SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets with no UDP tunnel will
	 * have a valid checksum, as the GRO engine validates the UDP csum
	 * before the aggregation and nobody strips such info in between.
	 * Instead of adding another check in the tunnel fastpath, we can force
	 * a valid csum after the segmentation.
	 * Additionally fixup the UDP CB.
	 */
	UDP_SKB_CB(skb)->cscov = skb->len;
	if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid)
		skb->csum_valid = 1;
}

#ifdef CONFIG_BPF_SYSCALL
struct sk_psock;
int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
#endif

#endif	/* _UDP_H */
55 5 134 133 6 96 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 /* SPDX-License-Identifier: GPL-2.0 */ /* * This header is used to share core functionality between the * standalone connection tracking module, and the compatibility layer's use * of connection tracking. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfilter_ipv4/ip_conntrack_core.h */ #ifndef _NF_CONNTRACK_CORE_H #define _NF_CONNTRACK_CORE_H #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_l4proto.h> /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use of connection tracking. */ unsigned int nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state); int nf_conntrack_init_net(struct net *net); void nf_conntrack_cleanup_net(struct net *net); void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list); void nf_conntrack_proto_pernet_init(struct net *net); int nf_conntrack_proto_init(void); void nf_conntrack_proto_fini(void); int nf_conntrack_init_start(void); void nf_conntrack_cleanup_start(void); void nf_conntrack_init_end(void); void nf_conntrack_cleanup_end(void); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig); /* Find a connection corresponding to a tuple. 
*/
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net,
		      const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple);

int __nf_conntrack_confirm(struct sk_buff *skb);

/*
 * Confirm a connection: returns NF_DROP if packet must be dropped.
 *
 * An skb with no conntrack attached is accepted unchanged.  An already
 * confirmed conntrack skips __nf_conntrack_confirm().  Cached events are
 * delivered only when the verdict is NF_ACCEPT and nf_ct_ecache_exist() is
 * true.
 */
static inline int nf_conntrack_confirm(struct sk_buff *skb)
{
	struct nf_conn *ct = (struct nf_conn *)skb_nfct(skb);
	int ret = NF_ACCEPT;

	if (ct) {
		if (!nf_ct_is_confirmed(ct)) {
			ret = __nf_conntrack_confirm(skb);

			/*
			 * Re-read the conntrack from the skb after a successful
			 * confirm.  NOTE(review): presumably because
			 * __nf_conntrack_confirm() may attach a different entry
			 * to the skb -- confirm against its implementation.
			 */
			if (ret == NF_ACCEPT)
				ct = (struct nf_conn *)skb_nfct(skb);
		}

		if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct))
			nf_ct_deliver_cached_events(ct);
	}
	return ret;
}

unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state);

void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_l4proto *proto);

#define CONNTRACK_LOCKS 1024

extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
void nf_conntrack_lock(spinlock_t *lock);

extern spinlock_t nf_conntrack_expect_lock;

/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */

/*
 * Set ct->timeout from @timeout, clamped to INT_MAX.  For a confirmed entry
 * the value is stored relative to nfct_time_stamp using WRITE_ONCE(); for an
 * unconfirmed entry the clamped value is stored as-is with a plain write.
 */
static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
{
	if (timeout > INT_MAX)
		timeout = INT_MAX;

	if (nf_ct_is_confirmed(ct))
		WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
	else
		ct->timeout = (u32)timeout;
}

int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout);
void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off);
int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status);

#endif /* _NF_CONNTRACK_CORE_H */
2 1 1 2 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409
// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 */

#include <linux/module.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip6_fib.h>
#include <net/ip6_checksum.h>
#include <net/netfilter/ipv6/nf_reject.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>

/*
 * Decide whether the transport checksum of @skb is acceptable.
 *
 * Returns true when the skb is flagged as not needing verification, when
 * nf_reject_verify_csum() reports that this protocol is not to be verified,
 * or when nf_ip6_checksum() returns 0.  Returns false when trimming the skb
 * to the length announced in the IPv6 header fails, when the transport
 * header cannot be located inside the skb, or when the fragment-offset bits
 * returned by ipv6_skip_exthdr() are non-zero.
 *
 * Note: this may shrink @skb (pskb_trim_rcsum), which is why the IPv6 header
 * pointer is re-read afterwards.
 */
static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	int thoff;
	__be16 fo;
	u8 proto = ip6h->nexthdr;

	if (skb_csum_unnecessary(skb))
		return true;

	/* A zero payload_len is left alone; only a non-zero one triggers the trim. */
	if (ip6h->payload_len &&
	    pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
		return false;

	ip6h = ipv6_hdr(skb);
	thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
	if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
		return false;

	if (!nf_reject_verify_csum(skb, thoff, proto))
		return true;

	return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
}

/*
 * Basic sanity check of the IPv6 header at the network header of @skb.
 *
 * Returns 1 when a full struct ipv6hdr can be pulled, the version field is 6
 * and header + payload_len does not exceed skb->len; returns 0 otherwise.
 */
static int nf_reject_ip6hdr_validate(struct sk_buff *skb)
{
	struct ipv6hdr *hdr;
	u32 pkt_len;

	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
		return 0;

	hdr = ipv6_hdr(skb);
	if (hdr->version != 6)
		return 0;

	pkt_len = ntohs(hdr->payload_len);
	if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
		return 0;

	return 1;
}

/*
 * Build (but do not transmit) a TCP RST skb answering @oldskb.
 *
 * @net:    namespace whose devconf_all->hop_limit is used for the new header
 * @oldskb: packet being rejected
 * @dev:    device stored in the new skb's ->dev
 * @hook:   netfilter hook number, forwarded to the checksum validation
 *
 * Returns the newly allocated skb, or NULL when the original IPv6/TCP
 * headers do not validate or the allocation fails.
 */
struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net,
					   struct sk_buff *oldskb,
					   const struct net_device *dev,
					   int hook)
{
	struct sk_buff *nskb;
	const struct tcphdr *oth;
	struct tcphdr _oth;
	unsigned int otcplen;
	struct ipv6hdr *nip6h;

	if (!nf_reject_ip6hdr_validate(oldskb))
		return NULL;

	oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook);
	if (!oth)
		return NULL;

	nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) +
			 LL_MAX_HEADER, GFP_ATOMIC);
	if (!nskb)
		return NULL;

	nskb->dev = (struct net_device *)dev;

	skb_reserve(nskb, LL_MAX_HEADER);
	nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
				     READ_ONCE(net->ipv6.devconf_all->hop_limit));
	nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen);
	/* payload_len can only be filled in once the TCP header has been added. */
	nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));

	return nskb;
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset);

/*
 * Build (but do not transmit) an ICMPv6 destination-unreachable skb that
 * quotes the beginning of @oldskb.
 *
 * @net:    namespace whose devconf_all->hop_limit is used for the new header
 * @oldskb: packet being rejected
 * @dev:    device stored in the new skb's ->dev
 * @hook:   netfilter hook number, forwarded to the checksum validation
 * @code:   value written to the ICMPv6 code field
 *
 * Returns the newly allocated skb, or NULL when the original packet fails
 * validation, cannot be pulled, fails the checksum check, or when the
 * allocation fails.
 */
struct sk_buff *nf_reject_skb_v6_unreach(struct net *net,
					 struct sk_buff *oldskb,
					 const struct net_device *dev,
					 int hook, u8 code)
{
	struct sk_buff *nskb;
	struct ipv6hdr *nip6h;
	struct icmp6hdr *icmp6h;
	unsigned int len;

	if (!nf_reject_ip6hdr_validate(oldskb))
		return NULL;

	/* Include "As much of invoking packet as possible without the ICMPv6
	 * packet exceeding the minimum IPv6 MTU" in the ICMP payload.
	 */
	len = min_t(unsigned int, 1220, oldskb->len);

	/* The quoted bytes are copied linearly below, so make them linear. */
	if (!pskb_may_pull(oldskb, len))
		return NULL;

	if (!nf_reject_v6_csum_ok(oldskb, hook))
		return NULL;

	nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) +
			 LL_MAX_HEADER + len, GFP_ATOMIC);
	if (!nskb)
		return NULL;

	nskb->dev = (struct net_device *)dev;

	skb_reserve(nskb, LL_MAX_HEADER);
	nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6,
				     READ_ONCE(net->ipv6.devconf_all->hop_limit));

	skb_reset_transport_header(nskb);
	icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr));
	icmp6h->icmp6_type = ICMPV6_DEST_UNREACH;
	icmp6h->icmp6_code = code;

	skb_put_data(nskb, skb_network_header(oldskb), len);
	nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));

	/* Checksum covers the ICMPv6 header plus the quoted packet bytes. */
	icmp6h->icmp6_cksum =
		csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
				nskb->len - sizeof(struct ipv6hdr),
				IPPROTO_ICMPV6,
				csum_partial(icmp6h,
					     nskb->len - sizeof(struct ipv6hdr),
					     0));

	return nskb;
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach);

/*
 * Locate and validate the TCP header of @oldskb.
 *
 * @oldskb:  packet being inspected
 * @otcph:   caller-provided buffer handed to skb_header_pointer()
 * @otcplen: out: number of bytes from the TCP header to the end of the skb
 * @hook:    netfilter hook number, forwarded to nf_ip6_checksum()
 *
 * Returns a pointer to the TCP header (as returned by skb_header_pointer()),
 * or NULL when the header cannot be found, the protocol is not TCP, the
 * segment is shorter than a TCP header, the RST flag is set, or the checksum
 * check fails.  *otcplen may already have been written when NULL is returned.
 */
const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
					      struct tcphdr *otcph,
					      unsigned int *otcplen, int hook)
{
	const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
	u8 proto;
	__be16 frag_off;
	int tcphoff;

	proto = oip6h->nexthdr;
	tcphoff = ipv6_skip_exthdr(oldskb, ((u8 *)(oip6h + 1) - oldskb->data),
				   &proto, &frag_off);

	if ((tcphoff < 0) || (tcphoff > oldskb->len)) {
		pr_debug("Cannot get TCP header.\n");
		return NULL;
	}

	*otcplen = oldskb->len - tcphoff;

	/* IP header checks: fragment, too short. */
	if (proto != IPPROTO_TCP || *otcplen < sizeof(struct tcphdr)) {
		pr_debug("proto(%d) != IPPROTO_TCP or too short (len = %d)\n",
			 proto, *otcplen);
		return NULL;
	}

	otcph = skb_header_pointer(oldskb, tcphoff, sizeof(struct tcphdr),
				   otcph);
	if (otcph == NULL)
		return NULL;

	/* No RST for RST. */
	if (otcph->rst) {
		pr_debug("RST is set\n");
		return NULL;
	}

	/* Check checksum. */
	if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) {
		pr_debug("TCP checksum is invalid\n");
		return NULL;
	}

	return otcph;
}
EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get);

/*
 * Append an IPv6 header to @nskb that replies to @oldskb.
 *
 * Source and destination addresses are those of @oldskb swapped, the traffic
 * class is DEFAULT_TOS_VALUE and the flow label argument is 0.  payload_len
 * is not set here; callers fill it in after adding the payload.
 *
 * Returns a pointer to the new header inside @nskb.
 */
struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
				     const struct sk_buff *oldskb,
				     __u8 protocol, int hoplimit)
{
	struct ipv6hdr *ip6h;
	const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
#define DEFAULT_TOS_VALUE	0x0U
	const __u8 tclass = DEFAULT_TOS_VALUE;

	skb_put(nskb, sizeof(struct ipv6hdr));
	skb_reset_network_header(nskb);
	ip6h = ipv6_hdr(nskb);
	ip6_flow_hdr(ip6h, tclass, 0);
	ip6h->hop_limit = hoplimit;
	ip6h->nexthdr = protocol;
	ip6h->saddr = oip6h->daddr;
	ip6h->daddr = oip6h->saddr;

	nskb->protocol = htons(ETH_P_IPV6);

	return ip6h;
}
EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put);

/*
 * Append a bare TCP RST header to @nskb answering the segment described by
 * @oth / @otcplen.
 *
 * Ports are those of @oth swapped.  If the original segment carried ACK, the
 * reply's sequence number is the original ack_seq; otherwise the reply ACKs
 * the original sequence number advanced by SYN + FIN + the data length
 * (otcplen minus the original header length).  The checksum is computed over
 * the IPv6 addresses already present in @nskb's network header, so the IPv6
 * header must have been written first.
 */
void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
			      const struct sk_buff *oldskb,
			      const struct tcphdr *oth, unsigned int otcplen)
{
	struct tcphdr *tcph;

	skb_reset_transport_header(nskb);
	tcph = skb_put_zero(nskb, sizeof(struct tcphdr));
	/* Truncate to length (no data) */
	tcph->doff = sizeof(struct tcphdr)/4;
	tcph->source = oth->dest;
	tcph->dest = oth->source;

	if (oth->ack) {
		tcph->seq = oth->ack_seq;
	} else {
		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
				      otcplen - (oth->doff<<2));
		tcph->ack = 1;
	}

	tcph->rst = 1;

	/* Adjust TCP checksum */
	tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr,
				      &ipv6_hdr(nskb)->daddr,
				      sizeof(struct tcphdr), IPPROTO_TCP,
				      csum_partial(tcph,
						   sizeof(struct tcphdr), 0));
}
EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put);

/*
 * Attach a route towards the sender of @skb_in to @skb_in.
 *
 * Looks up a route whose destination is the packet's source address in the
 * namespace of skb_in->dev.  Returns 0 on success, -1 when no dst was
 * produced by nf_ip6_route().
 */
static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in)
{
	struct dst_entry *dst = NULL;
	struct flowi fl;

	memset(&fl, 0, sizeof(struct flowi));
	fl.u.ip6.daddr = ipv6_hdr(skb_in)->saddr;
	nf_ip6_route(dev_net(skb_in->dev), &dst, &fl, false);
	if (!dst)
		return -1;

	skb_dst_set(skb_in, dst);
	return 0;
}

/*
 * Send a TCP RST in reply to @oldskb.
 *
 * @net:    network namespace used for routing and transmission
 * @sk:     socket passed through to ip6_local_out()
 * @oldskb: packet being rejected
 * @hook:   netfilter hook number the packet was seen at
 *
 * Nothing is sent when either address of @oldskb is not unicast, when the
 * TCP header does not validate, when routing or the xfrm lookup fails, or
 * when the reply skb cannot be allocated.
 */
void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
		    int hook)
{
	const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
	struct dst_entry *dst = NULL;
	const struct tcphdr *otcph;
	struct sk_buff *nskb;
	struct tcphdr _otcph;
	unsigned int otcplen;
	struct flowi6 fl6;

	if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
	    (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) {
		pr_debug("addr is not unicast.\n");
		return;
	}

	otcph = nf_reject_ip6_tcphdr_get(oldskb, &_otcph, &otcplen, hook);
	if (!otcph)
		return;

	/* Flow description of the reply: addresses and ports reversed. */
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_proto = IPPROTO_TCP;
	fl6.saddr = oip6h->daddr;
	fl6.daddr = oip6h->saddr;
	fl6.fl6_sport = otcph->dest;
	fl6.fl6_dport = otcph->source;

	/*
	 * At these two hooks a dst is looked up and attached to oldskb first;
	 * skb_dst(oldskb) is dereferenced right below.
	 */
	if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) {
		nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
		if (!dst)
			return;
		skb_dst_set(oldskb, dst);
	}

	fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
	fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
	security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6));
	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error) {
		dst_release(dst);
		return;
	}
	/* No dst_release() on this error path. */
	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
	if (IS_ERR(dst))
		return;

	nskb = alloc_skb(LL_MAX_HEADER + sizeof(struct ipv6hdr) +
			 sizeof(struct tcphdr) + dst->trailer_len,
			 GFP_ATOMIC);

	if (!nskb) {
		net_dbg_ratelimited("cannot alloc skb\n");
		dst_release(dst);
		return;
	}

	/* From here on the dst is attached to nskb; error paths free nskb only. */
	skb_dst_set(nskb, dst);

	nskb->mark = fl6.flowi6_mark;

	skb_reserve(nskb, LL_MAX_HEADER);
	nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst));
	nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);

	nf_ct_attach(nskb, oldskb);
	nf_ct_set_closing(skb_nfct(oldskb));

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	/* If we use ip6_local_out for bridged traffic, the MAC source on
	 * the RST will be ours, instead of the destination's.  This confuses
	 * some routers/firewalls, and they drop the packet.  So we need to
	 * build the eth header using the original destination's MAC as the
	 * source, and send the RST packet directly.
	 */
	if (nf_bridge_info_exists(oldskb)) {
		struct ethhdr *oeth = eth_hdr(oldskb);
		struct ipv6hdr *ip6h = ipv6_hdr(nskb);
		struct net_device *br_indev;

		br_indev = nf_bridge_get_physindev(oldskb, net);
		if (!br_indev) {
			kfree_skb(nskb);
			return;
		}

		nskb->dev = br_indev;
		nskb->protocol = htons(ETH_P_IPV6);
		/* This path bypasses ip6_local_out(), so set payload_len here. */
		ip6h->payload_len = htons(sizeof(struct tcphdr));
		if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
				    oeth->h_source, oeth->h_dest, nskb->len) < 0) {
			kfree_skb(nskb);
			return;
		}
		dev_queue_xmit(nskb);
	} else
#endif
		ip6_local_out(net, sk, nskb);
}
EXPORT_SYMBOL_GPL(nf_send_reset6);

/*
 * Checksum gate used by nf_send_unreach6().
 *
 * Same decision logic as nf_reject_v6_csum_ok() but without trimming the skb
 * to the IPv6 payload length first.  Returns true when the packet may be
 * answered, false otherwise.
 */
static bool reject6_csum_ok(struct sk_buff *skb, int hook)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	int thoff;
	__be16 fo;
	u8 proto;

	if (skb_csum_unnecessary(skb))
		return true;

	proto = ip6h->nexthdr;
	thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo);

	if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
		return false;

	if (!nf_reject_verify_csum(skb, thoff, proto))
		return true;

	return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
}

/*
 * Send an ICMPv6 destination-unreachable message with @code in reply to
 * @skb_in.
 *
 * Nothing is sent when the checksum gate rejects the packet or, at the
 * PRE_ROUTING / INGRESS hooks, when no route back to the sender can be
 * attached.  At LOCAL_OUT a missing skb_in->dev is replaced by the
 * namespace's loopback device before calling icmpv6_send().
 */
void nf_send_unreach6(struct net *net, struct sk_buff *skb_in,
		      unsigned char code, unsigned int hooknum)
{
	if (!reject6_csum_ok(skb_in, hooknum))
		return;

	if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL)
		skb_in->dev = net->loopback_dev;

	if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) &&
	    nf_reject6_fill_skb_dst(skb_in) < 0)
		return;

	icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);
}
EXPORT_SYMBOL_GPL(nf_send_unreach6);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IPv6 packet rejection core");
20 20 20 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 /* * linux/fs/nls/nls_ascii.c * * Charset ascii translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 
0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ }; static const unsigned char *const page_uni2charset[256] = { page00, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 
0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "ascii", .uni2char = uni2char, .char2uni = char2uni, 
.charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_ascii(void) { return register_nls(&table); } static void __exit exit_nls_ascii(void) { unregister_nls(&table); } module_init(init_nls_ascii) module_exit(exit_nls_ascii) MODULE_DESCRIPTION("NLS ASCII (United States)"); MODULE_LICENSE("Dual BSD/GPL");
49 98 97 30 1 49 30 49 49 97 97 30 2 26 49 48 49 498 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/jhash.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/slab.h> #include <net/gen_stats.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_RATEEST.h> #include <net/netfilter/xt_rateest.h> #define RATEEST_HSIZE 16 struct xt_rateest_net { struct mutex hash_lock; struct hlist_head hash[RATEEST_HSIZE]; }; static unsigned int xt_rateest_id; static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } static void xt_rateest_hash_insert(struct xt_rateest_net *xn, struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); hlist_add_head(&est->list, &xn->hash[h]); } static struct xt_rateest *__xt_rateest_lookup(struct 
xt_rateest_net *xn, const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; } } return NULL; } struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, name); mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); void xt_rateest_put(struct net *net, struct xt_rateest *est) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); /* * gen_estimator est_timer() might access est->lock or bstats, * wait a RCU grace period before freeing 'est' */ kfree_rcu(est, rcu); } mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); u64_stats_add(&stats->bytes, skb->len); u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; } static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { struct nlattr opt; struct gnet_estimator est; } cfg; int ret; if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) return -ENAMETOOLONG; net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, info->name); if (est) { mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. 
*/ if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; return 0; } ret = -ENOMEM; est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) goto err1; gnet_stats_basic_sync_init(&est->bstats); strscpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; est->params.ewma_log = info->ewma_log; cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); cfg.opt.nla_type = TCA_STATS_RATE_EST; cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; info->est = est; xt_rateest_hash_insert(xn, est); mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: mutex_unlock(&xn->hash_lock); return ret; } static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg[] __read_mostly = { { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV4, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV6, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #endif }; static __net_init int xt_rateest_net_init(struct net *net) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); int i; mutex_init(&xn->hash_lock); for (i = 0; i < ARRAY_SIZE(xn->hash); i++) 
INIT_HLIST_HEAD(&xn->hash[i]); return 0; } static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; static int __init xt_rateest_tg_init(void) { int err = register_pernet_subsys(&xt_rateest_net_ops); if (err) return err; return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: packet rate estimator"); MODULE_ALIAS("ipt_RATEEST"); MODULE_ALIAS("ip6t_RATEEST"); module_init(xt_rateest_tg_init); module_exit(xt_rateest_tg_fini);
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 // SPDX-License-Identifier: GPL-2.0-only /* xfrm4_tunnel.c: Generic IP tunnel transformer. * * Copyright (C) 2003 David S. Miller (davem@redhat.com) */ #define pr_fmt(fmt) "IPsec: " fmt #include <linux/skbuff.h> #include <linux/module.h> #include <net/xfrm.h> #include <net/protocol.h> static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); return 0; } static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb) { return ip_hdr(skb)->protocol; } static int ipip_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->props.mode != XFRM_MODE_TUNNEL) { NL_SET_ERR_MSG(extack, "IPv4 tunnel can only be used with tunnel mode"); return -EINVAL; } if (x->encap) { NL_SET_ERR_MSG(extack, "IPv4 tunnel is not compatible with encapsulation"); return -EINVAL; } x->props.header_len = sizeof(struct iphdr); return 0; } static void ipip_destroy(struct xfrm_state *x) { } static const struct xfrm_type ipip_type = { .owner = THIS_MODULE, .proto = IPPROTO_IPIP, .init_state = ipip_init_state, .destructor = ipip_destroy, .input = ipip_xfrm_rcv, .output = ipip_output }; static int xfrm_tunnel_rcv(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) { return -ENOENT; } static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 4, }; #if IS_ENABLED(CONFIG_IPV6) static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, 
.priority = 3, }; #endif static int __init ipip_init(void) { if (xfrm_register_type(&ipip_type, AF_INET) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) { pr_info("%s: can't add xfrm handler for AF_INET\n", __func__); xfrm_unregister_type(&ipip_type, AF_INET); return -EAGAIN; } #if IS_ENABLED(CONFIG_IPV6) if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) { pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__); xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET); xfrm_unregister_type(&ipip_type, AF_INET); return -EAGAIN; } #endif return 0; } static void __exit ipip_fini(void) { #if IS_ENABLED(CONFIG_IPV6) if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6)) pr_info("%s: can't remove xfrm handler for AF_INET6\n", __func__); #endif if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) pr_info("%s: can't remove xfrm handler for AF_INET\n", __func__); xfrm_unregister_type(&ipip_type, AF_INET); } module_init(ipip_init); module_exit(ipip_fini); MODULE_DESCRIPTION("IPv4 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP);
51 1 1 66 66 51 37 37 51 1 1 1 51 1 1 62 15 51 51 1 51 51 51 51 37 9 37 9 21 1 1 1 3 1 2 2 2 2 1 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 
501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 
1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 
1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 // SPDX-License-Identifier: GPL-2.0-only /* * slip.c This module implements the SLIP protocol for kernel-based * devices like TTY. It interfaces between a raw TTY, and the * kernel's INET protocol layers. * * Version: @(#)slip.c 0.8.3 12/24/94 * * Authors: Laurence Culhane, <loz@holmes.demon.co.uk> * Fred N. van Kempen, <waltje@uwalt.nl.mugnet.org> * * Fixes: * Alan Cox : Sanity checks and avoid tx overruns. * Has a new sl->mtu field. * Alan Cox : Found cause of overrun. ifconfig sl0 * mtu upwards. Driver now spots this * and grows/shrinks its buffers(hack!). * Memory leak if you run out of memory * setting up a slip driver fixed. * Matt Dillon : Printable slip (borrowed from NET2E) * Pauline Middelink : Slip driver fixes. * Alan Cox : Honours the old SL_COMPRESSED flag * Alan Cox : KISS AX.25 and AXUI IP support * Michael Riepe : Automatic CSLIP recognition added * Charles Hedrick : CSLIP header length problem fix. * Alan Cox : Corrected non-IP cases of the above. * Alan Cox : Now uses hardware type as per FvK. * Alan Cox : Default to 192.168.0.0 (RFC 1597) * A.N.Kuznetsov : dev_tint() recursion fix. * Dmitry Gorodchanin : SLIP memory leaks * Dmitry Gorodchanin : Code cleanup. Reduce tty driver * buffering from 4096 to 256 bytes. * Improving SLIP response time. * CONFIG_SLIP_MODE_SLIP6. * ifconfig sl? up & down now works * correctly. * Modularization. * Alan Cox : Oops - fix AX.25 buffer lengths * Dmitry Gorodchanin : Even more cleanups. Preserve CSLIP * statistics. Include CSLIP code only * if it really needed. * Alan Cox : Free slhc buffers in the right place. * Alan Cox : Allow for digipeated IP over AX.25 * Matti Aarnio : Dynamic SLIP devices, with ideas taken * from Jim Freeman's <jfree@caldera.com> * dynamic PPP devices. 
We do NOT kfree() * device entries, just reg./unreg. them * as they are needed. We kfree() them * at module cleanup. * With MODULE-loading ``insmod'', user * can issue parameter: slip_maxdev=1024 * (Or how much he/she wants.. Default * is 256) * Stanislav Voronyi : Slip line checking, with ideas taken * from multislip BSDI driver which was * written by Igor Chechik, RELCOM Corp. * Only algorithms have been ported to * Linux SLIP driver. * Vitaly E. Lavrov : Sane behaviour on tty hangup. * Alexey Kuznetsov : Cleanup interfaces to tty & netdevice * modules. */ #define SL_CHECK_TRANSMIT #include <linux/compat.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/tty.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/if_slip.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include "slip.h" #ifdef CONFIG_INET #include <linux/ip.h> #include <linux/tcp.h> #include <net/slhc_vj.h> #endif #define SLIP_VERSION "0.8.4-NET3.019-NEWTTY" static struct net_device **slip_devs; static int slip_maxdev = SL_NRUNIT; module_param(slip_maxdev, int, 0); MODULE_PARM_DESC(slip_maxdev, "Maximum number of slip devices"); static int slip_esc(unsigned char *p, unsigned char *d, int len); static void slip_unesc(struct slip *sl, unsigned char c); #ifdef CONFIG_SLIP_MODE_SLIP6 static int slip_esc6(unsigned char *p, unsigned char *d, int len); static void slip_unesc6(struct slip *sl, unsigned char c); #endif #ifdef CONFIG_SLIP_SMART static void sl_keepalive(struct timer_list *t); static void sl_outfill(struct timer_list *t); static int sl_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user 
*data, int cmd); #endif /******************************** * Buffer administration routines: * sl_alloc_bufs() * sl_free_bufs() * sl_realloc_bufs() * * NOTE: sl_realloc_bufs != sl_free_bufs + sl_alloc_bufs, because * sl_realloc_bufs provides strong atomicity and reallocation * on actively running device. *********************************/ /* Allocate channel buffers. */ static int sl_alloc_bufs(struct slip *sl, int mtu) { int err = -ENOBUFS; unsigned long len; char *rbuff = NULL; char *xbuff = NULL; #ifdef SL_INCLUDE_CSLIP char *cbuff = NULL; struct slcompress *slcomp = NULL; #endif /* * Allocate the SLIP frame buffers: * * rbuff Receive buffer. * xbuff Transmit buffer. * cbuff Temporary compression buffer. */ len = mtu * 2; /* * allow for arrival of larger UDP packets, even if we say not to * also fixes a bug in which SunOS sends 512-byte packets even with * an MSS of 128 */ if (len < 576 * 2) len = 576 * 2; rbuff = kmalloc(len + 4, GFP_KERNEL); if (rbuff == NULL) goto err_exit; xbuff = kmalloc(len + 4, GFP_KERNEL); if (xbuff == NULL) goto err_exit; #ifdef SL_INCLUDE_CSLIP cbuff = kmalloc(len + 4, GFP_KERNEL); if (cbuff == NULL) goto err_exit; slcomp = slhc_init(16, 16); if (IS_ERR(slcomp)) goto err_exit; #endif spin_lock_bh(&sl->lock); if (sl->tty == NULL) { spin_unlock_bh(&sl->lock); err = -ENODEV; goto err_exit; } sl->mtu = mtu; sl->buffsize = len; sl->rcount = 0; sl->xleft = 0; rbuff = xchg(&sl->rbuff, rbuff); xbuff = xchg(&sl->xbuff, xbuff); #ifdef SL_INCLUDE_CSLIP cbuff = xchg(&sl->cbuff, cbuff); slcomp = xchg(&sl->slcomp, slcomp); #endif #ifdef CONFIG_SLIP_MODE_SLIP6 sl->xdata = 0; sl->xbits = 0; #endif spin_unlock_bh(&sl->lock); err = 0; /* Cleanup */ err_exit: #ifdef SL_INCLUDE_CSLIP kfree(cbuff); slhc_free(slcomp); #endif kfree(xbuff); kfree(rbuff); return err; } /* Free a SLIP channel buffers. */ static void sl_free_bufs(struct slip *sl) { /* Free all SLIP frame buffers. 
*/ kfree(xchg(&sl->rbuff, NULL)); kfree(xchg(&sl->xbuff, NULL)); #ifdef SL_INCLUDE_CSLIP kfree(xchg(&sl->cbuff, NULL)); slhc_free(xchg(&sl->slcomp, NULL)); #endif } /* Reallocate slip channel buffers. */ static int sl_realloc_bufs(struct slip *sl, int mtu) { int err = 0; struct net_device *dev = sl->dev; unsigned char *xbuff, *rbuff; #ifdef SL_INCLUDE_CSLIP unsigned char *cbuff; #endif int len = mtu * 2; /* * allow for arrival of larger UDP packets, even if we say not to * also fixes a bug in which SunOS sends 512-byte packets even with * an MSS of 128 */ if (len < 576 * 2) len = 576 * 2; xbuff = kmalloc(len + 4, GFP_ATOMIC); rbuff = kmalloc(len + 4, GFP_ATOMIC); #ifdef SL_INCLUDE_CSLIP cbuff = kmalloc(len + 4, GFP_ATOMIC); #endif #ifdef SL_INCLUDE_CSLIP if (xbuff == NULL || rbuff == NULL || cbuff == NULL) { #else if (xbuff == NULL || rbuff == NULL) { #endif if (mtu > sl->mtu) { printk(KERN_WARNING "%s: unable to grow slip buffers, MTU change cancelled.\n", dev->name); err = -ENOBUFS; } goto done; } spin_lock_bh(&sl->lock); err = -ENODEV; if (sl->tty == NULL) goto done_on_bh; xbuff = xchg(&sl->xbuff, xbuff); rbuff = xchg(&sl->rbuff, rbuff); #ifdef SL_INCLUDE_CSLIP cbuff = xchg(&sl->cbuff, cbuff); #endif if (sl->xleft) { if (sl->xleft <= len) { memcpy(sl->xbuff, sl->xhead, sl->xleft); } else { sl->xleft = 0; dev->stats.tx_dropped++; } } sl->xhead = sl->xbuff; if (sl->rcount) { if (sl->rcount <= len) { memcpy(sl->rbuff, rbuff, sl->rcount); } else { sl->rcount = 0; dev->stats.rx_over_errors++; set_bit(SLF_ERROR, &sl->flags); } } sl->mtu = mtu; WRITE_ONCE(dev->mtu, mtu); sl->buffsize = len; err = 0; done_on_bh: spin_unlock_bh(&sl->lock); done: kfree(xbuff); kfree(rbuff); #ifdef SL_INCLUDE_CSLIP kfree(cbuff); #endif return err; } /* Set the "sending" flag. This must be atomic hence the set_bit. */ static inline void sl_lock(struct slip *sl) { netif_stop_queue(sl->dev); } /* Clear the "sending" flag. This must be atomic, hence the ASM. 
*/ static inline void sl_unlock(struct slip *sl) { netif_wake_queue(sl->dev); } /* Send one completely decapsulated IP datagram to the IP layer. */ static void sl_bump(struct slip *sl) { struct net_device *dev = sl->dev; struct sk_buff *skb; int count; count = sl->rcount; #ifdef SL_INCLUDE_CSLIP if (sl->mode & (SL_MODE_ADAPTIVE | SL_MODE_CSLIP)) { unsigned char c = sl->rbuff[0]; if (c & SL_TYPE_COMPRESSED_TCP) { /* ignore compressed packets when CSLIP is off */ if (!(sl->mode & SL_MODE_CSLIP)) { printk(KERN_WARNING "%s: compressed packet ignored\n", dev->name); return; } /* make sure we've reserved enough space for uncompress to use */ if (count + 80 > sl->buffsize) { dev->stats.rx_over_errors++; return; } count = slhc_uncompress(sl->slcomp, sl->rbuff, count); if (count <= 0) return; } else if (c >= SL_TYPE_UNCOMPRESSED_TCP) { if (!(sl->mode & SL_MODE_CSLIP)) { /* turn on header compression */ sl->mode |= SL_MODE_CSLIP; sl->mode &= ~SL_MODE_ADAPTIVE; printk(KERN_INFO "%s: header compression turned on\n", dev->name); } sl->rbuff[0] &= 0x4f; if (slhc_remember(sl->slcomp, sl->rbuff, count) <= 0) return; } } #endif /* SL_INCLUDE_CSLIP */ dev->stats.rx_bytes += count; skb = dev_alloc_skb(count); if (skb == NULL) { printk(KERN_WARNING "%s: memory squeeze, dropping packet.\n", dev->name); dev->stats.rx_dropped++; return; } skb->dev = dev; skb_put_data(skb, sl->rbuff, count); skb_reset_mac_header(skb); skb->protocol = htons(ETH_P_IP); netif_rx(skb); dev->stats.rx_packets++; } /* Encapsulate one IP datagram and stuff into a TTY queue. */ static void sl_encaps(struct slip *sl, unsigned char *icp, int len) { unsigned char *p; int actual, count; if (len > sl->mtu) { /* Sigh, shouldn't occur BUT ... 
*/ printk(KERN_WARNING "%s: truncating oversized transmit packet!\n", sl->dev->name); sl->dev->stats.tx_dropped++; sl_unlock(sl); return; } p = icp; #ifdef SL_INCLUDE_CSLIP if (sl->mode & SL_MODE_CSLIP) len = slhc_compress(sl->slcomp, p, len, sl->cbuff, &p, 1); #endif #ifdef CONFIG_SLIP_MODE_SLIP6 if (sl->mode & SL_MODE_SLIP6) count = slip_esc6(p, sl->xbuff, len); else #endif count = slip_esc(p, sl->xbuff, len); /* Order of next two lines is *very* important. * When we are sending a little amount of data, * the transfer may be completed inside the ops->write() * routine, because it's running with interrupts enabled. * In this case we *never* got WRITE_WAKEUP event, * if we did not request it before write operation. * 14 Oct 1994 Dmitry Gorodchanin. */ set_bit(TTY_DO_WRITE_WAKEUP, &sl->tty->flags); actual = sl->tty->ops->write(sl->tty, sl->xbuff, count); #ifdef SL_CHECK_TRANSMIT netif_trans_update(sl->dev); #endif sl->xleft = count - actual; sl->xhead = sl->xbuff + actual; #ifdef CONFIG_SLIP_SMART /* VSV */ clear_bit(SLF_OUTWAIT, &sl->flags); /* reset outfill flag */ #endif } /* Write out any remaining transmit buffer. Scheduled when tty is writable */ static void slip_transmit(struct work_struct *work) { struct slip *sl = container_of(work, struct slip, tx_work); int actual; spin_lock_bh(&sl->lock); /* First make sure we're connected. */ if (!sl->tty || sl->magic != SLIP_MAGIC || !netif_running(sl->dev)) { spin_unlock_bh(&sl->lock); return; } if (sl->xleft <= 0) { /* Now serial buffer is almost free & we can start * transmission of another packet */ sl->dev->stats.tx_packets++; clear_bit(TTY_DO_WRITE_WAKEUP, &sl->tty->flags); spin_unlock_bh(&sl->lock); sl_unlock(sl); return; } actual = sl->tty->ops->write(sl->tty, sl->xhead, sl->xleft); sl->xleft -= actual; sl->xhead += actual; spin_unlock_bh(&sl->lock); } /* * Called by the driver when there's room for more data. * Schedule the transmit. 
*/ static void slip_write_wakeup(struct tty_struct *tty) { struct slip *sl; rcu_read_lock(); sl = rcu_dereference(tty->disc_data); if (sl) schedule_work(&sl->tx_work); rcu_read_unlock(); } static void sl_tx_timeout(struct net_device *dev, unsigned int txqueue) { struct slip *sl = netdev_priv(dev); spin_lock(&sl->lock); if (netif_queue_stopped(dev)) { if (!netif_running(dev) || !sl->tty) goto out; /* May be we must check transmitter timeout here ? * 14 Oct 1994 Dmitry Gorodchanin. */ #ifdef SL_CHECK_TRANSMIT if (time_before(jiffies, dev_trans_start(dev) + 20 * HZ)) { /* 20 sec timeout not reached */ goto out; } printk(KERN_WARNING "%s: transmit timed out, %s?\n", dev->name, (tty_chars_in_buffer(sl->tty) || sl->xleft) ? "bad line quality" : "driver error"); sl->xleft = 0; clear_bit(TTY_DO_WRITE_WAKEUP, &sl->tty->flags); sl_unlock(sl); #endif } out: spin_unlock(&sl->lock); } /* Encapsulate an IP datagram and kick it into a TTY queue. */ static netdev_tx_t sl_xmit(struct sk_buff *skb, struct net_device *dev) { struct slip *sl = netdev_priv(dev); spin_lock(&sl->lock); if (!netif_running(dev)) { spin_unlock(&sl->lock); printk(KERN_WARNING "%s: xmit call when iface is down\n", dev->name); dev_kfree_skb(skb); return NETDEV_TX_OK; } if (sl->tty == NULL) { spin_unlock(&sl->lock); dev_kfree_skb(skb); return NETDEV_TX_OK; } sl_lock(sl); dev->stats.tx_bytes += skb->len; sl_encaps(sl, skb->data, skb->len); spin_unlock(&sl->lock); dev_kfree_skb(skb); return NETDEV_TX_OK; } /****************************************** * Routines looking at netdevice side. ******************************************/ /* Netdevice UP -> DOWN routine */ static int sl_close(struct net_device *dev) { struct slip *sl = netdev_priv(dev); spin_lock_bh(&sl->lock); if (sl->tty) /* TTY discipline is running. 
*/ clear_bit(TTY_DO_WRITE_WAKEUP, &sl->tty->flags); netif_stop_queue(dev); sl->rcount = 0; sl->xleft = 0; spin_unlock_bh(&sl->lock); return 0; } /* Netdevice DOWN -> UP routine */ static int sl_open(struct net_device *dev) { struct slip *sl = netdev_priv(dev); if (sl->tty == NULL) return -ENODEV; sl->flags &= (1 << SLF_INUSE); netif_start_queue(dev); return 0; } /* Netdevice change MTU request */ static int sl_change_mtu(struct net_device *dev, int new_mtu) { struct slip *sl = netdev_priv(dev); return sl_realloc_bufs(sl, new_mtu); } /* Netdevice get statistics request */ static void sl_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct net_device_stats *devstats = &dev->stats; #ifdef SL_INCLUDE_CSLIP struct slip *sl = netdev_priv(dev); struct slcompress *comp = sl->slcomp; #endif stats->rx_packets = devstats->rx_packets; stats->tx_packets = devstats->tx_packets; stats->rx_bytes = devstats->rx_bytes; stats->tx_bytes = devstats->tx_bytes; stats->rx_dropped = devstats->rx_dropped; stats->tx_dropped = devstats->tx_dropped; stats->tx_errors = devstats->tx_errors; stats->rx_errors = devstats->rx_errors; stats->rx_over_errors = devstats->rx_over_errors; #ifdef SL_INCLUDE_CSLIP if (comp) { /* Generic compressed statistics */ stats->rx_compressed = comp->sls_i_compressed; stats->tx_compressed = comp->sls_o_compressed; /* Are we really still needs this? */ stats->rx_fifo_errors += comp->sls_i_compressed; stats->rx_dropped += comp->sls_i_tossed; stats->tx_fifo_errors += comp->sls_o_compressed; stats->collisions += comp->sls_o_misses; } #endif } /* Netdevice register callback */ static int sl_init(struct net_device *dev) { struct slip *sl = netdev_priv(dev); /* * Finish setting up the DEVICE info. 
*/ dev->mtu = sl->mtu; dev->type = ARPHRD_SLIP + sl->mode; #ifdef SL_CHECK_TRANSMIT dev->watchdog_timeo = 20*HZ; #endif return 0; } static void sl_uninit(struct net_device *dev) { struct slip *sl = netdev_priv(dev); sl_free_bufs(sl); } /* Hook the destructor so we can free slip devices at the right point in time */ static void sl_free_netdev(struct net_device *dev) { int i = dev->base_addr; slip_devs[i] = NULL; } static const struct net_device_ops sl_netdev_ops = { .ndo_init = sl_init, .ndo_uninit = sl_uninit, .ndo_open = sl_open, .ndo_stop = sl_close, .ndo_start_xmit = sl_xmit, .ndo_get_stats64 = sl_get_stats64, .ndo_change_mtu = sl_change_mtu, .ndo_tx_timeout = sl_tx_timeout, #ifdef CONFIG_SLIP_SMART .ndo_siocdevprivate = sl_siocdevprivate, #endif }; static void sl_setup(struct net_device *dev) { dev->netdev_ops = &sl_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = sl_free_netdev; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 10; /* MTU range: 68 - 65534 */ dev->min_mtu = 68; dev->max_mtu = 65534; /* New-style flags. */ dev->flags = IFF_NOARP|IFF_POINTOPOINT|IFF_MULTICAST; } /****************************************** Routines looking at TTY side. ******************************************/ /* * Handle the 'receiver data ready' interrupt. * This function is called by the 'tty_io' module in the kernel when * a block of SLIP data has been received, which can now be decapsulated * and sent on to some IP layer for further processing. 
This will not * be re-entered while running but other ldisc functions may be called * in parallel */ static void slip_receive_buf(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count) { struct slip *sl = tty->disc_data; if (!sl || sl->magic != SLIP_MAGIC || !netif_running(sl->dev)) return; /* Read the characters out of the buffer */ while (count--) { if (fp && *fp++) { if (!test_and_set_bit(SLF_ERROR, &sl->flags)) sl->dev->stats.rx_errors++; cp++; continue; } #ifdef CONFIG_SLIP_MODE_SLIP6 if (sl->mode & SL_MODE_SLIP6) slip_unesc6(sl, *cp++); else #endif slip_unesc(sl, *cp++); } } /************************************ * slip_open helper routines. ************************************/ /* Collect hanged up channels */ static void sl_sync(void) { int i; struct net_device *dev; struct slip *sl; for (i = 0; i < slip_maxdev; i++) { dev = slip_devs[i]; if (dev == NULL) break; sl = netdev_priv(dev); if (sl->tty || sl->leased) continue; if (dev->flags & IFF_UP) dev_close(dev); } } /* Find a free SLIP channel, and link in this `tty' line. */ static struct slip *sl_alloc(void) { int i; char name[IFNAMSIZ]; struct net_device *dev = NULL; struct slip *sl; for (i = 0; i < slip_maxdev; i++) { dev = slip_devs[i]; if (dev == NULL) break; } /* Sorry, too many, all slots in use */ if (i >= slip_maxdev) return NULL; sprintf(name, "sl%d", i); dev = alloc_netdev(sizeof(*sl), name, NET_NAME_UNKNOWN, sl_setup); if (!dev) return NULL; dev->base_addr = i; sl = netdev_priv(dev); /* Initialize channel control data */ sl->magic = SLIP_MAGIC; sl->dev = dev; spin_lock_init(&sl->lock); INIT_WORK(&sl->tx_work, slip_transmit); sl->mode = SL_MODE_DEFAULT; #ifdef CONFIG_SLIP_SMART /* initialize timer_list struct */ timer_setup(&sl->keepalive_timer, sl_keepalive, 0); timer_setup(&sl->outfill_timer, sl_outfill, 0); #endif slip_devs[i] = dev; return sl; } /* * Open the high-level part of the SLIP channel. 
* This function is called by the TTY module when the * SLIP line discipline is called for. Because we are * sure the tty line exists, we only have to link it to * a free SLIP channel... * * Called in process context serialized from other ldisc calls. */ static int slip_open(struct tty_struct *tty) { struct slip *sl; int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (tty->ops->write == NULL) return -EOPNOTSUPP; /* RTnetlink lock is misused here to serialize concurrent opens of slip channels. There are better ways, but it is the simplest one. */ rtnl_lock(); /* Collect hanged up channels. */ sl_sync(); sl = tty->disc_data; err = -EEXIST; /* First make sure we're not already connected. */ if (sl && sl->magic == SLIP_MAGIC) goto err_exit; /* OK. Find a free SLIP channel to use. */ err = -ENFILE; sl = sl_alloc(); if (sl == NULL) goto err_exit; sl->tty = tty; tty->disc_data = sl; sl->pid = current->pid; if (!test_bit(SLF_INUSE, &sl->flags)) { /* Perform the low-level SLIP initialization. */ err = sl_alloc_bufs(sl, SL_MTU); if (err) goto err_free_chan; set_bit(SLF_INUSE, &sl->flags); err = register_netdevice(sl->dev); if (err) goto err_free_bufs; } #ifdef CONFIG_SLIP_SMART if (sl->keepalive) { sl->keepalive_timer.expires = jiffies + sl->keepalive * HZ; add_timer(&sl->keepalive_timer); } if (sl->outfill) { sl->outfill_timer.expires = jiffies + sl->outfill * HZ; add_timer(&sl->outfill_timer); } #endif /* Done. We have linked the TTY line to a channel. */ rtnl_unlock(); tty->receive_room = 65536; /* We don't flow control */ /* TTY layer expects 0 on success */ return 0; err_free_bufs: sl_free_bufs(sl); err_free_chan: sl->tty = NULL; tty->disc_data = NULL; clear_bit(SLF_INUSE, &sl->flags); sl_free_netdev(sl->dev); /* do not call free_netdev before rtnl_unlock */ rtnl_unlock(); free_netdev(sl->dev); return err; err_exit: rtnl_unlock(); /* Count references from TTY module */ return err; } /* * Close down a SLIP channel. 
* This means flushing out any pending queues, and then returning. This * call is serialized against other ldisc functions. * * We also use this method fo a hangup event */ static void slip_close(struct tty_struct *tty) { struct slip *sl = tty->disc_data; /* First make sure we're connected. */ if (!sl || sl->magic != SLIP_MAGIC || sl->tty != tty) return; spin_lock_bh(&sl->lock); rcu_assign_pointer(tty->disc_data, NULL); sl->tty = NULL; spin_unlock_bh(&sl->lock); synchronize_rcu(); flush_work(&sl->tx_work); /* VSV = very important to remove timers */ #ifdef CONFIG_SLIP_SMART timer_delete_sync(&sl->keepalive_timer); timer_delete_sync(&sl->outfill_timer); #endif /* Flush network side */ unregister_netdev(sl->dev); /* This will complete via sl_free_netdev */ } static void slip_hangup(struct tty_struct *tty) { slip_close(tty); } /************************************************************************ * STANDARD SLIP ENCAPSULATION * ************************************************************************/ static int slip_esc(unsigned char *s, unsigned char *d, int len) { unsigned char *ptr = d; unsigned char c; /* * Send an initial END character to flush out any * data that may have accumulated in the receiver * due to line noise. */ *ptr++ = END; /* * For each byte in the packet, send the appropriate * character sequence, according to the SLIP protocol. 
*/ while (len-- > 0) { switch (c = *s++) { case END: *ptr++ = ESC; *ptr++ = ESC_END; break; case ESC: *ptr++ = ESC; *ptr++ = ESC_ESC; break; default: *ptr++ = c; break; } } *ptr++ = END; return ptr - d; } static void slip_unesc(struct slip *sl, unsigned char s) { switch (s) { case END: #ifdef CONFIG_SLIP_SMART /* drop keeptest bit = VSV */ if (test_bit(SLF_KEEPTEST, &sl->flags)) clear_bit(SLF_KEEPTEST, &sl->flags); #endif if (!test_and_clear_bit(SLF_ERROR, &sl->flags) && (sl->rcount > 2)) sl_bump(sl); clear_bit(SLF_ESCAPE, &sl->flags); sl->rcount = 0; return; case ESC: set_bit(SLF_ESCAPE, &sl->flags); return; case ESC_ESC: if (test_and_clear_bit(SLF_ESCAPE, &sl->flags)) s = ESC; break; case ESC_END: if (test_and_clear_bit(SLF_ESCAPE, &sl->flags)) s = END; break; } if (!test_bit(SLF_ERROR, &sl->flags)) { if (sl->rcount < sl->buffsize) { sl->rbuff[sl->rcount++] = s; return; } sl->dev->stats.rx_over_errors++; set_bit(SLF_ERROR, &sl->flags); } } #ifdef CONFIG_SLIP_MODE_SLIP6 /************************************************************************ * 6 BIT SLIP ENCAPSULATION * ************************************************************************/ static int slip_esc6(unsigned char *s, unsigned char *d, int len) { unsigned char *ptr = d; unsigned char c; int i; unsigned short v = 0; short bits = 0; /* * Send an initial END character to flush out any * data that may have accumulated in the receiver * due to line noise. 
*/ *ptr++ = 0x70; /* * Encode the packet into printable ascii characters */ for (i = 0; i < len; ++i) { v = (v << 8) | s[i]; bits += 8; while (bits >= 6) { bits -= 6; c = 0x30 + ((v >> bits) & 0x3F); *ptr++ = c; } } if (bits) { c = 0x30 + ((v << (6 - bits)) & 0x3F); *ptr++ = c; } *ptr++ = 0x70; return ptr - d; } static void slip_unesc6(struct slip *sl, unsigned char s) { unsigned char c; if (s == 0x70) { #ifdef CONFIG_SLIP_SMART /* drop keeptest bit = VSV */ if (test_bit(SLF_KEEPTEST, &sl->flags)) clear_bit(SLF_KEEPTEST, &sl->flags); #endif if (!test_and_clear_bit(SLF_ERROR, &sl->flags) && (sl->rcount > 2)) sl_bump(sl); sl->rcount = 0; sl->xbits = 0; sl->xdata = 0; } else if (s >= 0x30 && s < 0x70) { sl->xdata = (sl->xdata << 6) | ((s - 0x30) & 0x3F); sl->xbits += 6; if (sl->xbits >= 8) { sl->xbits -= 8; c = (unsigned char)(sl->xdata >> sl->xbits); if (!test_bit(SLF_ERROR, &sl->flags)) { if (sl->rcount < sl->buffsize) { sl->rbuff[sl->rcount++] = c; return; } sl->dev->stats.rx_over_errors++; set_bit(SLF_ERROR, &sl->flags); } } } } #endif /* CONFIG_SLIP_MODE_SLIP6 */ /* Perform I/O control on an active SLIP channel. */ static int slip_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct slip *sl = tty->disc_data; unsigned int tmp; int __user *p = (int __user *)arg; /* First make sure we're connected. 
*/ if (!sl || sl->magic != SLIP_MAGIC) return -EINVAL; switch (cmd) { case SIOCGIFNAME: tmp = strlen(sl->dev->name) + 1; if (copy_to_user((void __user *)arg, sl->dev->name, tmp)) return -EFAULT; return 0; case SIOCGIFENCAP: if (put_user(sl->mode, p)) return -EFAULT; return 0; case SIOCSIFENCAP: if (get_user(tmp, p)) return -EFAULT; #ifndef SL_INCLUDE_CSLIP if (tmp & (SL_MODE_CSLIP|SL_MODE_ADAPTIVE)) return -EINVAL; #else if ((tmp & (SL_MODE_ADAPTIVE | SL_MODE_CSLIP)) == (SL_MODE_ADAPTIVE | SL_MODE_CSLIP)) /* return -EINVAL; */ tmp &= ~SL_MODE_ADAPTIVE; #endif #ifndef CONFIG_SLIP_MODE_SLIP6 if (tmp & SL_MODE_SLIP6) return -EINVAL; #endif sl->mode = tmp; sl->dev->type = ARPHRD_SLIP + sl->mode; return 0; case SIOCSIFHWADDR: return -EINVAL; #ifdef CONFIG_SLIP_SMART /* VSV changes start here */ case SIOCSKEEPALIVE: if (get_user(tmp, p)) return -EFAULT; if (tmp > 255) /* max for unchar */ return -EINVAL; spin_lock_bh(&sl->lock); if (!sl->tty) { spin_unlock_bh(&sl->lock); return -ENODEV; } sl->keepalive = (u8)tmp; if (sl->keepalive != 0) { mod_timer(&sl->keepalive_timer, jiffies + sl->keepalive * HZ); set_bit(SLF_KEEPTEST, &sl->flags); } else timer_delete(&sl->keepalive_timer); spin_unlock_bh(&sl->lock); return 0; case SIOCGKEEPALIVE: if (put_user(sl->keepalive, p)) return -EFAULT; return 0; case SIOCSOUTFILL: if (get_user(tmp, p)) return -EFAULT; if (tmp > 255) /* max for unchar */ return -EINVAL; spin_lock_bh(&sl->lock); if (!sl->tty) { spin_unlock_bh(&sl->lock); return -ENODEV; } sl->outfill = (u8)tmp; if (sl->outfill != 0) { mod_timer(&sl->outfill_timer, jiffies + sl->outfill * HZ); set_bit(SLF_OUTWAIT, &sl->flags); } else timer_delete(&sl->outfill_timer); spin_unlock_bh(&sl->lock); return 0; case SIOCGOUTFILL: if (put_user(sl->outfill, p)) return -EFAULT; return 0; /* VSV changes end */ #endif default: return tty_mode_ioctl(tty, cmd, arg); } } /* VSV changes start here */ #ifdef CONFIG_SLIP_SMART /* function sl_siocdevprivate called from net/core/dev.c to allow 
get/set outfill/keepalive parameter by ifconfig */ static int sl_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd) { struct slip *sl = netdev_priv(dev); unsigned long *p = (unsigned long *)&rq->ifr_ifru; if (sl == NULL) /* Allocation failed ?? */ return -ENODEV; if (in_compat_syscall()) return -EOPNOTSUPP; spin_lock_bh(&sl->lock); if (!sl->tty) { spin_unlock_bh(&sl->lock); return -ENODEV; } switch (cmd) { case SIOCSKEEPALIVE: /* max for unchar */ if ((unsigned)*p > 255) { spin_unlock_bh(&sl->lock); return -EINVAL; } sl->keepalive = (u8)*p; if (sl->keepalive != 0) { sl->keepalive_timer.expires = jiffies + sl->keepalive * HZ; mod_timer(&sl->keepalive_timer, jiffies + sl->keepalive * HZ); set_bit(SLF_KEEPTEST, &sl->flags); } else timer_delete(&sl->keepalive_timer); break; case SIOCGKEEPALIVE: *p = sl->keepalive; break; case SIOCSOUTFILL: if ((unsigned)*p > 255) { /* max for unchar */ spin_unlock_bh(&sl->lock); return -EINVAL; } sl->outfill = (u8)*p; if (sl->outfill != 0) { mod_timer(&sl->outfill_timer, jiffies + sl->outfill * HZ); set_bit(SLF_OUTWAIT, &sl->flags); } else timer_delete(&sl->outfill_timer); break; case SIOCGOUTFILL: *p = sl->outfill; break; case SIOCSLEASE: /* Resolve race condition, when ioctl'ing hanged up and opened by another process device. 
*/ if (sl->tty != current->signal->tty && sl->pid != current->pid) { spin_unlock_bh(&sl->lock); return -EPERM; } sl->leased = 0; if (*p) sl->leased = 1; break; case SIOCGLEASE: *p = sl->leased; } spin_unlock_bh(&sl->lock); return 0; } #endif /* VSV changes end */ static struct tty_ldisc_ops sl_ldisc = { .owner = THIS_MODULE, .num = N_SLIP, .name = "slip", .open = slip_open, .close = slip_close, .hangup = slip_hangup, .ioctl = slip_ioctl, .receive_buf = slip_receive_buf, .write_wakeup = slip_write_wakeup, }; static int __init slip_init(void) { int status; if (slip_maxdev < 4) slip_maxdev = 4; /* Sanity */ printk(KERN_INFO "SLIP: version %s (dynamic channels, max=%d)" #ifdef CONFIG_SLIP_MODE_SLIP6 " (6 bit encapsulation enabled)" #endif ".\n", SLIP_VERSION, slip_maxdev); #if defined(SL_INCLUDE_CSLIP) printk(KERN_INFO "CSLIP: code copyright 1989 Regents of the University of California.\n"); #endif #ifdef CONFIG_SLIP_SMART printk(KERN_INFO "SLIP linefill/keepalive option.\n"); #endif slip_devs = kcalloc(slip_maxdev, sizeof(struct net_device *), GFP_KERNEL); if (!slip_devs) return -ENOMEM; /* Fill in our line protocol discipline, and register it */ status = tty_register_ldisc(&sl_ldisc); if (status != 0) { printk(KERN_ERR "SLIP: can't register line discipline (err = %d)\n", status); kfree(slip_devs); } return status; } static void __exit slip_exit(void) { int i; struct net_device *dev; struct slip *sl; unsigned long timeout = jiffies + HZ; int busy = 0; if (slip_devs == NULL) return; /* First of all: check for active disciplines and hangup them. 
*/ do { if (busy) msleep_interruptible(100); busy = 0; for (i = 0; i < slip_maxdev; i++) { dev = slip_devs[i]; if (!dev) continue; sl = netdev_priv(dev); spin_lock_bh(&sl->lock); if (sl->tty) { busy++; tty_hangup(sl->tty); } spin_unlock_bh(&sl->lock); } } while (busy && time_before(jiffies, timeout)); /* FIXME: hangup is async so we should wait when doing this second phase */ for (i = 0; i < slip_maxdev; i++) { dev = slip_devs[i]; if (!dev) continue; slip_devs[i] = NULL; sl = netdev_priv(dev); if (sl->tty) { printk(KERN_ERR "%s: tty discipline still running\n", dev->name); } unregister_netdev(dev); } kfree(slip_devs); slip_devs = NULL; tty_unregister_ldisc(&sl_ldisc); } module_init(slip_init); module_exit(slip_exit); #ifdef CONFIG_SLIP_SMART /* * This is start of the code for multislip style line checking * added by Stanislav Voronyi. All changes before marked VSV */ static void sl_outfill(struct timer_list *t) { struct slip *sl = timer_container_of(sl, t, outfill_timer); spin_lock(&sl->lock); if (sl->tty == NULL) goto out; if (sl->outfill) { if (test_bit(SLF_OUTWAIT, &sl->flags)) { /* no packets were transmitted, do outfill */ #ifdef CONFIG_SLIP_MODE_SLIP6 unsigned char s = (sl->mode & SL_MODE_SLIP6)?0x70:END; #else unsigned char s = END; #endif /* put END into tty queue. Is it right ??? 
*/ if (!netif_queue_stopped(sl->dev)) { /* if device busy no outfill */ sl->tty->ops->write(sl->tty, &s, 1); } } else set_bit(SLF_OUTWAIT, &sl->flags); mod_timer(&sl->outfill_timer, jiffies+sl->outfill*HZ); } out: spin_unlock(&sl->lock); } static void sl_keepalive(struct timer_list *t) { struct slip *sl = timer_container_of(sl, t, keepalive_timer); spin_lock(&sl->lock); if (sl->tty == NULL) goto out; if (sl->keepalive) { if (test_bit(SLF_KEEPTEST, &sl->flags)) { /* keepalive still high :(, we must hangup */ if (sl->outfill) /* outfill timer must be deleted too */ (void) timer_delete(&sl->outfill_timer); printk(KERN_DEBUG "%s: no packets received during keepalive timeout, hangup.\n", sl->dev->name); /* this must hangup tty & close slip */ tty_hangup(sl->tty); /* I think we need not something else */ goto out; } else set_bit(SLF_KEEPTEST, &sl->flags); mod_timer(&sl->keepalive_timer, jiffies+sl->keepalive*HZ); } out: spin_unlock(&sl->lock); } #endif MODULE_DESCRIPTION("SLIP (serial line) protocol module"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_SLIP);
41 41 41 41 41 41 41 41 41 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2014 Nicira, Inc. 
*/

#include "flow.h"
#include "datapath.h"
#include "flow_netlink.h"
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/llc_pdu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/llc.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <linux/sort.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>

/* Bucket count used for freshly created (or flushed) table instances. */
#define TBL_MIN_BUCKETS 1024
/* Lower bound for the mask array capacity; also the step by which the
 * array grows when it is full.
 */
#define MASK_ARRAY_SIZE_MIN 16
/* Interval, in jiffies (10 minutes), after which the flow table is rehashed
 * on insert.
 */
#define REHASH_INTERVAL (10 * 60 * HZ)

/* Default number of per-CPU mask cache entries. */
#define MC_DEFAULT_HASH_ENTRIES 256
/* Number of hash bits consumed per mask cache segment. */
#define MC_HASH_SHIFT 8
/* Number of MC_HASH_SHIFT-bit segments in a 32-bit hash. */
#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)

/* Slab caches for flows and for per-CPU flow statistics; created in
 * ovs_flow_init() and destroyed in ovs_flow_exit().
 */
static struct kmem_cache *flow_cache;
struct kmem_cache *flow_stats_cache __read_mostly;

/* Returns the number of key bytes covered by 'range' (end - start). */
static u16 range_n_bytes(const struct sw_flow_key_range *range)
{
	return range->end - range->start;
}

/* Computes 'dst' = 'src' & 'mask->key', one long at a time.
 *
 * When 'full' is true the whole key is processed; otherwise only the bytes
 * inside 'mask->range' are written.
 */
void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
		       bool full, const struct sw_flow_mask *mask)
{
	int start = full ? 0 : mask->range.start;
	int len = full ? sizeof *dst : range_n_bytes(&mask->range);
	const long *m = (const long *)((const u8 *)&mask->key + start);
	const long *s = (const long *)((const u8 *)src + start);
	long *d = (long *)((u8 *)dst + start);
	int i;

	/* If 'full' is true then all of 'dst' is fully initialized. Otherwise,
	 * if 'full' is false the memory outside of the 'mask->range' is left
	 * uninitialized. This can be used as an optimization when further
	 * operations on 'dst' only use contents within 'mask->range'.
	 */
	for (i = 0; i < len; i += sizeof(long))
		*d++ = *s++ & *m++;
}

/* Allocates a zeroed flow together with its default stats node.
 *
 * Returns the new flow, or ERR_PTR(-ENOMEM) if either allocation fails.
 */
struct sw_flow *ovs_flow_alloc(void)
{
	struct sw_flow *flow;
	struct sw_flow_stats *stats;

	flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
	if (!flow)
		return ERR_PTR(-ENOMEM);

	flow->stats_last_writer = -1;
	/* The CPU mask lives directly behind the nr_cpu_ids stats pointers. */
	flow->cpu_used_mask = (struct cpumask *)&flow->stats[nr_cpu_ids];

	/* Initialize the default stat node. */
	stats = kmem_cache_alloc_node(flow_stats_cache,
				      GFP_KERNEL | __GFP_ZERO,
				      node_online(0) ? 0 : NUMA_NO_NODE);
	if (!stats)
		goto err;

	spin_lock_init(&stats->lock);

	RCU_INIT_POINTER(flow->stats[0], stats);

	cpumask_set_cpu(0, flow->cpu_used_mask);

	return flow;
err:
	kmem_cache_free(flow_cache, flow);
	return ERR_PTR(-ENOMEM);
}

/* Returns the number of flows currently stored in 'table'. */
int ovs_flow_tbl_count(const struct flow_table *table)
{
	return table->count;
}

/* Immediately releases 'flow': its unmasked key (when the flow is identified
 * by key), its actions, every allocated stats node and the flow itself.
 */
static void flow_free(struct sw_flow *flow)
{
	int cpu;

	if (ovs_identifier_is_key(&flow->id))
		kfree(flow->id.unmasked_key);
	if (flow->sf_acts)
		ovs_nla_free_flow_actions((struct sw_flow_actions __force *)
					  flow->sf_acts);
	/* We open code this to make sure cpu 0 is always considered */
	for (cpu = 0; cpu < nr_cpu_ids;
	     cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
		if (flow->stats[cpu])
			kmem_cache_free(flow_stats_cache,
					(struct sw_flow_stats __force *)flow->stats[cpu]);
	}

	kmem_cache_free(flow_cache, flow);
}

/* RCU callback: frees the flow that embeds 'rcu'. */
static void rcu_free_flow_callback(struct rcu_head *rcu)
{
	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);

	flow_free(flow);
}

/* Frees 'flow' (NULL is accepted and ignored).
 *
 * With 'deferred' set the release is queued through call_rcu(); otherwise
 * the flow is freed right away.
 */
void ovs_flow_free(struct sw_flow *flow, bool deferred)
{
	if (!flow)
		return;

	if (deferred)
		call_rcu(&flow->rcu, rcu_free_flow_callback);
	else
		flow_free(flow);
}

/* Frees the bucket array of 'ti' and 'ti' itself, without any deferral. */
static void __table_instance_destroy(struct table_instance *ti)
{
	kvfree(ti->buckets);
	kfree(ti);
}

/* Allocates a table instance with 'new_size' empty buckets, node version 0
 * and a random hash seed. Returns NULL on allocation failure.
 */
static struct table_instance *table_instance_alloc(int new_size)
{
	struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
	int i;

	if (!ti)
		return NULL;

	ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head),
				     GFP_KERNEL);
	if (!ti->buckets) {
		kfree(ti);
		return NULL;
	}

	for (i = 0; i < new_size; i++)
		INIT_HLIST_HEAD(&ti->buckets[i]);

	ti->n_buckets = new_size;
	ti->node_ver = 0;
	get_random_bytes(&ti->hash_seed, sizeof(u32));

	return ti;
}

/* Frees the per-CPU usage statistics of 'ma' and 'ma' itself. */
static void __mask_array_destroy(struct mask_array *ma)
{
	free_percpu(ma->masks_usage_stats);
	kfree(ma);
}

/* RCU callback: destroys the mask array that embeds 'rcu'. */
static void mask_array_rcu_cb(struct rcu_head *rcu)
{
	struct mask_array *ma = container_of(rcu, struct mask_array, rcu);

	__mask_array_destroy(ma);
}

/* Re-bases the usage counters of every slot in 'ma' by recording the current
 * sum of the per-CPU counters as the slot's new "zero" value.
 */
static void tbl_mask_array_reset_counters(struct mask_array *ma)
{
	int i, cpu;

	/* As the per CPU counters are not atomic we can not go ahead and
	 * reset them from another CPU. To be able to still have an approximate
	 * zero based counter we store the value at reset, and subtract it
	 * later when processing.
	 */
	for (i = 0; i < ma->max; i++) {
		ma->masks_usage_zero_cntr[i] = 0;

		for_each_possible_cpu(cpu) {
			struct mask_array_stats *stats;
			unsigned int start;
			u64 counter;

			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
			/* Retry until a consistent snapshot is read. */
			do {
				start = u64_stats_fetch_begin(&stats->syncp);
				counter = stats->usage_cntrs[i];
			} while (u64_stats_fetch_retry(&stats->syncp, start));

			ma->masks_usage_zero_cntr[i] += counter;
		}
	}
}

/* Allocates an empty mask array with room for max(MASK_ARRAY_SIZE_MIN, size)
 * masks. Returns NULL on allocation failure.
 *
 * One zeroed allocation holds the structure, the mask pointer array and,
 * behind it, the per-slot zero counters; the usage counters are a separate
 * per-CPU allocation.
 */
static struct mask_array *tbl_mask_array_alloc(int size)
{
	struct mask_array *new;

	size = max(MASK_ARRAY_SIZE_MIN, size);
	new = kzalloc(struct_size(new, masks, size) +
		      sizeof(u64) * size, GFP_KERNEL);
	if (!new)
		return NULL;

	/* The zero counters start right after the last masks[] element. */
	new->masks_usage_zero_cntr = (u64 *)((u8 *)new +
					     struct_size(new, masks, size));

	new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
						sizeof(u64) * size,
						__alignof__(u64));
	if (!new->masks_usage_stats) {
		kfree(new);
		return NULL;
	}

	new->count = 0;
	new->max = size;

	return new;
}

/* Replaces the mask array of 'tbl' with a new one of capacity 'size'.
 *
 * The non-NULL masks of the old array are copied, densely packed, into the
 * new array; the old array is handed to call_rcu() for freeing. Returns 0 on
 * success or -ENOMEM, in which case 'tbl' is left unchanged.
 */
static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
{
	struct mask_array *old;
	struct mask_array *new;

	new = tbl_mask_array_alloc(size);
	if (!new)
		return -ENOMEM;

	old = ovsl_dereference(tbl->mask_array);
	if (old) {
		int i;

		for (i = 0; i < old->max; i++) {
			if (ovsl_dereference(old->masks[i]))
				new->masks[new->count++] = old->masks[i];
		}
		call_rcu(&old->rcu, mask_array_rcu_cb);
	}

	rcu_assign_pointer(tbl->mask_array, new);

	return 0;
}

/* Appends 'new' to the mask array of 'tbl'.
 *
 * A full array is first grown by MASK_ARRAY_SIZE_MIN slots; otherwise the
 * usage counters are reset. Returns 0 on success or the error returned by
 * tbl_mask_array_realloc().
 */
static int tbl_mask_array_add_mask(struct flow_table *tbl,
				   struct sw_flow_mask *new)
{
	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
	int err, ma_count = READ_ONCE(ma->count);

	if (ma_count >= ma->max) {
		err = tbl_mask_array_realloc(tbl, ma->max +
						  MASK_ARRAY_SIZE_MIN);
		if (err)
			return err;

		/* Pick up the array that was just published. */
		ma = ovsl_dereference(tbl->mask_array);
	} else {
		/* On every add or delete we need to reset the counters so
		 * every new mask gets a fair chance of being prioritized.
		 */
		tbl_mask_array_reset_counters(ma);
	}

	/* The slot at index 'count' must be empty. */
	BUG_ON(ovsl_dereference(ma->masks[ma_count]));

	rcu_assign_pointer(ma->masks[ma_count], new);
	WRITE_ONCE(ma->count, ma_count + 1);

	return 0;
}

/* Removes 'mask' from the mask array of 'tbl' and frees it via kfree_rcu().
 *
 * The last used slot is moved into the vacated position so the used entries
 * stay contiguous. BUG()s if 'mask' is not in the array.
 */
static void tbl_mask_array_del_mask(struct flow_table *tbl,
				    struct sw_flow_mask *mask)
{
	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
	int i, ma_count = READ_ONCE(ma->count);

	/* Remove the deleted mask pointers from the array */
	for (i = 0; i < ma_count; i++) {
		if (mask == ovsl_dereference(ma->masks[i]))
			goto found;
	}

	BUG();
	return;

found:
	WRITE_ONCE(ma->count, ma_count - 1);

	rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]);
	RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL);

	kfree_rcu(mask, rcu);

	/* Shrink the mask array if necessary.
	 * NOTE(review): the result of tbl_mask_array_realloc() is not checked;
	 * if it fails the array keeps its size and the counters are not reset.
	 */
	if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&
	    ma_count <= (ma->max / 3))
		tbl_mask_array_realloc(tbl, ma->max / 2);
	else
		tbl_mask_array_reset_counters(ma);

}

/* Remove 'mask' from the mask list, if it is not needed any more.
 *
 * Drops one reference; the mask is deleted from the array of 'tbl' when the
 * reference count reaches zero. A NULL 'mask' is ignored.
 */
static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
{
	if (mask) {
		/* ovs-lock is required to protect mask-refcount and
		 * mask list.
		 */
		ASSERT_OVSL();
		BUG_ON(!mask->ref_count);
		mask->ref_count--;

		if (!mask->ref_count)
			tbl_mask_array_del_mask(tbl, mask);
	}
}

/* Frees the per-CPU entries of 'mc' and 'mc' itself. */
static void __mask_cache_destroy(struct mask_cache *mc)
{
	free_percpu(mc->mask_cache);
	kfree(mc);
}

/* RCU callback: destroys the mask cache that embeds 'rcu'. */
static void mask_cache_rcu_cb(struct rcu_head *rcu)
{
	struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu);

	__mask_cache_destroy(mc);
}

/* Allocates a mask cache with 'size' entries per CPU.
 *
 * A 'size' of 0 yields a cache object without per-CPU storage. Returns NULL
 * if 'size' is rejected by the check below or if an allocation fails.
 */
static struct mask_cache *tbl_mask_cache_alloc(u32 size)
{
	struct mask_cache_entry __percpu *cache = NULL;
	struct mask_cache *new;

	/* Only allow size to be 0, or a power of 2, and does not exceed
	 * percpu allocation size.
	 */
	if ((!is_power_of_2(size) && size != 0) ||
	    (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
		return NULL;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->cache_size = size;
	if (new->cache_size > 0) {
		cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry),
						  new->cache_size),
				       __alignof__(struct mask_cache_entry));
		if (!cache) {
			kfree(new);
			return NULL;
		}
	}

	new->mask_cache = cache;
	return new;
}

/* Replaces the mask cache of 'table' with one holding 'size' entries.
 *
 * Returns 0 on success (including when the size is unchanged), -EINVAL when
 * 'size' is neither 0 nor a power of two or is too large for a per-CPU
 * allocation, and -ENOMEM on allocation failure. The old cache is freed
 * through call_rcu().
 */
int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
{
	struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
	struct mask_cache *new;

	if (size == mc->cache_size)
		return 0;

	/* Same validation as tbl_mask_cache_alloc(), repeated here so the
	 * caller gets -EINVAL rather than -ENOMEM for a bad size.
	 */
	if ((!is_power_of_2(size) && size != 0) ||
	    (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
		return -EINVAL;

	new = tbl_mask_cache_alloc(size);
	if (!new)
		return -ENOMEM;

	rcu_assign_pointer(table->mask_cache, new);
	call_rcu(&mc->rcu, mask_cache_rcu_cb);

	return 0;
}

/* Initializes 'table' with a default-sized mask cache, a minimum-sized mask
 * array and two empty table instances (flow keys and UFIDs).
 *
 * Returns 0 on success or -ENOMEM; on failure everything allocated so far is
 * released again.
 */
int ovs_flow_tbl_init(struct flow_table *table)
{
	struct table_instance *ti, *ufid_ti;
	struct mask_cache *mc;
	struct mask_array *ma;

	mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES);
	if (!mc)
		return -ENOMEM;

	ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
	if (!ma)
		goto free_mask_cache;

	ti = table_instance_alloc(TBL_MIN_BUCKETS);
	if (!ti)
		goto free_mask_array;

	ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
	if (!ufid_ti)
		goto free_ti;

	rcu_assign_pointer(table->ti, ti);
	rcu_assign_pointer(table->ufid_ti, ufid_ti);
	rcu_assign_pointer(table->mask_array, ma);
	rcu_assign_pointer(table->mask_cache, mc);
	table->last_rehash = jiffies;
	table->count = 0;
	table->ufid_count = 0;
	return 0;

	/* Unwind in reverse order of allocation. */
free_ti:
	__table_instance_destroy(ti);
free_mask_array:
	__mask_array_destroy(ma);
free_mask_cache:
	__mask_cache_destroy(mc);
	return -ENOMEM;
}

/* RCU callback: destroys the table instance that embeds 'rcu'. */
static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
{
	struct table_instance *ti;

	ti = container_of(rcu, struct table_instance, rcu);
	__table_instance_destroy(ti);
}

/* Unlinks 'flow' from 'ti' (and from 'ufid_ti' when the flow is identified
 * by UFID), updates the counters of 'table' and drops the flow's mask
 * reference. The flow itself is not freed here.
 */
static void table_instance_flow_free(struct flow_table *table,
				     struct table_instance *ti,
				     struct table_instance *ufid_ti,
				     struct sw_flow *flow)
{
	hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
	table->count--;

	if (ovs_identifier_is_ufid(&flow->id)) {
		hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
		table->ufid_count--;
	}

	flow_mask_remove(table, flow->mask);
}

/* Removes every flow found in 'ti' from the table and frees it deferred.
 * Warns and zeroes the counters if they are not both zero afterwards.
 *
 * Must be called with OVS mutex held.
 */
void table_instance_flow_flush(struct flow_table *table,
			       struct table_instance *ti,
			       struct table_instance *ufid_ti)
{
	int i;

	for (i = 0; i < ti->n_buckets; i++) {
		struct hlist_head *head = &ti->buckets[i];
		struct hlist_node *n;
		struct sw_flow *flow;

		/* The _safe variant is needed as entries are unlinked. */
		hlist_for_each_entry_safe(flow, n, head,
					  flow_table.node[ti->node_ver]) {

			table_instance_flow_free(table, ti, ufid_ti,
						 flow);
			ovs_flow_free(flow, true);
		}
	}

	if (WARN_ON(table->count != 0 ||
		    table->ufid_count != 0)) {
		table->count = 0;
		table->ufid_count = 0;
	}
}

/* Queues both table instances for destruction through call_rcu(). */
static void table_instance_destroy(struct table_instance *ti,
				   struct table_instance *ufid_ti)
{
	call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
	call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
}

/* No need for locking this function is called from RCU callback or
 * error path.
*/ void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); struct mask_cache *mc = rcu_dereference_raw(table->mask_cache); struct mask_array *ma = rcu_dereference_raw(table->mask_array); call_rcu(&mc->rcu, mask_cache_rcu_cb); call_rcu(&ma->rcu, mask_array_rcu_cb); table_instance_destroy(ti, ufid_ti); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, u32 *bucket, u32 *last) { struct sw_flow *flow; struct hlist_head *head; int ver; int i; ver = ti->node_ver; while (*bucket < ti->n_buckets) { i = 0; head = &ti->buckets[*bucket]; hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) { if (i < *last) { i++; continue; } *last = i + 1; return flow; } (*bucket)++; *last = 0; } return NULL; } static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) { hash = jhash_1word(hash, ti->hash_seed); return &ti->buckets[hash & (ti->n_buckets - 1)]; } static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow) { struct hlist_head *head; head = find_bucket(ti, flow->flow_table.hash); hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head); } static void ufid_table_instance_insert(struct table_instance *ti, struct sw_flow *flow) { struct hlist_head *head; head = find_bucket(ti, flow->ufid_table.hash); hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head); } static void flow_table_copy_flows(struct table_instance *old, struct table_instance *new, bool ufid) { int old_ver; int i; old_ver = old->node_ver; new->node_ver = !old_ver; /* Insert in new table. 
*/ for (i = 0; i < old->n_buckets; i++) { struct sw_flow *flow; struct hlist_head *head = &old->buckets[i]; if (ufid) hlist_for_each_entry_rcu(flow, head, ufid_table.node[old_ver], lockdep_ovsl_is_held()) ufid_table_instance_insert(new, flow); else hlist_for_each_entry_rcu(flow, head, flow_table.node[old_ver], lockdep_ovsl_is_held()) table_instance_insert(new, flow); } } static struct table_instance *table_instance_rehash(struct table_instance *ti, int n_buckets, bool ufid) { struct table_instance *new_ti; new_ti = table_instance_alloc(n_buckets); if (!new_ti) return NULL; flow_table_copy_flows(ti, new_ti, ufid); return new_ti; } int ovs_flow_tbl_flush(struct flow_table *flow_table) { struct table_instance *old_ti, *new_ti; struct table_instance *old_ufid_ti, *new_ufid_ti; new_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!new_ti) return -ENOMEM; new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!new_ufid_ti) goto err_free_ti; old_ti = ovsl_dereference(flow_table->ti); old_ufid_ti = ovsl_dereference(flow_table->ufid_ti); rcu_assign_pointer(flow_table->ti, new_ti); rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti); flow_table->last_rehash = jiffies; table_instance_flow_flush(flow_table, old_ti, old_ufid_ti); table_instance_destroy(old_ti, old_ufid_ti); return 0; err_free_ti: __table_instance_destroy(new_ti); return -ENOMEM; } static u32 flow_hash(const struct sw_flow_key *key, const struct sw_flow_key_range *range) { const u32 *hash_key = (const u32 *)((const u8 *)key + range->start); /* Make sure number of hash bytes are multiple of u32. 
*/ int hash_u32s = range_n_bytes(range) >> 2; return jhash2(hash_key, hash_u32s, 0); } static int flow_key_start(const struct sw_flow_key *key) { if (key->tun_proto) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), sizeof(long)); } static bool cmp_key(const struct sw_flow_key *key1, const struct sw_flow_key *key2, int key_start, int key_end) { const long *cp1 = (const long *)((const u8 *)key1 + key_start); const long *cp2 = (const long *)((const u8 *)key2 + key_start); int i; for (i = key_start; i < key_end; i += sizeof(long)) if (*cp1++ ^ *cp2++) return false; return true; } static bool flow_cmp_masked_key(const struct sw_flow *flow, const struct sw_flow_key *key, const struct sw_flow_key_range *range) { return cmp_key(&flow->key, key, range->start, range->end); } static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, const struct sw_flow_match *match) { struct sw_flow_key *key = match->key; int key_start = flow_key_start(key); int key_end = match->range.end; BUG_ON(ovs_identifier_is_ufid(&flow->id)); return cmp_key(flow->id.unmasked_key, key, key_start, key_end); } static struct sw_flow *masked_flow_lookup(struct table_instance *ti, const struct sw_flow_key *unmasked, const struct sw_flow_mask *mask, u32 *n_mask_hit) { struct sw_flow *flow; struct hlist_head *head; u32 hash; struct sw_flow_key masked_key; ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); (*n_mask_hit)++; hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver], lockdep_ovsl_is_held()) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) return flow; } return NULL; } /* Flow lookup does full lookup on flow table. It starts with * mask from index passed in *index. * This function MUST be called with BH disabled due to the use * of CPU specific variables. 
*/ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, u32 *n_cache_hit, u32 *index) { struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats); struct sw_flow *flow; struct sw_flow_mask *mask; int i; if (likely(*index < ma->max)) { mask = rcu_dereference_ovsl(ma->masks[*index]); if (mask) { flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { u64_stats_update_begin(&stats->syncp); stats->usage_cntrs[*index]++; u64_stats_update_end(&stats->syncp); (*n_cache_hit)++; return flow; } } } for (i = 0; i < ma->max; i++) { if (i == *index) continue; mask = rcu_dereference_ovsl(ma->masks[i]); if (unlikely(!mask)) break; flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { /* Found */ *index = i; u64_stats_update_begin(&stats->syncp); stats->usage_cntrs[*index]++; u64_stats_update_end(&stats->syncp); return flow; } } return NULL; } /* * mask_cache maps flow to probable mask. This cache is not tightly * coupled cache, It means updates to mask list can result in inconsistent * cache entry in mask cache. * This is per cpu cache and is divided in MC_HASH_SEGS segments. * In case of a hash collision the entry is hashed in next segment. * */ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, u32 *n_mask_hit, u32 *n_cache_hit) { struct mask_cache *mc = rcu_dereference(tbl->mask_cache); struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); struct mask_cache_entry *entries, *ce; struct sw_flow *flow; u32 hash; int seg; *n_mask_hit = 0; *n_cache_hit = 0; if (unlikely(!skb_hash || mc->cache_size == 0)) { u32 mask_index = 0; u32 cache = 0; return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash * value. 
To avoid hash collisions, rehash the 'skb_hash' with * 'recirc_id'. */ if (key->recirc_id) skb_hash = jhash_1word(skb_hash, key->recirc_id); ce = NULL; hash = skb_hash; entries = this_cpu_ptr(mc->mask_cache); /* Find the cache entry 'ce' to operate on. */ for (seg = 0; seg < MC_HASH_SEGS; seg++) { int index = hash & (mc->cache_size - 1); struct mask_cache_entry *e; e = &entries[index]; if (e->skb_hash == skb_hash) { flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; } if (!ce || e->skb_hash < ce->skb_hash) ce = e; /* A better replacement cache candidate. */ hash >>= MC_HASH_SHIFT; } /* Cache miss, do full lookup. */ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, &ce->mask_index); if (flow) ce->skb_hash = skb_hash; *n_cache_hit = 0; return flow; } struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, const struct sw_flow_key *key) { struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; u32 __always_unused n_cache_hit; struct sw_flow *flow; u32 index = 0; /* This function gets called trough the netlink interface and therefore * is preemptible. However, flow_lookup() function needs to be called * with BH disabled due to CPU specific variables. */ local_bh_disable(); flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); local_bh_enable(); return flow; } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, const struct sw_flow_match *match) { struct mask_array *ma = ovsl_dereference(tbl->mask_array); int i; /* Always called under ovs-mutex. 
*/ for (i = 0; i < ma->max; i++) { struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 __always_unused n_mask_hit; struct sw_flow_mask *mask; struct sw_flow *flow; mask = ovsl_dereference(ma->masks[i]); if (!mask) continue; flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit); if (flow && ovs_identifier_is_key(&flow->id) && ovs_flow_cmp_unmasked_key(flow, match)) { return flow; } } return NULL; } static u32 ufid_hash(const struct sw_flow_id *sfid) { return jhash(sfid->ufid, sfid->ufid_len, 0); } static bool ovs_flow_cmp_ufid(const struct sw_flow *flow, const struct sw_flow_id *sfid) { if (flow->id.ufid_len != sfid->ufid_len) return false; return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len); } bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match) { if (ovs_identifier_is_ufid(&flow->id)) return flow_cmp_masked_key(flow, match->key, &match->range); return ovs_flow_cmp_unmasked_key(flow, match); } struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, const struct sw_flow_id *ufid) { struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti); struct sw_flow *flow; struct hlist_head *head; u32 hash; hash = ufid_hash(ufid); head = find_bucket(ti, hash); hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver], lockdep_ovsl_is_held()) { if (flow->ufid_table.hash == hash && ovs_flow_cmp_ufid(flow, ufid)) return flow; } return NULL; } int ovs_flow_tbl_num_masks(const struct flow_table *table) { struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); return READ_ONCE(ma->count); } u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) { struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); return READ_ONCE(mc->cache_size); } static struct table_instance *table_instance_expand(struct table_instance *ti, bool ufid) { return table_instance_rehash(ti, ti->n_buckets * 2, ufid); } /* Must be called with OVS mutex held. 
*/ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { struct table_instance *ti = ovsl_dereference(table->ti); struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); BUG_ON(table->count == 0); table_instance_flow_free(table, ti, ufid_ti, flow); } static struct sw_flow_mask *mask_alloc(void) { struct sw_flow_mask *mask; mask = kmalloc(sizeof(*mask), GFP_KERNEL); if (mask) mask->ref_count = 1; return mask; } static bool mask_equal(const struct sw_flow_mask *a, const struct sw_flow_mask *b) { const u8 *a_ = (const u8 *)&a->key + a->range.start; const u8 *b_ = (const u8 *)&b->key + b->range.start; return (a->range.end == b->range.end) && (a->range.start == b->range.start) && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); } static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, const struct sw_flow_mask *mask) { struct mask_array *ma; int i; ma = ovsl_dereference(tbl->mask_array); for (i = 0; i < ma->max; i++) { struct sw_flow_mask *t; t = ovsl_dereference(ma->masks[i]); if (t && mask_equal(mask, t)) return t; } return NULL; } /* Add 'mask' into the mask list, if it is not already there. */ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, const struct sw_flow_mask *new) { struct sw_flow_mask *mask; mask = flow_mask_find(tbl, new); if (!mask) { /* Allocate a new mask if none exists. */ mask = mask_alloc(); if (!mask) return -ENOMEM; mask->key = new->key; mask->range = new->range; /* Add mask to mask-list. */ if (tbl_mask_array_add_mask(tbl, mask)) { kfree(mask); return -ENOMEM; } } else { BUG_ON(!mask->ref_count); mask->ref_count++; } flow->mask = mask; return 0; } /* Must be called with OVS mutex held. 
*/ static void flow_key_insert(struct flow_table *table, struct sw_flow *flow) { struct table_instance *new_ti = NULL; struct table_instance *ti; flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range); ti = ovsl_dereference(table->ti); table_instance_insert(ti, flow); table->count++; /* Expand table, if necessary, to make room. */ if (table->count > ti->n_buckets) new_ti = table_instance_expand(ti, false); else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) new_ti = table_instance_rehash(ti, ti->n_buckets, false); if (new_ti) { rcu_assign_pointer(table->ti, new_ti); call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); table->last_rehash = jiffies; } } /* Must be called with OVS mutex held. */ static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow) { struct table_instance *ti; flow->ufid_table.hash = ufid_hash(&flow->id); ti = ovsl_dereference(table->ufid_ti); ufid_table_instance_insert(ti, flow); table->ufid_count++; /* Expand table, if necessary, to make room. */ if (table->ufid_count > ti->n_buckets) { struct table_instance *new_ti; new_ti = table_instance_expand(ti, true); if (new_ti) { rcu_assign_pointer(table->ufid_ti, new_ti); call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); } } } /* Must be called with OVS mutex held. */ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, const struct sw_flow_mask *mask) { int err; err = flow_mask_insert(table, flow, mask); if (err) return err; flow_key_insert(table, flow); if (ovs_identifier_is_ufid(&flow->id)) flow_ufid_insert(table, flow); return 0; } static int compare_mask_and_count(const void *a, const void *b) { const struct mask_count *mc_a = a; const struct mask_count *mc_b = b; return (s64)mc_b->counter - (s64)mc_a->counter; } /* Must be called with OVS mutex held. 
*/
void ovs_flow_masks_rebalance(struct flow_table *table)
{
	struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
	struct mask_array *reordered;
	struct mask_count *usage;
	int n_used;
	int pos;

	/* One usage record per possible slot of the current array. */
	usage = kmalloc_array(ma->max, sizeof(*usage), GFP_KERNEL);
	if (!usage)
		return;

	/* Collect the usage of every mask up to the first empty slot. */
	for (n_used = 0; n_used < ma->max; n_used++) {
		struct mask_count *entry = &usage[n_used];
		int cpu;

		if (unlikely(!rcu_dereference_ovsl(ma->masks[n_used])))
			break;

		entry->index = n_used;
		entry->counter = 0;

		for_each_possible_cpu(cpu) {
			struct mask_array_stats *stats =
				per_cpu_ptr(ma->masks_usage_stats, cpu);
			unsigned int seq;
			u64 snapshot;

			do {
				seq = u64_stats_fetch_begin(&stats->syncp);
				snapshot = stats->usage_cntrs[n_used];
			} while (u64_stats_fetch_retry(&stats->syncp, seq));

			entry->counter += snapshot;
		}

		/* Make the total relative to the last counter reset. */
		entry->counter -= ma->masks_usage_zero_cntr[n_used];

		/* Fold the usage just read into the zero counter, which
		 * resets this slot without a separate call to
		 * tbl_mask_array_reset_counters() when no reorder happens.
		 */
		ma->masks_usage_zero_cntr[n_used] += entry->counter;
	}

	if (n_used == 0)
		goto out_free;

	/* Order the records by usage. */
	sort(usage, n_used, sizeof(*usage), compare_mask_and_count, NULL);

	/* Nothing to do when sorting left every record in place. */
	pos = 0;
	while (pos < n_used && usage[pos].index == pos)
		pos++;
	if (pos == n_used)
		goto out_free;

	/* Build a replacement array with the masks in sorted order. */
	reordered = tbl_mask_array_alloc(ma->max);
	if (!reordered)
		goto out_free;

	for (pos = 0; pos < n_used; pos++) {
		int src = usage[pos].index;

		if (!ovsl_dereference(ma->masks[src]))
			continue;

		reordered->masks[reordered->count++] = ma->masks[src];
	}

	rcu_assign_pointer(table->mask_array, reordered);
	call_rcu(&ma->rcu, mask_array_rcu_cb);

out_free:
	kfree(usage);
}

/* Initializes the flow module.
* Returns zero if successful or a negative error code. */ int ovs_flow_init(void) { BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + (nr_cpu_ids * sizeof(struct sw_flow_stats *)) + cpumask_size(), 0, 0, NULL); if (flow_cache == NULL) return -ENOMEM; flow_stats_cache = kmem_cache_create("sw_flow_stats", sizeof(struct sw_flow_stats), 0, SLAB_HWCACHE_ALIGN, NULL); if (flow_stats_cache == NULL) { kmem_cache_destroy(flow_cache); flow_cache = NULL; return -ENOMEM; } return 0; } /* Uninitializes the flow module. */ void ovs_flow_exit(void) { kmem_cache_destroy(flow_stats_cache); kmem_cache_destroy(flow_cache); }
40 22 352 3428 12 3183 1357 1 1448 253 4 243 223 2209 1779 1095 11928 4078 4645 3178 7721 9661 6 265 725 265 4109 9 2193 2195 4834 96 3601 2269 48 2273 2269 18 17 9655 2502 3587 41 2758 2757 16 16 161 15983 6208 1074 13215 4076 1403 24 115 974 5711 277 278 251 161 125 252 347 77 71 13 52 1 4309 6890 2593 28 1585 24 762 1 26 1417 3368 722 2541 1 25 26 5005 19290 19289 18567 2091 18537 2401 363 2731 5125 12999 5125 5354 5357 2730 2733 2733 2731 5067 5293 12999 5539 5539 5539 2403 384 5326 2423 2428 2097 1399 55 6 1075 2 1 3 737 4 112 261 20 3 9761 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 
390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 
890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 
1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 
1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 
2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 
2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 
2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 
3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 
3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 
4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/compiler.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> #include <linux/kasan.h> #include <linux/memremap.h> #include <linux/slab.h> #include <linux/cacheinfo.h> #include <linux/rcuwait.h> /* Forward declarations of types defined elsewhere. */ struct mempolicy; struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; struct folio_batch; void arch_mm_preinit(void); void mm_core_init(void); void init_mm_internals(void); /* Global page counter; read and adjusted through the totalram_pages*() helpers that follow. */ extern atomic_long_t _totalram_pages; /* Atomically read _totalram_pages and return it as an unsigned long. */ static inline unsigned long totalram_pages(void) { return (unsigned long)atomic_long_read(&_totalram_pages); } /* Atomically increment _totalram_pages by one. */ static inline void totalram_pages_inc(void) { atomic_long_inc(&_totalram_pages); } /* Atomically decrement _totalram_pages by one. */ static inline void totalram_pages_dec(void) { atomic_long_dec(&_totalram_pages); } /* Atomically add @count (a signed long, so it may be negative) to _totalram_pages. */ static inline void totalram_pages_add(long count) { atomic_long_add(count, &_totalram_pages); } extern void * high_memory; #ifdef CONFIG_SYSCTL extern int 
sysctl_legacy_va_layout; #else #define sysctl_legacy_va_layout 0 #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS extern const int mmap_rnd_compat_bits_min; extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif /* Fallback DIRECT_MAP_PHYSMEM_END when it is not already defined: derived from MAX_PHYSMEM_BITS if that exists, otherwise an all-ones phys_addr_t with bit 63 cleared. */ #ifndef DIRECT_MAP_PHYSMEM_END # ifdef MAX_PHYSMEM_BITS # define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else # define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif #include <asm/page.h> #include <asm/processor.h> /* Generic fallbacks for __pa_symbol(), page_to_virt() and lm_alias(); each is used only when the macro is not already defined. */ #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif /* page_to_virt(): page -> PFN -> physical address -> virtual address via __va(). */ #ifndef page_to_virt #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) #endif #ifndef lm_alias #define lm_alias(x) __va(__pa_symbol(x)) #endif /* * mm_forbids_zeropage() lets an architecture prevent common memory * management code from establishing a zero page mapping on a read fault. * An override should be defined within <asm/pgtable.h>; the default * below always evaluates to 0. * s390 does this to prevent multiplexing of hardware bits * related to the physical page in case of virtualization. */ #ifndef mm_forbids_zeropage #define mm_forbids_zeropage(X) (0) #endif /* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement their own version of * mm_zero_struct_page they should wrap the defines below in a #ifndef and * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 96 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. 
*/ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) static inline void __mm_zero_struct_page(struct page *page) { /* View the struct as an array of unsigned long words so it can be cleared one word at a time. */ unsigned long *_pp = (void *)page; /* Build-time checks: sizeof(struct page) must be a multiple of 8 in the range 56..96, i.e. one of 56, 64, 72, 80, 88 or 96 bytes. */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); BUILD_BUG_ON(sizeof(struct page) > 96); /* Enter at the case matching the struct size and fall through, so every word from the last one (size / 8 - 1) down to word 0 is cleared. */ switch (sizeof(struct page)) { case 96: _pp[11] = 0; fallthrough; case 88: _pp[10] = 0; fallthrough; case 80: _pp[9] = 0; fallthrough; case 72: _pp[8] = 0; fallthrough; case 64: _pp[7] = 0; fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0; _pp[2] = 0; _pp[1] = 0; _pp[0] = 0; } } #else /* BITS_PER_LONG != 64: clear the whole struct page with a plain memset(). */ #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a * problem. * * When a program's coredump is generated as ELF format, a section is created * per a vma. In ELF, the number of sections is represented in unsigned short. * This means the number of sections should be smaller than 65535 at coredump. * Because the kernel adds some informative sections to a image of program at * generating coredump, we need some margin. The number of extra sections is * 1-3 now and depends on arch. We use "5" as safe margin, here. * * ELF extended numbering allows more than 65535 sections, so 16-bit bound is * not a hard limit any more. Although some userspace tools can be surprised by * that. 
*/ #define MAPCOUNT_ELF_CORE_MARGIN (5) /* Default map-count limit: USHRT_MAX minus the ELF core-dump margin. */ #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; /* nth_page(page, n) yields the page n entries after 'page'; folio_page_idx(folio, p) yields the offset of page p from the start of the folio. With SPARSEMEM but without SPARSEMEM_VMEMMAP both are computed through PFNs; otherwise plain struct page pointer arithmetic is used. */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) #define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* Align the address up to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* Align the address down to the (previous) page boundary */ #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) /* Test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) /* Return the folio whose 'lru' member is head->prev, i.e. the list entry immediately before 'head'. */ static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); } void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions). */ struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif /* * vm_flags in vm_area_struct, see mm_types.h. 
* When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 #define VM_SHARED 0x00000008 /* Each VM_MAY* bit below is the matching active bit shifted left by 4: mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and likewise for the write and exec bits. */ #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ #define VM_MAYWRITE 0x00000020 #define VM_MAYEXEC 0x00000040 #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #else /* CONFIG_MMU */ #define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ /* 0x00000800 is not defined in this list. */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ /* Used by sys_madvise() */ #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. 
*/ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ #else # define VM_SOFTDIRTY 0 #endif #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ /* Bit numbers 32..38 and the corresponding BIT() masks for the VM_HIGH_ARCH_* flags; only defined with CONFIG_ARCH_USES_HIGH_VMA_FLAGS. */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) #define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ /* Protection-key flag bits start at VM_HIGH_ARCH_BIT_0. VM_PKEY_BIT3 and VM_PKEY_BIT4 are 0 unless CONFIG_ARCH_PKEY_BITS is greater than 3 and 4 respectively. */ #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 # define VM_PKEY_BIT1 VM_HIGH_ARCH_1 # define VM_PKEY_BIT2 VM_HIGH_ARCH_2 #if CONFIG_ARCH_PKEY_BITS > 3 # define VM_PKEY_BIT3 VM_HIGH_ARCH_3 #else # define VM_PKEY_BIT3 0 #endif #if CONFIG_ARCH_PKEY_BITS > 4 # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_X86_USER_SHADOW_STACK /* * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of * support core mm. 
* * These VMAs will get a single end guard page. This helps userspace protect * itself from attacks. A single page is enough for current shadow stack archs * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 #endif #if defined(CONFIG_ARM64_GCS) /* * arm64's Guarded Control Stack implements similar functionality and * has similar constraints to shadow stacks. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_6 #endif /* Neither shadow stack option above defined the flag: make it VM_NONE. */ #ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif /* Architecture- or configuration-specific names for VM_ARCH_1. */ #if defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI #elif defined(CONFIG_ARM64) # define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ # define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif #if defined(CONFIG_ARM64_MTE) # define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ # define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE #endif /* Unless already defined (the CONFIG_PARISC branch above), VM_GROWSUP is VM_NONE. */ #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR # define VM_UFFD_MINOR_BIT 41 # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ /* * This flag is used to connect VFIO to arch specific KVM code. It * indicates that the memory under this VMA is safe for use with any * non-cachable memory type inside KVM. Some VFIO devices, on some * platforms, are thought to be unsafe and can cause machine crashes * if KVM does not lock down the memory type. 
*/ #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED_BIT 39 #define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) #else #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif /* VM_DROPPABLE: bit 40 on 64-bit, VM_ARCH_1 on CONFIG_PPC32, VM_NONE otherwise. */ #ifdef CONFIG_64BIT #define VM_DROPPABLE_BIT 40 #define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) #elif defined(CONFIG_PPC32) #define VM_DROPPABLE VM_ARCH_1 #else #define VM_DROPPABLE VM_NONE #endif #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) /* VM_EXEC if the current task's personality has READ_IMPLIES_EXEC set, otherwise 0. */ #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) /* Common data flag combinations; they differ only in the exec bit: TASK_EXEC, none, or VM_EXEC. */ #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC #endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) /* Stack growth direction: VM_STACK is VM_GROWSUP with CONFIG_STACK_GROWSUP, else VM_GROWSDOWN; VM_STACK_EARLY is VM_GROWSDOWN only in the grows-up case and 0 otherwise. */ #ifdef CONFIG_STACK_GROWSUP #define VM_STACK VM_GROWSUP #define VM_STACK_EARLY VM_GROWSDOWN #else #define VM_STACK VM_GROWSDOWN #define VM_STACK_EARLY 0 #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) /* * Special vmas that are non-mergable, non-mlock()able. 
*/ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) /* This mask prevents a VMA from being scanned by khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit from its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* * Mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask. */ /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. */ #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. 
*/ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); } /* Initializer list pairing each FAULT_FLAG_* value with its name string */ #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. * * pgoff should be used in favour of virtual_address, if possible. * NOTE(review): the struct below has no 'virtual_address' member (only * 'address' and 'real_address'); this wording looks stale - confirm. */ struct vm_fault { /* Members of this anonymous struct are const-qualified */ const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address - masked */ unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ union { pte_t orig_pte; /* Value of PTE at the time of fault */ pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only. */ }; struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). 
*/ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. */ spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() sets up a page * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { /* NOTE(review): open and mremap carry no description here; presumably invoked when an area is opened and when it is moved by mremap() - confirm against callers */ void (*open)(struct vm_area_struct * area); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not * be modified. Returns 0 if mprotect() can proceed. 
*/ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); /* Fault callbacks: each takes the struct vm_fault for the fault and returns a vm_fault_t; huge_fault also receives an order, map_pages a start/end page offset pair */ vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); /* NOTE(review): presumably reports the page size backing the area - confirm */ unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs. See also generic_access_phys() for a generic * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); /* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this * vma to be dumped unconditionally. */ const char *(*name)(struct vm_area_struct *vma); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy. */ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); /* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. 
* If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page. */ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); }; /* Per-VMA numab_state helpers: init sets the pointer to NULL, free kfree()s it; both are empty without CONFIG_NUMA_BALANCING */ #ifdef CONFIG_NUMA_BALANCING static inline void vma_numab_state_init(struct vm_area_struct *vma) { vma->numab_state = NULL; } static inline void vma_numab_state_free(struct vm_area_struct *vma) { kfree(vma->numab_state); } #else static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ /* * These must be here rather than mmap_lock.h as dependent on vm_fault type, * declared in this header. 
*/ #ifdef CONFIG_PER_VMA_LOCK /* Release the lock the fault runs under: vma_end_read() when FAULT_FLAG_VMA_LOCK is set, otherwise mmap_read_unlock() on the VMA's mm */ static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_end_read(vmf->vma); else mmap_read_unlock(vmf->vma->vm_mm); } /* Assert the lock selected by FAULT_FLAG_VMA_LOCK (VMA lock or mmap_lock) is held */ static inline void assert_fault_locked(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); else mmap_assert_locked(vmf->vma->vm_mm); } #else /* Without CONFIG_PER_VMA_LOCK only the mmap_lock variants are used */ static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; /* Zero @vma, point it at @mm and the dummy vm_ops, then initialise its anon_vma_chain list head and its lock */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { ACCESS_PRIVATE(vma, __vm_flags) = flags; } /* * Use when VMA is part of the VMA tree and modifications need coordination * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and * it should be locked explicitly beforehand. */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); vm_flags_init(vma, flags); } /* Same as vm_flags_reset() but the store is done with WRITE_ONCE() */ static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } /* Call vma_start_write() on the VMA, then OR @flags into its flags */ static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) |= flags; } /* Call vma_start_write() on the VMA, then clear @flags from its flags */ static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } /* * Use only if VMA is not part of the VMA tree or has no other users and * therefore needs no locking. 
*/ static inline void __vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vm_flags_init(vma, (vma->vm_flags | set) & ~clear); } /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. */ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vma_start_write(vma); __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; } static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; } /* * Indicate if the VMA is a heap for the given task; for * /proc/PID/maps that is the heap of the main task. */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { return vma->vm_start < vma->vm_mm->brk && vma->vm_end > vma->vm_mm->start_brk; } /* * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be * its "stack". It's not even well-defined for programs written * languages like Go. 
*/ return vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); if (!maybe_stack) return false; if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == VM_STACK_INCOMPLETE_SETUP) return true; return false; } static inline bool vma_is_foreign(struct vm_area_struct *vma) { if (!current->mm) return true; if (current->mm != vma->vm_mm) return true; return false; } static inline bool vma_is_accessible(struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } static inline bool is_shared_maywrite(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == (VM_SHARED | VM_MAYWRITE); } static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. 
*/ return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) { return mas_next_range(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } /* Store NULL over [start, end - 1] using @gfp; returns -ENOMEM if the maple state is left in error, otherwise 0 */ static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { __mas_set_range(&vmi->mas, start, end - 1); mas_store_gfp(&vmi->mas, NULL, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { mas_destroy(&vmi->mas); } /* Store @vma over [vm_start, vm_end - 1]; on success mark it attached and return 0, on a maple state error return -ENOMEM without marking it */ static inline int vma_iter_bulk_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { vmi->mas.index = vma->vm_start; vmi->mas.last = vma->vm_end - 1; mas_store(&vmi->mas, vma); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; vma_mark_attached(vma); return 0; } static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); } static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) { mas_set(&vmi->mas, addr); } /* Loop header: assigns successive vma_next() results to @__vma until NULL */ #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. 
*/ bool vma_is_shmem(struct vm_area_struct *vma); bool vma_is_anon_shmem(struct vm_area_struct *vma); #else /* Without CONFIG_SHMEM both predicates are constant false */ static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; extern void prep_compound_page(struct page *page, unsigned int order); /* The order is read from the low 8 bits of folio->_flags_1 */ static inline unsigned int folio_large_order(const struct folio *folio) { return folio->_flags_1 & 0xff; } /* Page count of a large folio: read from _nr_pages when NR_PAGES_IN_LARGE_FOLIO is defined, otherwise computed as 1 << order */ #ifdef NR_PAGES_IN_LARGE_FOLIO static inline long folio_large_nr_pages(const struct folio *folio) { return folio->_nr_pages; } #else static inline long folio_large_nr_pages(const struct folio *folio) { return 1L << folio_large_order(folio); } #endif /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 0; return folio_large_order(folio); } /** * folio_order - The allocation order of a folio. * @folio: The folio. * * A folio is composed of 2^order pages. See get_order() for the definition * of order. * * Return: The order of the folio. */ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; return folio_large_order(folio); } /** * folio_reset_order - Reset the folio order and derived _nr_pages * @folio: The folio. * * Reset the order and derived _nr_pages to 0. 
Must only be used in the * process of splitting large folios. */ static inline void folio_reset_order(struct folio *folio) { /* Warn once and do nothing if the folio is not large */ if (WARN_ON_ONCE(!folio_test_large(folio))) return; folio->_flags_1 &= ~0xffUL; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 0; #endif } #include <linux/huge_mm.h> /* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. */ /* * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); return page_ref_dec_and_test(page); } /* put_page_testzero() applied to &folio->page */ static inline int folio_put_testzero(struct folio *folio) { return put_page_testzero(&folio->page); } /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } /* Take a reference via get_page_unless_zero(); returns @page cast to a folio, or NULL if no reference was taken */ static inline struct folio *folio_get_nontail_page(struct page *page) { if (unlikely(!get_page_unless_zero(page))) return NULL; return (struct folio *)page; } extern int page_is_ram(unsigned long pfn); /* NOTE(review): presumably the possible results of region_intersects() below - confirm */ enum { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED, }; int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); /* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. 
*/ #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else static inline bool is_vmalloc_addr(const void *x) { return false; } static inline int is_vmalloc_or_module_addr(const void *x) { return 0; } #endif /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes or implementation of other core folio_*() primitives. */ static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); /* Without CONFIG_64BIT, order-1 folios always report 0 */ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) return 0; return atomic_read(&folio->_entire_mapcount) + 1; } /* Returns the folio's _large_mapcount plus one; VM_WARN_ON_FOLIO() fires if the folio is not large */ static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_large_mapcount) + 1; } /** * folio_mapcount() - Number of mappings of this folio. * @folio: The folio. * * The folio mapcount corresponds to the number of present user page table * entries that reference any part of a folio. Each such present user page * table entry must be paired with exactly one folio reference. * * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts * exactly once. * * For hugetlb folios, each abstracted "hugetlb" user page table entry that * references the entire folio counts exactly once, even when such special * page table entries are comprised of multiple ordinary page table entries. * * Will report 0 for pages which cannot be mapped into userspace, such as * slab, page tables and similar. * * Return: The number of times this folio is mapped. 
*/ static inline int folio_mapcount(const struct folio *folio) { int mapcount; /* Small folio: _mapcount + 1, reported as 0 when page_mapcount_is_type() is true for that value */ if (likely(!folio_test_large(folio))) { mapcount = atomic_read(&folio->_mapcount) + 1; if (page_mapcount_is_type(mapcount)) mapcount = 0; return mapcount; } return folio_large_mapcount(folio); } /** * folio_mapped - Is this folio mapped into userspace? * @folio: The folio. * * Return: True if any page in this folio is referenced by user page tables. */ static inline bool folio_mapped(const struct folio *folio) { return folio_mapcount(folio) >= 1; } /* * Return true if this page is mapped into pagetables. * For compound page it returns true if any sub-page of compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD. */ static inline bool page_mapped(const struct page *page) { return folio_mapped(page_folio(page)); } /* compound_head() of the page that virt_to_page() returns for @x */ static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); return compound_head(page); } /* page_folio() of the page that virt_to_page() returns for @x */ static inline struct folio *virt_to_folio(const void *x) { struct page *page = virt_to_page(x); return page_folio(page); } void __folio_put(struct folio *folio); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { return PAGE_SIZE << compound_order(page); } /* Returns the number of bits needed for the number of bytes in a page */ static inline unsigned int page_shift(struct page *page) { return PAGE_SHIFT + compound_order(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return compound_order(page); } /** * thp_size - Size of a transparent huge page. 
* @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, we do always want * pte_mkwrite. But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. * Returns @pte, made writable only when the VMA has VM_WRITE set. */ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pte = pte_mkwrite(pte, vma); return pte; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page); void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); #endif /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. * * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. 
The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A folio may belong to an inode's memory mapping. In this case, * folio->mapping points to the inode, and folio->index is the file * offset of the folio, in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that * case PG_swapcache is set, and page->private is an offset into the swapcache. * * In either case (swapcache or inode backed), the pagecache itself holds one * reference to the page. Setting PG_private should also increment the * refcount. Each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. 
* * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need * to be written back to the inode on disk, * - anonymous pages (including MAP_PRIVATE file mappings) which have been * modified may need to be swapped out to swap space and (later) to be read * back into memory. */ /* 127: arbitrary random number, small enough to assemble well */ /* True when the count, taken as unsigned int, is 0 or within 127 of wrapping: adding 127 then gives a result <= 127 */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) /** * folio_get - Increment the reference count on a folio. * @folio: The folio. * * Context: May be called in any context, as long as you know that * you have a refcount on the folio. If you do not already have one, * folio_try_get() may be the right interface for you to use. */ static inline void folio_get(struct folio *folio) { VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); folio_ref_inc(folio); } /* Take a reference on the folio containing @page; warns once and takes no reference if folio_test_slab() is true for it */ static inline void get_page(struct page *page) { struct folio *folio = page_folio(page); if (WARN_ON_ONCE(folio_test_slab(folio))) return; folio_get(folio); } /* Take a reference on compound_head(@page); warns once and returns false, taking no reference, if its refcount is <= 0 */ static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; page_ref_inc(page); return true; } /** * folio_put - Decrement the reference count on a folio. * @folio: The folio. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put() unless you can be sure that it wasn't the * last reference. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) __folio_put(folio); } /** * folio_put_refs - Reduce the reference count on a folio. 
* @folio: The folio. * @refs: The amount to subtract from the folio's reference count. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put_refs() unless you can be sure that these weren't * the last references. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) __folio_put(folio); } /* folios_put() calls this with @refs == NULL. NOTE(review): presumably a non-NULL @refs supplies a per-folio count to drop - confirm */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs); /* * union release_pages_arg - an array of pages or folios * * release_pages() releases a simple array of multiple pages, and * accepts various different forms of said page array: either * a regular old boring array of pages, an array of folios, or * an array of encoded page pointers. * * The transparent union syntax for this kind of "any of these * argument types" is all kinds of ugly, so look away. */ typedef union { struct page **pages; struct folio **folios; struct encoded_page **encoded_pages; } release_pages_arg __attribute__ ((__transparent_union__)); void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need to * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. 
*/ static inline void folios_put(struct folio_batch *folios) { folios_put_refs(folios, NULL); } /* Drop a reference on the folio containing @page; does nothing if folio_test_slab() is true for that folio */ static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); if (folio_test_slab(folio)) return; folio_put(folio); } /* * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload * the page's refcount so that two separate items are tracked: the original page * reference count, and also a new count of how many pin_user_pages() calls were * made against the page. ("gup-pinned" is another term for the latter). * * With this scheme, pin_user_pages() becomes special: such pages are marked as * distinct from normal pages. As such, the unpin_user_page() call (and its * variants) must be used in order to release gup-pinned pages. * * Choice of value: * * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference * counts with respect to pin_user_pages() and unpin_user_page() becomes * simpler, due to the fact that adding an even power of two to the page * refcount has the effect of using only the upper N bits, for the code that * counts up using the bias value. This means that the lower bits are left for * the exclusive use of the original code that increments and decrements by one * (or at least, by much smaller values than the bias value). * * Of course, once the lower bits overflow into the upper bits (and this is * OK, because subtraction recovers the original values), then visual inspection * no longer suffices to directly view the separate counts. However, for normal * applications that don't have huge page reference counts, this won't be an * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. 
*/ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); void unpin_user_folio(struct folio *folio, unsigned long npages); void unpin_folios(struct folio **folios, unsigned long nfolios); /* True when VM_MAYWRITE is set and VM_SHARED is clear in @flags */ static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } #ifndef CONFIG_MMU /* True when either VM_MAYSHARE or VM_MAYOVERLAY is set in @flags */ static inline bool is_nommu_shared_mapping(vm_flags_t flags) { /* * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of * a file mapping. R/O MAP_PRIVATE mappings might still modify * underlying memory if ptrace is active, so this is only possible if * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later. */ return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } #endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif /* * The identification function is mainly used by the buddy allocator for * determining if two pages could be buddies. We are not really identifying * the zone since we could be using the section number id if we do not have * node id available in page flags. * We only guarantee that it will return the same value for two combinable * pages in a zone. 
*/ static inline int page_zone_id(struct page *page) { return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } /* Node id of @page: out of line when NODE_NOT_IN_PAGE_FLAGS is defined, otherwise extracted from page->flags */ #ifdef NODE_NOT_IN_PAGE_FLAGS int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif static inline int folio_nid(const struct folio *folio) { return page_to_nid(&folio->page); } #ifdef CONFIG_NUMA_BALANCING /* page access time bits need to hold at least 4 seconds */ #define PAGE_ACCESS_TIME_MIN_BITS 12 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS #define PAGE_ACCESS_TIME_BUCKETS \ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) #else #define PAGE_ACCESS_TIME_BUCKETS 0 #endif #define PAGE_ACCESS_TIME_MASK \ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) /* Pack @cpu and @pid into one cpupid value: masked cpu shifted up by LAST__PID_SHIFT, masked pid in the low bits */ static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } /* Extract the pid field of a cpupid */ static inline int cpupid_to_pid(int cpupid) { return cpupid & LAST__PID_MASK; } /* Extract the cpu field of a cpupid */ static inline int cpupid_to_cpu(int cpupid) { return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } static inline int cpupid_to_nid(int cpupid) { return cpu_to_node(cpupid_to_cpu(cpupid)); } /* A field counts as unset when it equals -1 masked to the field width */ static inline bool cpupid_pid_unset(int cpupid) { return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } static inline bool cpupid_cpu_unset(int cpupid) { return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } /* Compare only the LAST__PID_MASK bits of @task_pid against the cpupid's pid field */ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) { return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); } #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS /* cpupid kept in the dedicated _last_cpupid field */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int folio_last_cpupid(struct folio *folio) { return folio->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & 
LAST_CPUPID_MASK; } #else static inline int folio_last_cpupid(struct folio *folio) { return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ static inline int folio_xchg_access_time(struct folio *folio, int time) { int last_time; last_time = folio_xchg_last_cpupid(folio, time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { unsigned int pid_bit; pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return folio_nid(folio); /* XXX */ } static inline int folio_xchg_access_time(struct folio *folio, int time) { return 0; } static inline int folio_last_cpupid(struct folio *folio) { return folio_nid(folio); /* XXX */ } static inline int cpupid_to_nid(int cpupid) { return -1; } static inline int cpupid_to_pid(int cpupid) { return -1; } static inline int cpupid_to_cpu(int cpupid) { return -1; } static inline int cpu_pid_to_cpupid(int nid, int pid) { return -1; } static inline bool cpupid_pid_unset(int cpupid) { return true; } static inline void page_cpupid_reset_last(struct page *page) { } static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } static inline bool folio_use_access_time(struct folio *folio) { return false; } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* * KASAN 
per-page tags are stored xor'ed with 0xff. This allows to avoid
 * setting tags for all pages to native kernel tag value 0xff, as the default
 * value 0x00 maps to 0xff.
 */

/* Read the tag from page->flags; KASAN_TAG_KERNEL when KASAN is not enabled. */
static inline u8 page_kasan_tag(const struct page *page)
{
	u8 tag = KASAN_TAG_KERNEL;

	if (kasan_enabled()) {
		tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
		tag ^= 0xff;
	}

	return tag;
}

/*
 * Store @tag (xor'ed with 0xff) in the KASAN tag field of page->flags.
 * Does nothing when KASAN is not enabled.
 */
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
	unsigned long old_flags, flags;

	if (!kasan_enabled())
		return;

	tag ^= 0xff;
	old_flags = READ_ONCE(page->flags);
	/* Rebuild the flags word and retry until try_cmpxchg() succeeds. */
	do {
		flags = old_flags;
		flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
		flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
	} while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
}

/* Set the page's tag back to KASAN_TAG_KERNEL. */
static inline void page_kasan_tag_reset(struct page *page)
{
	if (kasan_enabled())
		page_kasan_tag_set(page, KASAN_TAG_KERNEL);
}

#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

/* Without tag-based KASAN the tag is always 0xff and setters are no-ops. */
static inline u8 page_kasan_tag(const struct page *page)
{
	return 0xff;
}

static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
static inline void page_kasan_tag_reset(struct page *page) { }

#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

/* Zone of @page: indexed by page_zonenum() within the page's node data. */
static inline struct zone *page_zone(const struct page *page)
{
	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}

/* Node data (pg_data_t) of the node @page belongs to. */
static inline pg_data_t *page_pgdat(const struct page *page)
{
	return NODE_DATA(page_to_nid(page));
}

/* Folio variants: both operate on the folio's first page. */
static inline struct zone *folio_zone(const struct folio *folio)
{
	return page_zone(&folio->page);
}

static inline pg_data_t *folio_pgdat(const struct folio *folio)
{
	return page_pgdat(&folio->page);
}

#ifdef SECTION_IN_PAGE_FLAGS
/* Replace the SECTIONS field of page->flags with @section (non-atomic). */
static inline void set_page_section(struct page *page, unsigned long section)
{
	page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}

/* Read the SECTIONS field of page->flags. */
static inline unsigned long page_to_section(const struct page *page)
{
	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}
#endif

/**
* folio_pfn - Return the Page Frame Number of a folio.
 * @folio: The folio.
 *
 * A folio may contain multiple pages. The pages have consecutive
 * Page Frame Numbers.
 *
 * Return: The Page Frame Number of the first page in the folio.
 */
static inline unsigned long folio_pfn(const struct folio *folio)
{
	return page_to_pfn(&folio->page);
}

/* Convert @pfn to its struct page, then to the folio returned by page_folio(). */
static inline struct folio *pfn_folio(unsigned long pfn)
{
	return page_folio(pfn_to_page(pfn));
}

#ifdef CONFIG_MMU
/* Build a PTE from the PFN of @page and the protection bits @pgprot. */
static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
{
	return pfn_pte(page_to_pfn(page), pgprot);
}

/**
 * folio_mk_pte - Create a PTE for this folio
 * @folio: The folio to create a PTE for
 * @pgprot: The page protection bits to use
 *
 * Create a page table entry for the first page of this folio.
 * This is suitable for passing to set_ptes().
 *
 * Return: A page table entry suitable for mapping this folio.
 */
static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot)
{
	return pfn_pte(folio_pfn(folio), pgprot);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/**
 * folio_mk_pmd - Create a PMD for this folio
 * @folio: The folio to create a PMD for
 * @pgprot: The page protection bits to use
 *
 * Create a page table entry for the first page of this folio.
 * This is suitable for passing to set_pmd_at().
 *
 * Return: A page table entry suitable for mapping this folio.
 */
static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
{
	return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
}
#endif
#endif /* CONFIG_MMU */

/*
 * With CONFIG_64BIT every large folio qualifies; otherwise only folios of
 * order greater than 1 do.
 */
static inline bool folio_has_pincount(const struct folio *folio)
{
	if (IS_ENABLED(CONFIG_64BIT))
		return folio_test_large(folio);
	return folio_order(folio) > 1;
}

/**
 * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
 * @folio: The folio.
 *
 * This function checks if a folio has been pinned via a call to
 * a function in the pin_user_pages() family.
*
 * For small folios, the return value is partially fuzzy: false is not fuzzy,
 * because it means "definitely not pinned for DMA", but true means "probably
 * pinned for DMA, but possibly a false positive due to having at least
 * GUP_PIN_COUNTING_BIAS worth of normal folio references".
 *
 * False positives are OK, because: a) it's unlikely for a folio to
 * get that many refcounts, and b) all the callers of this routine are
 * expected to be able to deal gracefully with a false positive.
 *
 * For most large folios, the result will be exactly correct. That's because
 * we have more tracking data available: the _pincount field is used
 * instead of the GUP_PIN_COUNTING_BIAS scheme.
 *
 * For more information, please see Documentation/core-api/pin_user_pages.rst.
 *
 * Return: True, if it is likely that the folio has been "dma-pinned".
 * False, if the folio is definitely not dma-pinned.
 */
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
	if (folio_has_pincount(folio))
		return atomic_read(&folio->_pincount) > 0;

	/*
	 * folio_ref_count() is signed. If that refcount overflows, then
	 * folio_ref_count() returns a negative value, and callers will avoid
	 * further incrementing the refcount.
	 *
	 * Here, for that overflow case, use the sign bit to count a little
	 * bit higher via unsigned math, and thus still get an accurate result.
	 */
	return ((unsigned int)folio_ref_count(folio)) >=
		GUP_PIN_COUNTING_BIAS;
}

/*
 * This should most likely only be called during fork() to see whether we
 * should break the cow immediately for an anon page on the src mm.
 *
 * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq.
 */
static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
					  struct folio *folio)
{
	/* Assert that write_protect_seq currently holds an odd value. */
	VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));

	/* If the mm never had MMF_HAS_PINNED set, no folio check is needed. */
	if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
		return false;

	return folio_maybe_dma_pinned(folio);
}

/**
 * is_zero_page - Query if a page is a zero page
 * @page: The page to query
 *
 * This returns true if @page is one of the permanent zero pages.
 */
static inline bool is_zero_page(const struct page *page)
{
	return is_zero_pfn(page_to_pfn(page));
}

/**
 * is_zero_folio - Query if a folio is a zero page
 * @folio: The folio to query
 *
 * This returns true if @folio is one of the permanent zero pages.
 */
static inline bool is_zero_folio(const struct folio *folio)
{
	return is_zero_page(&folio->page);
}

/* MIGRATE_CMA and ZONE_MOVABLE do not allow folios to be pinned long-term. */
#ifdef CONFIG_MIGRATION
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
#ifdef CONFIG_CMA
	int mt = folio_migratetype(folio);

	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
		return false;
#endif
	/* The zero page can be "pinned" but gets special handling. */
	if (is_zero_folio(folio))
		return true;

	/* Coherent device memory must always allow eviction. */
	if (folio_is_device_coherent(folio))
		return false;

	/*
	 * Filesystems can only tolerate transient delays to truncate and
	 * hole-punch operations.
	 */
	if (folio_is_fsdax(folio))
		return false;

	/* Otherwise, non-movable zone folios can be pinned. */
	return !folio_is_zone_movable(folio);

}
#else
/* Without CONFIG_MIGRATION every folio is reported as long-term pinnable. */
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
	return true;
}
#endif

/* Replace the ZONES field of page->flags with @zone (non-atomic). */
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}

/* Replace the NODES field of page->flags with @node (non-atomic). */
static inline void set_page_node(struct page *page, unsigned long node)
{
	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}

/*
 * Record zone and node in page->flags and, when SECTION_IN_PAGE_FLAGS is
 * defined, the section number derived from @pfn as well.
 */
static inline void set_page_links(struct page *page, enum zone_type zone,
	unsigned long node, unsigned long pfn)
{
	set_page_zone(page, zone);
	set_page_node(page, node);
#ifdef SECTION_IN_PAGE_FLAGS
	set_page_section(page, pfn_to_section_nr(pfn));
#endif
}

/**
 * folio_nr_pages - The number of pages in the folio.
 * @folio: The folio.
 *
 * Return: A positive power of two.
 */
static inline long folio_nr_pages(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return 1;
	return folio_large_nr_pages(folio);
}

/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#define MAX_FOLIO_NR_PAGES	(1UL << PUD_ORDER)
#else
#define MAX_FOLIO_NR_PAGES	MAX_ORDER_NR_PAGES
#endif

/*
 * compound_nr() returns the number of pages in this potentially compound
 * page. compound_nr() can be called on a tail page, and is defined to
 * return 1 in that case.
 */
static inline long compound_nr(struct page *page)
{
	struct folio *folio = (struct folio *)page;

	/* Anything without PG_head set in its flags counts as a single page. */
	if (!test_bit(PG_head, &folio->flags))
		return 1;
	return folio_large_nr_pages(folio);
}

/**
 * folio_next - Move to the next physical folio.
 * @folio: The folio we're currently operating on.
 *
 * If you have physically contiguous memory which may span more than
 * one folio (eg a &struct bio_vec), use this function to move from one
 * folio to the next. Do not use it if the memory is only virtually
 * contiguous as the folios are almost certainly not adjacent to each
 * other.
This is the folio equivalent to writing ``page++``.
 *
 * Context: We assume that the folios are refcounted and/or locked at a
 * higher level and do not adjust the reference counts.
 * Return: The next struct folio.
 */
static inline struct folio *folio_next(struct folio *folio)
{
	/* The page just past this folio's last page is treated as a folio. */
	return (struct folio *)folio_page(folio, folio_nr_pages(folio));
}

/**
 * folio_shift - The size of the memory described by this folio.
 * @folio: The folio.
 *
 * A folio represents a number of bytes which is a power-of-two in size.
 * This function tells you which power-of-two the folio is. See also
 * folio_size() and folio_order().
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split. It is not necessary for the folio to be locked.
 * Return: The base-2 logarithm of the size of this folio.
 */
static inline unsigned int folio_shift(const struct folio *folio)
{
	return PAGE_SHIFT + folio_order(folio);
}

/**
 * folio_size - The number of bytes in a folio.
 * @folio: The folio.
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split. It is not necessary for the folio to be locked.
 * Return: The number of bytes in this folio.
 */
static inline size_t folio_size(const struct folio *folio)
{
	return PAGE_SIZE << folio_order(folio);
}

/**
 * folio_maybe_mapped_shared - Whether the folio is mapped into the page
 *			       tables of more than one MM
 * @folio: The folio.
 *
 * This function checks if the folio may be currently mapped into more than one
 * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single
 * MM ("mapped exclusively").
 *
 * For KSM folios, this function also returns "mapped shared" when a folio is
 * mapped multiple times into the same MM, because the individual page mappings
 * are independent.
 *
 * For small anonymous folios and anonymous hugetlb folios, the return
 * value will be exactly correct: non-KSM folios can only be mapped at most once
 * into an MM, and they cannot be partially mapped. KSM folios are
 * considered shared even if mapped multiple times into the same MM.
 *
 * For other folios, the result can be fuzzy:
 *    #. For partially-mappable large folios (THP), the return value can wrongly
 *       indicate "mapped shared" (false positive) if a folio was mapped by
 *       more than two MMs at one point in time.
 *    #. For pagecache folios (including hugetlb), the return value can wrongly
 *       indicate "mapped shared" (false positive) when two VMAs in the same MM
 *       cover the same file range.
 *
 * Further, this function only considers current page table mappings that
 * are tracked using the folio mapcount(s).
 *
 * This function does not consider:
 *    #. If the folio might get mapped in the (near) future (e.g., swapcache,
 *       pagecache, temporary unmapping for migration).
 *    #. If the folio is mapped differently (VM_PFNMAP).
 *    #. If hugetlb page table sharing applies. Callers might want to check
 *       hugetlb_pmd_shared().
 *
 * Return: Whether the folio is estimated to be mapped into more than one MM.
 */
static inline bool folio_maybe_mapped_shared(struct folio *folio)
{
	int mapcount = folio_mapcount(folio);

	/* Only partially-mappable folios require more care. */
	if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
		return mapcount > 1;

	/*
	 * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ...
	 * simply assume "mapped shared", nobody should really care
	 * about this for arbitrary kernel allocations.
	 */
	if (!IS_ENABLED(CONFIG_MM_ID))
		return true;

	/*
	 * A single mapping implies "mapped exclusively", even if the
	 * folio flag says something different: it's easier to handle this
	 * case here instead of on the RMAP hot path.
	 */
	if (mapcount <= 1)
		return false;
	return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
}

/**
 * folio_expected_ref_count - calculate the expected folio refcount
 * @folio: the folio
 *
 * Calculate the expected folio refcount, taking references from the pagecache,
 * swapcache, PG_private and page table mappings into account.
Useful in
 * combination with folio_ref_count() to detect unexpected references (e.g.,
 * GUP or other temporary references).
 *
 * Does currently not consider references from the LRU cache. If the folio
 * was isolated from the LRU (which is the case during migration or split),
 * the LRU cache does not apply.
 *
 * Calling this function on an unmapped folio -- !folio_mapped() -- that is
 * locked will return a stable result.
 *
 * Calling this function on a mapped folio will not result in a stable result,
 * because nothing stops additional page table mappings from coming (e.g.,
 * fork()) or going (e.g., munmap()).
 *
 * Calling this function without the folio lock will also not result in a
 * stable result: for example, the folio might get dropped from the swapcache
 * concurrently.
 *
 * However, even when called without the folio lock or on a mapped folio,
 * this function can be used to detect unexpected references early (for example,
 * if it makes sense to even lock the folio and unmap it).
 *
 * The caller must add any reference (e.g., from folio_try_get()) it might be
 * holding itself to the result.
 *
 * Returns the expected folio refcount.
 */
static inline int folio_expected_ref_count(const struct folio *folio)
{
	const int order = folio_order(folio);
	int ref_count = 0;

	/* Slab folios are not handled here: warn once and report 0. */
	if (WARN_ON_ONCE(folio_test_slab(folio)))
		return 0;

	if (folio_test_anon(folio)) {
		/* One reference per page from the swapcache. */
		ref_count += folio_test_swapcache(folio) << order;
	} else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) {
		/* One reference per page from the pagecache. */
		ref_count += !!folio->mapping << order;
		/* One reference from PG_private. */
		ref_count += folio_test_private(folio);
	}

	/* One reference per page table mapping. */
	return ref_count + folio_mapcount(folio);
}

#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
/* Default used when the architecture does not provide its own: returns 0. */
static inline int arch_make_folio_accessible(struct folio *folio)
{
	return 0;
}
#endif

/*
 * Some inline functions in vmstat.h depend on page_zone()
 */
#include <linux/vmstat.h>

/*
 * page_address() comes in three flavours, selected below:
 *  - WANT_PAGE_VIRTUAL: the address is stored in page->virtual;
 *  - HASHED_PAGE_VIRTUAL (HIGHMEM without WANT_PAGE_VIRTUAL): out-of-line
 *    functions, only declared here;
 *  - neither: page_address() is lowmem_page_address() and the setter and
 *    init hooks expand to nothing.
 */
#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

#if defined(WANT_PAGE_VIRTUAL)
static inline void *page_address(const struct page *page)
{
	return page->virtual;
}
static inline void set_page_address(struct page *page, void *address)
{
	page->virtual = address;
}
#define page_address_init()  do { } while(0)
#endif

#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(const struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

/* Address of @page as computed by page_to_virt(). */
static __always_inline void *lowmem_page_address(const struct page *page)
{
	return page_to_virt(page);
}

#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address)  do { } while(0)
#define page_address_init()  do { } while(0)
#endif

/* page_address() of the folio's first page. */
static inline void *folio_address(const struct folio *folio)
{
	return page_address(&folio->page);
}

/*
 * Return true only if the page has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool page_is_pfmemalloc(const struct page *page)
{
	/*
	 * lru.next has bit 1 set if the page is allocated from the
	 * pfmemalloc reserves.  Callers may simply overwrite it if
	 * they do not need to preserve that information.
	 */
	return (uintptr_t)page->lru.next & BIT(1);
}

/*
 * Return true only if the folio has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool folio_is_pfmemalloc(const struct folio *folio)
{
	/*
	 * lru.next has bit 1 set if the page is allocated from the
	 * pfmemalloc reserves.  Callers may simply overwrite it if
	 * they do not need to preserve that information.
	 */
	return (uintptr_t)folio->lru.next & BIT(1);
}

/*
 * Only to be called by the page allocator on a freshly allocated
 * page.
 */
static inline void set_page_pfmemalloc(struct page *page)
{
	/* Overwrites lru.next entirely, leaving only bit 1 set. */
	page->lru.next = (void *)BIT(1);
}

/* Clear the pfmemalloc marker by resetting lru.next to NULL. */
static inline void clear_page_pfmemalloc(struct page *page)
{
	page->lru.next = NULL;
}

/*
 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
 */
extern void pagefault_out_of_memory(void);

/* Byte offset of @p within its page, respectively within @folio. */
#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))

/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 */
struct zap_details {
	struct folio *single_folio;	/* Locked folio to be unmapped */
	bool even_cows;			/* Zap COWed private pages too? */
	bool reclaim_pt;		/* Need reclaim page tables? */
	zap_flags_t zap_flags;		/* Extra flags for zapping */
};

/*
 * Whether to drop the pte markers, for example, the uffd-wp information for
 * file-backed memory.  This should only be specified when we will completely
 * drop the page in the mm, either by truncation or unmapping of the vma.  By
 * default, the flag is not set.
 */
#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
/* Set in unmap_vmas() to indicate a final unmap call.
Only used by hugetlb */
#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))

#ifdef CONFIG_SCHED_MM_CID
void sched_mm_cid_before_execve(struct task_struct *t);
void sched_mm_cid_after_execve(struct task_struct *t);
void sched_mm_cid_fork(struct task_struct *t);
void sched_mm_cid_exit_signals(struct task_struct *t);
/* Return the mm_cid value stored in the task. */
static inline int task_mm_cid(struct task_struct *t)
{
	return t->mm_cid;
}
#else
/* Without CONFIG_SCHED_MM_CID the hooks are empty. */
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
static inline void sched_mm_cid_fork(struct task_struct *t) { }
static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
static inline int task_mm_cid(struct task_struct *t)
{
	/*
	 * Use the processor id as a fall-back when the mm cid feature is
	 * disabled. This provides functional per-cpu data structure accesses
	 * in user-space, although it won't provide the memory usage benefits.
	 */
	return raw_smp_processor_id();
}
#endif

#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
/* Without an MMU, can_do_mlock() always reports false. */
static inline bool can_do_mlock(void) { return false; }
#endif
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);

struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
			     pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			     pte_t pte);
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
				  unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
				pmd_t pmd);

void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		  unsigned long size);
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
			   unsigned long size, struct zap_details *details);
/* Zap the whole range [vm_start, vm_end) of @vma, with no zap_details. */
static inline void zap_vma_pages(struct vm_area_struct *vma)
{
	zap_page_range_single(vma, vma->vm_start,
			      vma->vm_end - vma->vm_start, NULL);
}
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
		struct vm_area_struct *start_vma, unsigned long start,
		unsigned long end, unsigned long tree_end, bool mm_wr_locked);

struct mmu_notifier_range;

void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
		unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write);

/* Argument block shared by follow_pfnmap_start() and follow_pfnmap_end(). */
struct follow_pfnmap_args {
	/**
	 * Inputs:
	 * @vma: Pointer to @vm_area_struct struct
	 * @address: the virtual address to walk
	 */
	struct vm_area_struct *vma;
	unsigned long address;
	/**
	 * Internals:
	 *
	 * The caller shouldn't touch any of these.
	 */
	spinlock_t *lock;
	pte_t *ptep;
	/**
	 * Outputs:
	 *
	 * @pfn: the PFN of the address
	 * @addr_mask: address mask covering pfn
	 * @pgprot: the pgprot_t of the mapping
	 * @writable: whether the mapping is writable
	 * @special: whether the mapping is a special mapping (real PFN maps)
	 */
	unsigned long pfn;
	unsigned long addr_mask;
	pgprot_t pgprot;
	bool writable;
	bool special;
};
int follow_pfnmap_start(struct follow_pfnmap_args *args);
void follow_pfnmap_end(struct follow_pfnmap_args *args);

extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
int generic_error_remove_folio(struct address_space *mapping,
		struct folio *folio);

struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
		unsigned long address, struct pt_regs *regs);

#ifdef CONFIG_MMU
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
				  unsigned long address, unsigned int flags,
				  struct pt_regs *regs);
extern int fixup_user_fault(struct mm_struct *mm,
			    unsigned long address, unsigned int fault_flags,
			    bool *unlocked);
void unmap_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows);
#else
/* !CONFIG_MMU: the fault entry points BUG(); the unmap helpers are no-ops. */
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
					 unsigned long address, unsigned int flags,
					 struct pt_regs *regs)
{
	/* should never happen if there's no MMU */
	BUG();
	return VM_FAULT_SIGBUS;
}
static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
		unsigned int fault_flags, bool *unlocked)
{
	/* should never happen if there's no MMU */
	BUG();
	return -EFAULT;
}
static inline void unmap_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif

/* Same as unmap_mapping_range() with even_cows passed as 0. */
static inline void unmap_shared_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen)
{
	unmap_mapping_range(mapping, holebegin, holelen, 0);
}

/* Forward declaration; used by get_user_page_vma_remote(). */
static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm,
						unsigned long addr);

extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
		void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, unsigned int gup_flags);

#ifdef CONFIG_BPF_SYSCALL
extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
			      void *buf, int len, unsigned int gup_flags);
#endif

long get_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked);

/*
 * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
*/
static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
						    unsigned long addr,
						    int gup_flags,
						    struct vm_area_struct **vmap)
{
	struct page *page;
	struct vm_area_struct *vma;
	int got;

	/* FOLL_NOWAIT is rejected with a one-time warning and -EINVAL. */
	if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
		return ERR_PTR(-EINVAL);

	got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);

	/*
	 * NOTE(review): only negative returns are treated as failure. This
	 * presumably relies on get_user_pages_remote() never returning 0 for
	 * nr_pages == 1, since @page would otherwise be used uninitialised
	 * below -- confirm against the GUP implementation.
	 */
	if (got < 0)
		return ERR_PTR(got);

	vma = vma_lookup(mm, addr);
	if (WARN_ON_ONCE(!vma)) {
		/* Drop the reference on the page before failing. */
		put_page(page);
		return ERR_PTR(-EINVAL);
	}

	*vmap = vma;
	return page;
}

long get_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages);
long pin_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
		    struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
		    struct page **pages, unsigned int gup_flags);
long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
		      struct folio **folios, unsigned int max_folios,
		      pgoff_t *offset);
int folio_add_pins(struct folio *folio, unsigned int pins);

int get_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages);
void folio_add_pin(struct folio *folio);

int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			struct task_struct *task, bool bypass_rlim);

struct kvec;
struct page *get_dump_page(unsigned long addr, int *locked);

bool folio_mark_dirty(struct folio *folio);
bool folio_mark_dirty_lock(struct folio *folio);
bool set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);

int get_cmdline(struct task_struct *task, char *buffer, int buflen);

/*
 * Flags used by change_protection().  For now we make it a bitmap so
 * that we can pass in multiple flags just like parameters.  However,
 * for now all the callers only use one of the flags at a time.
 */
/*
 * Whether we should manually check if we can map individual PTEs writable,
 * because something (e.g., COW, uffd-wp) blocks that from happening for all
 * PTEs automatically in a writable mapping.
 */
#define  MM_CP_TRY_CHANGE_WRITABLE	   (1UL << 0)
/* Whether this protection change is for NUMA hints */
#define  MM_CP_PROT_NUMA                   (1UL << 1)
/* Whether this change is for write protecting */
#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
					    MM_CP_UFFD_WP_RESOLVE)

bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
			     pte_t pte);
extern long change_protection(struct mmu_gather *tlb,
			      struct vm_area_struct *vma, unsigned long start,
			      unsigned long end, unsigned long cp_flags);
extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
	  struct vm_area_struct *vma, struct vm_area_struct **pprev,
	  unsigned long start, unsigned long end, unsigned long newflags);

/*
 * get_user_pages_fast_only() doesn't attempt to fault and will return short.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
			     unsigned int gup_flags, struct page **pages);

/* Single-page wrapper: true only if exactly one page was obtained. */
static inline bool get_user_page_fast_only(unsigned long addr,
			unsigned int gup_flags, struct page **pagep)
{
	return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1;
}
/*
 * per-process(per-mm_struct) statistics.
*/
/* Read counter @member of mm->rss_stat via percpu_counter_read_positive(). */
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
	return percpu_counter_read_positive(&mm->rss_stat[member]);
}

void mm_trace_rss_stat(struct mm_struct *mm, int member);

/* Each updater below adjusts the counter and then calls mm_trace_rss_stat(). */
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
	percpu_counter_add(&mm->rss_stat[member], value);

	mm_trace_rss_stat(mm, member);
}

static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
	percpu_counter_inc(&mm->rss_stat[member]);

	mm_trace_rss_stat(mm, member);
}

static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
	percpu_counter_dec(&mm->rss_stat[member]);

	mm_trace_rss_stat(mm, member);
}

/* Optimized variant when folio is already known not to be anon */
static inline int mm_counter_file(struct folio *folio)
{
	if (folio_test_swapbacked(folio))
		return MM_SHMEMPAGES;
	return MM_FILEPAGES;
}

/* Pick the rss counter index for @folio: anon, shmem or file. */
static inline int mm_counter(struct folio *folio)
{
	if (folio_test_anon(folio))
		return MM_ANONPAGES;
	return mm_counter_file(folio);
}

/* Sum of the file, anon and shmem counters. */
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
	return get_mm_counter(mm, MM_FILEPAGES) +
		get_mm_counter(mm, MM_ANONPAGES) +
		get_mm_counter(mm, MM_SHMEMPAGES);
}

/* Larger of the recorded high-water mark and the current value. */
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
	return max(mm->hiwater_rss, get_mm_rss(mm));
}

static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
{
	return max(mm->hiwater_vm, mm->total_vm);
}

/* Raise hiwater_rss if the current rss exceeds it. */
static inline void update_hiwater_rss(struct mm_struct *mm)
{
	unsigned long _rss = get_mm_rss(mm);

	/* The read is wrapped in data_race(), marking it as knowingly racy. */
	if (data_race(mm->hiwater_rss) < _rss)
		(mm)->hiwater_rss = _rss;
}

/* Raise hiwater_vm if total_vm exceeds it. */
static inline void update_hiwater_vm(struct mm_struct *mm)
{
	if (mm->hiwater_vm < mm->total_vm)
		mm->hiwater_vm = mm->total_vm;
}

/* Set hiwater_rss to the current rss. */
static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
{
	mm->hiwater_rss = get_mm_rss(mm);
}

/* Raise *@maxrss to this mm's high-water rss if that is larger. */
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
					 struct mm_struct *mm)
{
	unsigned long hiwater_rss = get_mm_hiwater_rss(mm);

	if (*maxrss < hiwater_rss)
		*maxrss = hiwater_rss;
}

/*
 * Fallbacks used when the corresponding config option is not set: the
 * *_special()/pte_devmap() tests report "no" and the *_mkspecial() helpers
 * return their argument unchanged.
 */
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
	return 0;
}

static inline pte_t pte_mkspecial(pte_t pte)
{
	return pte;
}
#endif

#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
static inline bool pmd_special(pmd_t pmd)
{
	return false;
}

static inline pmd_t pmd_mkspecial(pmd_t pmd)
{
	return pmd;
}
#endif	/* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */

#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
static inline bool pud_special(pud_t pud)
{
	return false;
}

static inline pud_t pud_mkspecial(pud_t pud)
{
	return pud;
}
#endif	/* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */

#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
	return 0;
}
#endif

extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
			       spinlock_t **ptl);
/*
 * Wrapper around __get_locked_pte() that routes the call through
 * __cond_lock() on *@ptl and returns the resulting pointer.
 */
static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
				    spinlock_t **ptl)
{
	pte_t *ptep;
	__cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
	return ptep;
}

/* With a folded P4D level the allocation is a no-op returning 0. */
#ifdef __PAGETABLE_P4D_FOLDED
static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
						unsigned long address)
{
	return 0;
}
#else
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
#endif

#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
/* Folded PUD level or no MMU: nothing to allocate, nothing to account. */
static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
						unsigned long address)
{
	return 0;
}
static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
static inline void mm_dec_nr_puds(struct mm_struct *mm) {}

#else
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);

/* Account one PUD table's bytes in mm->pgtables_bytes unless PUD is folded. */
static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
	if (mm_pud_folded(mm))
		return;
	atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
}

static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
	if (mm_pud_folded(mm))
		return;
	atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
}
#endif

#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
						unsigned long address)
{
	return 0;
}
/* Stubs for the branch selected by the preceding #if: PMD tables are not accounted. */
static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}

#else
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);

/*
 * Add the size of one PMD table (PTRS_PER_PMD entries of pmd_t) to
 * mm->pgtables_bytes. Does nothing when mm_pmd_folded(mm) is true.
 */
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
	if (mm_pmd_folded(mm))
		return;
	atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
}

/*
 * Subtract the size of one PMD table from mm->pgtables_bytes.
 * Does nothing when mm_pmd_folded(mm) is true.
 */
static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
	if (mm_pmd_folded(mm))
		return;
	atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
}
#endif

#ifdef CONFIG_MMU
/* Reset the per-mm page-table byte counter to zero. */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
	atomic_long_set(&mm->pgtables_bytes, 0);
}

/* Return the current value of the per-mm page-table byte counter. */
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
	return atomic_long_read(&mm->pgtables_bytes);
}

/* Add the size of one PTE table (PTRS_PER_PTE entries of pte_t) to the counter. */
static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
	atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}

/* Subtract the size of one PTE table from the counter. */
static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
	atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}
#else

/* !CONFIG_MMU: no counter is maintained; the getter always reports 0. */
static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
	return 0;
}

static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
#endif

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
int __pte_alloc_kernel(pmd_t *pmd);

#if defined(CONFIG_MMU)

/*
 * Return the p4d entry for @address under @pgd. If *pgd is none,
 * __p4d_alloc() is called first; a non-zero result from it yields NULL.
 */
static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
		unsigned long address)
{
	return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ?
		NULL : p4d_offset(pgd, address);
}

/*
 * Return the pud entry for @address under @p4d. If *p4d is none,
 * __pud_alloc() is called first; a non-zero result from it yields NULL.
 */
static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
		unsigned long address)
{
	return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ?
		NULL : pud_offset(p4d, address);
}

/*
 * Same pattern one level down, using pud_none() and __pmd_alloc().
 * The return expression continues on the next source line.
 */
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
NULL: pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

/* Map a kernel virtual address to the ptdesc of the page containing it. */
static inline struct ptdesc *virt_to_ptdesc(const void *x)
{
	return page_ptdesc(virt_to_page(x));
}

/* Return the virtual address of the page behind @pt (via page_to_virt()). */
static inline void *ptdesc_to_virt(const struct ptdesc *pt)
{
	return page_to_virt(ptdesc_page(pt));
}

/* Return the address of the folio behind @pt (via folio_address()). */
static inline void *ptdesc_address(const struct ptdesc *pt)
{
	return folio_address(ptdesc_folio(pt));
}

/* Report whether the folio behind @pt has the reserved flag set. */
static inline bool pagetable_is_reserved(struct ptdesc *pt)
{
	return folio_test_reserved(ptdesc_folio(pt));
}

/**
 * pagetable_alloc - Allocate pagetables
 * @gfp:	GFP flags
 * @order:	desired pagetable order
 *
 * pagetable_alloc allocates memory for page tables as well as a page table
 * descriptor to describe that memory. The allocation is always requested
 * with __GFP_COMP added to @gfp.
 *
 * Return: The ptdesc describing the allocated page tables.
 */
static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
{
	struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);

	/* No NULL check here: the result of alloc_pages_noprof() is converted as-is. */
	return page_ptdesc(page);
}
/* Public entry point: routes the _noprof variant through alloc_hooks(). */
#define pagetable_alloc(...)	alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))

/**
 * pagetable_free - Free pagetables
 * @pt:	The page table descriptor
 *
 * pagetable_free frees the memory of all page tables described by a page
 * table descriptor and the memory for the descriptor itself.
*/
static inline void pagetable_free(struct ptdesc *pt)
{
	struct page *page = ptdesc_page(pt);

	/* The order is taken from the page itself via compound_order(). */
	__free_pages(page, compound_order(page));
}

#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
bool ptlock_alloc(struct ptdesc *ptdesc);
void ptlock_free(struct ptdesc *ptdesc);

/* In this variant ptdesc->ptl is itself the spinlock pointer. */
static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
	return ptdesc->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
/* The lock is embedded in the ptdesc: nothing to set up, allocate or free. */
static inline void ptlock_cache_init(void)
{
}

static inline bool ptlock_alloc(struct ptdesc *ptdesc)
{
	return true;
}

static inline void ptlock_free(struct ptdesc *ptdesc)
{
}

/* In this variant the address of the embedded ptdesc->ptl is returned. */
static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
	return &ptdesc->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */

/* Lock of the page table that the entry *pmd refers to; @mm is unused here. */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
}

/* Lock of the page table containing @pte, looked up by virtual address. */
static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
{
	/* Build fails with CONFIG_HIGHPTE, or if a PTE table exceeds PAGE_SIZE. */
	BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE));
	BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE);
	return ptlock_ptr(virt_to_ptdesc(pte));
}

/*
 * Initialise the split lock of @ptdesc.
 * Returns false if ptlock_alloc() fails, true otherwise.
 */
static inline bool ptlock_init(struct ptdesc *ptdesc)
{
	/*
	 * prep_new_page() initializes page->private (and therefore page->ptl)
	 * with 0. Make sure nobody started using it in between.
	 *
	 * That can happen if an arch tries to use slab for page table
	 * allocation: slab code uses page->slab_cache, which shares storage
	 * with page->ptl.
	 */
	VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));
	if (!ptlock_alloc(ptdesc))
		return false;
	spin_lock_init(ptlock_ptr(ptdesc));
	return true;
}

#else	/* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
*/
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	return &mm->page_table_lock;
}
static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
{
	return &mm->page_table_lock;
}
/* No per-table lock in this configuration: init always succeeds, free is a no-op. */
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */

/*
 * Common constructor step: flag the folio as a page table and add it to
 * the NR_PAGETABLE lruvec statistic.
 */
static inline void __pagetable_ctor(struct ptdesc *ptdesc)
{
	struct folio *folio = ptdesc_folio(ptdesc);

	__folio_set_pgtable(folio);
	lruvec_stat_add_folio(folio, NR_PAGETABLE);
}

/*
 * Undo the constructor: release the ptlock, clear the page-table flag and
 * remove the folio from the NR_PAGETABLE statistic. Does not free memory.
 */
static inline void pagetable_dtor(struct ptdesc *ptdesc)
{
	struct folio *folio = ptdesc_folio(ptdesc);

	ptlock_free(ptdesc);
	__folio_clear_pgtable(folio);
	lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}

/* Destructor followed by pagetable_free() of the same descriptor. */
static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
{
	pagetable_dtor(ptdesc);
	pagetable_free(ptdesc);
}

/*
 * Construct a PTE-level table. ptlock_init() is skipped when @mm is
 * &init_mm. Returns false only when ptlock_init() fails, and then the
 * common constructor step is not run.
 */
static inline bool pagetable_pte_ctor(struct mm_struct *mm,
				      struct ptdesc *ptdesc)
{
	if (mm != &init_mm && !ptlock_init(ptdesc))
		return false;
	__pagetable_ctor(ptdesc);
	return true;
}

pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
/* Wrapper that places the ___pte_offset_map() call inside __cond_lock(RCU, ...). */
static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr,
			pmd_t *pmdvalp)
{
	pte_t *pte;

	__cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp));
	return pte;
}
/* __pte_offset_map() with NULL passed as @pmdvalp. */
static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
{
	return __pte_offset_map(pmd, addr, NULL);
}

pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, spinlock_t **ptlp);
/* Wrapper nesting the call in __cond_lock() for both RCU and *ptlp. */
static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, spinlock_t **ptlp)
{
	pte_t *pte;

	__cond_lock(RCU, __cond_lock(*ptlp,
			pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)));
	return pte;
}

pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
				unsigned long addr, spinlock_t **ptlp);
pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
				unsigned long
addr, pmd_t *pmdvalp, spinlock_t **ptlp);

/* Release @ptl first, then unmap @pte. */
#define pte_unmap_unlock(pte, ptl)	do {		\
	spin_unlock(ptl);				\
	pte_unmap(pte);					\
} while (0)

/* Non-zero only when *pmd is none and __pte_alloc() returned non-zero. */
#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

/* NULL when pte_alloc() is non-zero, otherwise the mapped PTE for @address. */
#define pte_alloc_map(mm, pmd, address)			\
	(pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))

/* As pte_alloc_map(), but through pte_offset_map_lock(), which also takes @ptlp. */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
	(pte_alloc(mm, pmd) ?			\
		 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))

/* Kernel-table variant built on __pte_alloc_kernel() and pte_offset_kernel(). */
#define pte_alloc_kernel(pmd, address)			\
	((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
		NULL: pte_offset_kernel(pmd, address))

#if defined(CONFIG_SPLIT_PMD_PTLOCKS)

/*
 * Return the page holding the PMD table that @pmd points into. The mask
 * clears the address bits below PTRS_PER_PMD * sizeof(pmd_t); this assumes
 * the table size is a power of two.
 */
static inline struct page *pmd_pgtable_page(pmd_t *pmd)
{
	unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
	return virt_to_page((void *)((unsigned long) pmd & mask));
}

/* ptdesc of the PMD table that @pmd points into. */
static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
{
	return page_ptdesc(pmd_pgtable_page(pmd));
}

/* Split-lock variant: the lock belongs to the PMD table's ptdesc; @mm is unused. */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	return ptlock_ptr(pmd_ptdesc(pmd));
}

/*
 * Initialise the PMD table lock. With CONFIG_TRANSPARENT_HUGEPAGE the
 * descriptor's pmd_huge_pte field is cleared first.
 */
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	ptdesc->pmd_huge_pte = NULL;
#endif
	return ptlock_init(ptdesc);
}

/* pmd_huge_pte is stored per PMD table here; the @mm argument is not used. */
#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)

#else

/* Without split PMD locks everything falls back to mm->page_table_lock. */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	return &mm->page_table_lock;
}

static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }

/* pmd_huge_pte is stored per mm here; the @pmd argument is not used. */
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)

#endif

/* Take the lock returned by pmd_lockptr() and return it to the caller. */
static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl = pmd_lockptr(mm, pmd);
	spin_lock(ptl);
	return ptl;
}

/*
 * Construct a PMD-level table. pmd_ptlock_init() is skipped when @mm is
 * &init_mm. Returns false only when pmd_ptlock_init() fails, and then
 * neither ptdesc_pmd_pts_init() nor the common constructor step is run.
 */
static inline bool pagetable_pmd_ctor(struct mm_struct *mm,
				      struct ptdesc *ptdesc)
{
	if (mm != &init_mm && !pmd_ptlock_init(ptdesc))
		return false;
	ptdesc_pmd_pts_init(ptdesc);
	__pagetable_ctor(ptdesc);
	return true;
}

/*
 * No scalability reason to split PUD locks yet, but follow the same pattern
 * as the PMD locks to make it easier if we decide to.
The VM should not be
 * considered ready to switch to split PUD locks yet; there may be places
 * which need to be converted from page_table_lock.
 */
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
	return &mm->page_table_lock;
}

/* Take the lock returned by pud_lockptr() and return it to the caller. */
static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
{
	spinlock_t *ptl = pud_lockptr(mm, pud);

	spin_lock(ptl);
	return ptl;
}

/* PUD, P4D and PGD tables only need the common constructor step. */
static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
{
	__pagetable_ctor(ptdesc);
}

static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc)
{
	__pagetable_ctor(ptdesc);
}

static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc)
{
	__pagetable_ctor(ptdesc);
}

extern void __init pagecache_init(void);
extern void free_initmem(void);

/*
 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
 * into the buddy system. The freed pages will be poisoned with pattern
 * "poison" if it's within range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 */
extern unsigned long free_reserved_area(void *start, void *end,
					int poison, const char *s);

extern void adjust_managed_page_count(struct page *page, long count);

extern void reserve_bootmem_region(phys_addr_t start,
				   phys_addr_t end, int nid);

/* Free the reserved page into the buddy system, so it gets managed. */
void free_reserved_page(struct page *page);

/* Set the reserved flag on @page and lower the managed page count by one. */
static inline void mark_page_reserved(struct page *page)
{
	SetPageReserved(page);
	adjust_managed_page_count(page, -1);
}

/* free_reserved_page() for the page behind a page-table descriptor. */
static inline void free_reserved_ptdesc(struct ptdesc *pt)
{
	free_reserved_page(ptdesc_page(pt));
}

/*
 * Default method to free all the __init memory into the buddy system.
 * The freed pages will be poisoned with pattern "poison" if it's within
 * range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
*/
static inline unsigned long free_initmem_default(int poison)
{
	/* Bounds of the region handed to free_reserved_area(); defined elsewhere. */
	extern char __init_begin[], __init_end[];

	return free_reserved_area(&__init_begin, &__init_end,
				  poison, "unused kernel image (initmem)");
}

/* Sum node_present_pages() over every online node. */
static inline unsigned long get_num_physpages(void)
{
	int nid;
	unsigned long phys_pages = 0;

	for_each_online_node(nid)
		phys_pages += node_present_pages(nid);

	return phys_pages;
}

/*
 * Using memblock node mappings, an architecture may initialise its
 * zones, allocate the backing mem_map and account for memory holes in an
 * architecture independent manner.
 *
 * An architecture is expected to register the range of page frames backed
 * by physical memory with memblock_add[_node]() before calling
 * free_area_init(), passing in the PFN each zone ends at. In its most
 * basic usage, an architecture is expected to do something like
 *
 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
 *							 max_highmem_pfn};
 * for_each_valid_physical_page_range()
 *	memblock_add_node(base, size, nid, MEMBLOCK_NONE)
 * free_area_init(max_zone_pfns);
 */
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
						unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
			unsigned long *start_pfn, unsigned long *end_pfn);

#ifndef CONFIG_NUMA
/* Without CONFIG_NUMA every pfn is reported as belonging to node 0. */
static inline int early_pfn_to_nid(unsigned long pfn)
{
	return 0;
}
#else
/* please see mm/page_alloc.c */
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif

extern void mem_init(void);
extern void __init mmap_init(void);

extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
/* __show_mem() with flags 0, a NULL nodemask and the highest zone index. */
static inline void show_mem(void)
{
	__show_mem(0, NULL, MAX_NR_ZONES - 1);
}
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);

extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);

extern
void setup_per_cpu_pageset(void);

/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);

/* interval_tree.c */
void vma_interval_tree_insert(struct vm_area_struct *node,
			      struct rb_root_cached *root);
void vma_interval_tree_insert_after(struct vm_area_struct *node,
				    struct vm_area_struct *prev,
				    struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
			      struct rb_root_cached *root);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
				unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
				unsigned long start, unsigned long last);

/*
 * Iterate @vma from vma_interval_tree_iter_first() through successive
 * vma_interval_tree_iter_next() results until NULL. @start and @last are
 * re-evaluated on every step.
 */
#define vma_interval_tree_foreach(vma, root, start, last)		\
	for (vma = vma_interval_tree_iter_first(root, start, last);	\
	     vma; vma = vma_interval_tree_iter_next(vma, start, last))

void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
				   struct rb_root_cached *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
				   struct rb_root_cached *root);
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
				  unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
	struct anon_vma_chain *node, unsigned long start, unsigned long last);
#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
#endif

/* Same iteration pattern as vma_interval_tree_foreach(), over anon_vma_chain nodes. */
#define anon_vma_interval_tree_foreach(avc, root, start, last)		 \
	for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
	     avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))

/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
				 unsigned long addr, bool write);

static inline int
check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, unsigned long end_data, unsigned long start_data) { if (rlim < RLIM_INFINITY) { if (((new - start) + (end_data - start_data)) > rlim) return -ENOSPC; } return 0; } extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm); extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct 
*vma, struct mm_struct *mm, unsigned long start,
		    unsigned long end, struct list_head *uf, bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
		     struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);

#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,
			 int ignore_errors);
/* Call __mm_populate() with ignore_errors set to 1 and discard its result. */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
	/* Ignore errors */
	(void) __mm_populate(addr, len, 1);
}
#else
/* !CONFIG_MMU: populating is a no-op. */
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif

/* This takes the mm semaphore itself */
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

/*
 * Argument block for vm_unmapped_area(). The field semantics are defined by
 * that function's implementation, which is not part of this header.
 */
struct vm_unmapped_area_info {
/* Flag value for the 'flags' field below. */
#define VM_UNMAPPED_AREA_TOPDOWN 1
	unsigned long flags;
	unsigned long length;
	unsigned long low_limit;
	unsigned long high_limit;
	unsigned long align_mask;
	unsigned long align_offset;
	unsigned long start_gap;
};

extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);

/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
				       loff_t lstart, loff_t lend);
extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
		pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);

extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned
long addr); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); /* * Look up the first VMA which intersects the interval [start_addr, end_addr) * NULL if none. Assume start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address * @mm: The process address space. * @addr: The user address. * * Return: The vm_area_struct at the given address, %NULL otherwise. */ static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { return mtree_load(&mm->mm_mt, addr); } static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; /* See reasoning around the VM_SHADOW_STACK definition */ if (vma->vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; vm_start -= gap; if (vm_start > vma->vm_start) vm_start = 0; return vm_start; } static inline unsigned long vm_end_gap(struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; if (vma->vm_flags & VM_GROWSUP) { vm_end += stack_guard_gap; if (vm_end < vma->vm_end) vm_end = -PAGE_SIZE; } return vm_end; } static inline unsigned long vma_pages(struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } /* Look up the first VMA which exactly match the interval vm_start ... 
vm_end */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
				unsigned long vm_start, unsigned long vm_end)
{
	struct vm_area_struct *vma = vma_lookup(mm, vm_start);

	/* A VMA found at vm_start only counts if both of its bounds match exactly. */
	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
		vma = NULL;

	return vma;
}

/* True when @vma is non-NULL and [start, end) lies within [vm_start, vm_end]. */
static inline bool range_in_vma(struct vm_area_struct *vma,
				unsigned long start, unsigned long end)
{
	return (vma && vma->vm_start <= start && end <= vma->vm_end);
}

#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
#else
/* !CONFIG_MMU: the protection value is always __pgprot(0). */
static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
	return __pgprot(0);
}
static inline void vma_set_page_prot(struct vm_area_struct *vma)
{
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
}
#endif

void vma_set_file(struct vm_area_struct *vma, struct file *file);

#ifdef CONFIG_NUMA_BALANCING
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long start, unsigned long end);
#endif

struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
		unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
			unsigned long pfn, unsigned long size, pgprot_t);
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
		unsigned long pfn, unsigned long size, pgprot_t prot);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
			struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num);
vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
			bool write);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot);
vm_fault_t
vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { int err = vm_insert_page(vma, addr, page); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } #ifndef io_remap_pfn_range static inline int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); } #endif static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) return VM_FAULT_OOM; else if (err == -EHWPOISON) return VM_FAULT_HWPOISON; return VM_FAULT_SIGBUS; } /* * Convert errno to return value for ->page_mkwrite() calls. * * This should eventually be merged with vmf_error() above, but will need a * careful audit of all vmf_error() callers. */ static inline vm_fault_t vmf_fs_error(int err) { if (err == 0) return VM_FAULT_LOCKED; if (err == -EFAULT || err == -EAGAIN) return VM_FAULT_NOPAGE; if (err == -ENOMEM) return VM_FAULT_OOM; /* -ENOSPC, -EDQUOT, -EIO ... */ return VM_FAULT_SIGBUS; } static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) return -ENOMEM; if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT; if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; return 0; } /* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. 
*/
static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
					   unsigned int flags)
{
	/*
	 * If callers don't want to honor NUMA hinting faults, no need to
	 * determine if we would actually have to trigger a NUMA hinting fault.
	 */
	if (!(flags & FOLL_HONOR_NUMA_FAULT))
		return true;

	/*
	 * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs.
	 *
	 * Requiring a fault here even for inaccessible VMAs would mean that
	 * FOLL_FORCE cannot make any progress, because handle_mm_fault()
	 * refuses to process NUMA hinting faults in inaccessible VMAs.
	 */
	return !vma_is_accessible(vma);
}

/* Per-PTE callback type taken by the apply_to_*_page_range() functions below. */
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
			       unsigned long size, pte_fn_t fn, void *data);
extern int apply_to_existing_page_range(struct mm_struct *mm,
				   unsigned long address, unsigned long size,
				   pte_fn_t fn, void *data);

#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
extern bool _page_poisoning_enabled_early;
DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled);
/* Reads the plain boolean rather than the static key. */
static inline bool page_poisoning_enabled(void)
{
	return _page_poisoning_enabled_early;
}
/*
 * For use in fast paths after mem_debugging_and_hardening_init() has run,
 * or when a false negative result is not harmful when called too early.
*/ static inline bool page_poisoning_enabled_static(void) { return static_branch_unlikely(&_page_poisoning_enabled); } static inline void kernel_poison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_poison_pages(page, numpages); } static inline void kernel_unpoison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_unpoison_pages(page, numpages); } #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); static inline bool want_init_on_alloc(gfp_t flags) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) return true; return flags & __GFP_ZERO; } DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, &init_on_free); } extern bool _debug_pagealloc_enabled_early; DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); static inline bool debug_pagealloc_enabled(void) { return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled_early; } /* * For use in fast paths after mem_debugging_and_hardening_init() has run, * or when a false negative result is not harmful when called too early. 
*/
static inline bool debug_pagealloc_enabled_static(void)
{
	if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
		return false;

	return static_branch_unlikely(&_debug_pagealloc_enabled);
}

/*
 * To support DEBUG_PAGEALLOC architecture must ensure that
 * __kernel_map_pages() never fails
 */
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
#ifdef CONFIG_DEBUG_PAGEALLOC
/* Calls __kernel_map_pages() with enable=1, only when debug_pagealloc is enabled. */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
	if (debug_pagealloc_enabled_static())
		__kernel_map_pages(page, numpages, 1);
}

/* Calls __kernel_map_pages() with enable=0, only when debug_pagealloc is enabled. */
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
	if (debug_pagealloc_enabled_static())
		__kernel_map_pages(page, numpages, 0);
}

extern unsigned int _debug_guardpage_minorder;
DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

static inline unsigned int debug_guardpage_minorder(void)
{
	return _debug_guardpage_minorder;
}

static inline bool debug_guardpage_enabled(void)
{
	return static_branch_unlikely(&_debug_guardpage_enabled);
}

/* PageGuard(page), but always false while guard pages are disabled. */
static inline bool page_is_guard(struct page *page)
{
	if (!debug_guardpage_enabled())
		return false;

	return PageGuard(page);
}

bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
/* Forwards to __set_page_guard(); returns false without calling it when guard pages are disabled. */
static inline bool set_page_guard(struct zone *zone, struct page *page,
				  unsigned int order)
{
	if (!debug_guardpage_enabled())
		return false;
	return __set_page_guard(zone, page, order);
}

void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
/* Forwards to __clear_page_guard(); does nothing when guard pages are disabled. */
static inline void clear_page_guard(struct zone *zone, struct page *page,
				    unsigned int order)
{
	if (!debug_guardpage_enabled())
		return;
	__clear_page_guard(zone, page, order);
}

#else	/* CONFIG_DEBUG_PAGEALLOC */
/* Stubs: no mapping changes, minimum order 0, guard pages reported as disabled. */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
/* Remaining guard-page stubs of the !CONFIG_DEBUG_PAGEALLOC branch. */
static inline bool page_is_guard(struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order) {}
#endif	/* CONFIG_DEBUG_PAGEALLOC */

#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
#else
/* No architecture gate area: there is no gate VMA and no address is inside one. */
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
	return NULL;
}
static inline int in_gate_area_no_mm(unsigned long addr) { return 0; }
static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
	return 0;
}
#endif	/* __HAVE_ARCH_GATE_AREA */

extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);

void drop_slab(void);

#ifndef CONFIG_MMU
/* !CONFIG_MMU: randomize_va_space is the compile-time constant 0. */
#define randomize_va_space 0
#else
extern int randomize_va_space;
#endif

const char * arch_vma_name(struct vm_area_struct *vma);
#ifdef CONFIG_MMU
void print_vma_addr(char *prefix, unsigned long rip);
#else
static inline void print_vma_addr(char *prefix, unsigned long rip)
{
}
#endif

void *sparse_buffer_alloc(unsigned long size);
unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
			    struct vmem_altmap *altmap, unsigned long ptpfn,
			    unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next); int vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, unsigned long headsize); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ if (altmap) return altmap->reserve + altmap->free; return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { altmap->alloc -= nr_pfns; } #else static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { } #endif #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { unsigned long nr_pages; unsigned long nr_vmemmap_pages; if (!pgmap || !is_power_of_2(sizeof(struct page))) return false; nr_pages = pgmap_vmemmap_nr(pgmap); nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); /* * 
For vmemmap optimization with DAX we need minimum 2 vmemmap * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst */ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } /* * If we don't have an architecture override, use the generic rule */ #ifndef vmemmap_can_optimize #define vmemmap_can_optimize __vmemmap_can_optimize #endif #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { return false; } #endif enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE /* * Sysfs entries for memory failure handling statistics. 
*/ extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { } static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void num_poisoned_pages_inc(unsigned long pfn) { } static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { } #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); #else static inline void memblk_nr_poison_inc(unsigned long pfn) { } static inline void memblk_nr_poison_sub(unsigned long pfn, long i) { } #endif #ifndef arch_memory_failure static inline int arch_memory_failure(unsigned long pfn, int flags) { return -ENXIO; } #endif #ifndef arch_is_platform_page static inline bool arch_is_platform_page(u64 paddr) { return false; } #endif /* * Error handlers for various types of pages. 
*/
/* Result codes for memory-failure handling; see the per-value notes. */
enum mf_result {
	MF_IGNORED,	/* Error: cannot be handled */
	MF_FAILED,	/* Error: handling failed */
	MF_DELAYED,	/* Will be handled later */
	MF_RECOVERED,	/* Successfully recovered */
};

/*
 * MF_MSG_* identifiers, one per kind of page or situation.
 * Values are implicit, so the order of the enumerators is significant.
 * NOTE(review): presumably selects the message reported for a handled
 * page - confirm against the users of this enum.
 */
enum mf_action_page_type {
	MF_MSG_KERNEL,
	MF_MSG_KERNEL_HIGH_ORDER,
	MF_MSG_DIFFERENT_COMPOUND,
	MF_MSG_HUGE,
	MF_MSG_FREE_HUGE,
	MF_MSG_GET_HWPOISON,
	MF_MSG_UNMAP_FAILED,
	MF_MSG_DIRTY_SWAPCACHE,
	MF_MSG_CLEAN_SWAPCACHE,
	MF_MSG_DIRTY_MLOCKED_LRU,
	MF_MSG_CLEAN_MLOCKED_LRU,
	MF_MSG_DIRTY_UNEVICTABLE_LRU,
	MF_MSG_CLEAN_UNEVICTABLE_LRU,
	MF_MSG_DIRTY_LRU,
	MF_MSG_CLEAN_LRU,
	MF_MSG_TRUNCATED_LRU,
	MF_MSG_BUDDY,
	MF_MSG_DAX,
	MF_MSG_UNSPLIT_THP,
	MF_MSG_ALREADY_POISONED,
	MF_MSG_UNKNOWN,
};

/* Large-folio helpers, declared only when THP or hugetlbfs is configured. */
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
void folio_zero_user(struct folio *folio, unsigned long addr_hint);
int copy_user_large_folio(struct folio *dst, struct folio *src,
			  unsigned long addr_hint,
			  struct vm_area_struct *vma);
long copy_folio_from_user(struct folio *dst_folio,
			   const void __user *usr_src,
			   bool allow_pagefault);

/**
 * vma_is_special_huge - Are transhuge page-table entries considered special?
 * @vma: Pointer to the struct vm_area_struct to consider
 *
 * Whether transhuge page-table entries are considered "special" following
 * the definition in vm_normal_page().
 *
 * Return: true if transhuge page-table entries should be considered special,
 * false otherwise.
*/ static inline bool vma_is_special_huge(const struct vm_area_struct *vma) { return vma_is_dax(vma) || (vma->vm_file && (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else static inline void setup_nr_node_ids(void) {} #endif extern int memcmp_pages(struct page *page1, struct page *page2); static inline int pages_identical(struct page *page1, struct page *page2) { return !memcmp_pages(page1, page2); } #ifdef CONFIG_MAPPING_DIRTY_HELPERS unsigned long clean_record_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, pgoff_t bitmap_pgoff, unsigned long *bitmap, pgoff_t *start, pgoff_t *end); unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name); #else static inline int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { return 0; } #endif #ifdef CONFIG_UNACCEPTED_MEMORY bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size) { return false; } static inline void accept_memory(phys_addr_t start, unsigned long size) { } #endif static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); int reserve_mem_release_by_name(const char *name); #ifdef CONFIG_64BIT int do_mseal(unsigned long start, size_t 
len_in, unsigned long flags); #else static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) { /* noop on 32 bit */ return 0; } #endif /* * user_alloc_needs_zeroing checks if a user folio from page allocator needs to * be zeroed or not. */ static inline bool user_alloc_needs_zeroing(void) { /* * for user folios, arch with cache aliasing requires cache flush and * arc changes folio->flags to make icache coherent with dcache, so * always return false to make caller use * clear_user_page()/clear_user_highpage(). */ return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc); } int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); /* * mseal of userspace process's system mappings. */ #ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS #define VM_SEALED_SYSMAP VM_SEALED #else #define VM_SEALED_SYSMAP VM_NONE #endif /* * DMA mapping IDs for page_pool * * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and * stashes it in the upper bits of page->pp_magic. We always want to be able to * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP * pages can have arbitrary kernel pointers stored in the same field as pp_magic * (since it overlaps with page->lru.next), so we must ensure that we cannot * mistake a valid kernel pointer with any of the values we write into this * field. * * On architectures that set POISON_POINTER_DELTA, this is already ensured, * since this value becomes part of PP_SIGNATURE; meaning we can just use the * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the * lowest bits of POISON_POINTER_DELTA. 
On arches where POISON_POINTER_DELTA is * 0, we make sure that we leave the two topmost bits empty, as that guarantees * we won't mistake a valid kernel pointer for a value we set, regardless of the * VMSPLIT setting. * * Altogether, this means that the number of bits available is constrained by * the size of an unsigned long (at the upper end, subtracting two bits per the * above), and the definition of PP_SIGNATURE (with or without * POISON_POINTER_DELTA). */ #define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) #if POISON_POINTER_DELTA > 0 /* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA * index to not overlap with that if set */ #define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) #else /* Always leave out the topmost two; see above. */ #define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) #endif #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ PP_DMA_INDEX_SHIFT) /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for * the head page of compound page and bit 1 for pfmemalloc page, as well as the * bits used for the DMA index. page_is_pfmemalloc() is checked in * __page_pool_put_page() to avoid recycling the pfmemalloc page. */ #define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) #ifdef CONFIG_PAGE_POOL static inline bool page_pool_page_is_pp(struct page *page) { return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; } #else static inline bool page_pool_page_is_pp(struct page *page) { return false; } #endif #endif /* _LINUX_MM_H */
4 61 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 /* SPDX-License-Identifier: GPL-2.0-only */ /* * V9FS FID Management * * Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com> */ #ifndef FS_9P_FID_H #define FS_9P_FID_H #include <linux/list.h> #include "v9fs.h" struct p9_fid *v9fs_fid_find_inode(struct inode *inode, bool want_writeable, kuid_t uid, bool any); struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry) { return v9fs_fid_lookup(dentry->d_parent); } void v9fs_fid_add(struct dentry *dentry, struct p9_fid **fid); void v9fs_open_fid_add(struct inode *inode, struct p9_fid **fid); static inline struct p9_fid *clone_fid(struct p9_fid *fid) { return IS_ERR(fid) ? fid : p9_client_walk(fid, 0, NULL, 1); } static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry) { struct p9_fid *fid, *nfid; fid = v9fs_fid_lookup(dentry); if (!fid || IS_ERR(fid)) return fid; nfid = clone_fid(fid); p9_fid_put(fid); return nfid; } /** * v9fs_fid_addmodes - add cache flags to fid mode (for client use only) * @fid: fid to augment * @s_flags: session info mount flags * @s_cache: session info cache flags * @f_flags: unix open flags * * make sure mode reflects flags of underlying mounts * also qid.version == 0 reflects a synthetic or legacy file system * NOTE: these are set after open so only reflect 9p client not * underlying file system on server. 
*/ static inline void v9fs_fid_add_modes(struct p9_fid *fid, unsigned int s_flags, unsigned int s_cache, unsigned int f_flags) { if ((!s_cache) || ((fid->qid.version == 0) && !(s_flags & V9FS_IGNORE_QV)) || (s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) { fid->mode |= P9L_DIRECT; /* no read or write cache */ } else if ((!(s_cache & CACHE_WRITEBACK)) || (f_flags & O_DSYNC) || (s_flags & V9FS_SYNC)) { fid->mode |= P9L_NOWRITECACHE; } } #endif
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 // SPDX-License-Identifier: GPL-2.0-or-later /* * DSA tagging protocol handling * * Copyright (c) 2008-2009 Marvell Semiconductor * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org> * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch> */ #include <linux/netdevice.h> #include <linux/ptp_classify.h> #include <linux/skbuff.h> #include <net/dsa.h> #include <net/dst_metadata.h> #include "tag.h" #include "user.h" static LIST_HEAD(dsa_tag_drivers_list); static DEFINE_MUTEX(dsa_tag_drivers_lock); /* Determine if we should defer delivery of skb until we have a rx timestamp. * * Called from dsa_switch_rcv. For now, this will only work if tagging is * enabled on the switch. Normally the MAC driver would retrieve the hardware * timestamp when it reads the packet out of the hardware. However in a DSA * switch, the DSA driver owning the interface to which the packet is * delivered is never notified unless we do so here. 
*/ static bool dsa_skb_defer_rx_timestamp(struct dsa_user_priv *p, struct sk_buff *skb) { struct dsa_switch *ds = p->dp->ds; unsigned int type; if (!ds->ops->port_rxtstamp) return false; if (skb_headroom(skb) < ETH_HLEN) return false; __skb_push(skb, ETH_HLEN); type = ptp_classify_raw(skb); __skb_pull(skb, ETH_HLEN); if (type == PTP_CLASS_NONE) return false; return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); } static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; struct dsa_user_priv *p; if (unlikely(!cpu_dp)) { kfree_skb(skb); return 0; } skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) return 0; if (md_dst && md_dst->type == METADATA_HW_PORT_MUX) { unsigned int port = md_dst->u.port_info.port_id; skb_dst_drop(skb); if (!skb_has_extensions(skb)) skb->slow_gro = 0; skb->dev = dsa_conduit_find_user(dev, 0, port); if (likely(skb->dev)) { dsa_default_offload_fwd_mark(skb); nskb = skb; } } else { nskb = cpu_dp->rcv(skb, dev); } if (!nskb) { kfree_skb(skb); return 0; } skb = nskb; skb_push(skb, ETH_HLEN); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); if (unlikely(!dsa_user_dev_check(skb->dev))) { /* Packet is to be injected directly on an upper * device, e.g. a team/bond, so skip all DSA-port * specific actions. 
*/ netif_rx(skb); return 0; } p = netdev_priv(skb->dev); if (unlikely(cpu_dp->ds->untag_bridge_pvid || cpu_dp->ds->untag_vlan_aware_bridge_pvid)) { nskb = dsa_software_vlan_untag(skb); if (!nskb) { kfree_skb(skb); return 0; } skb = nskb; } dev_sw_netstats_rx_add(skb->dev, skb->len + ETH_HLEN); if (dsa_skb_defer_rx_timestamp(p, skb)) return 0; gro_cells_receive(&p->gcells, skb); return 0; } struct packet_type dsa_pack_type __read_mostly = { .type = cpu_to_be16(ETH_P_XDSA), .func = dsa_switch_rcv, }; static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver, struct module *owner) { dsa_tag_driver->owner = owner; mutex_lock(&dsa_tag_drivers_lock); list_add_tail(&dsa_tag_driver->list, &dsa_tag_drivers_list); mutex_unlock(&dsa_tag_drivers_lock); } void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count, struct module *owner) { unsigned int i; for (i = 0; i < count; i++) dsa_tag_driver_register(dsa_tag_driver_array[i], owner); } static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver) { mutex_lock(&dsa_tag_drivers_lock); list_del(&dsa_tag_driver->list); mutex_unlock(&dsa_tag_drivers_lock); } EXPORT_SYMBOL_GPL(dsa_tag_drivers_register); void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count) { unsigned int i; for (i = 0; i < count; i++) dsa_tag_driver_unregister(dsa_tag_driver_array[i]); } EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) { return ops->name; }; /* Function takes a reference on the module owning the tagger, * so dsa_tag_driver_put must be called afterwards. 
*/ const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name) { const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT); struct dsa_tag_driver *dsa_tag_driver; request_module("%s%s", DSA_TAG_DRIVER_ALIAS, name); mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { const struct dsa_device_ops *tmp = dsa_tag_driver->ops; if (strcmp(name, tmp->name)) continue; if (!try_module_get(dsa_tag_driver->owner)) break; ops = tmp; break; } mutex_unlock(&dsa_tag_drivers_lock); return ops; } const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol) { struct dsa_tag_driver *dsa_tag_driver; const struct dsa_device_ops *ops; bool found = false; request_module("%sid-%d", DSA_TAG_DRIVER_ALIAS, tag_protocol); mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { ops = dsa_tag_driver->ops; if (ops->proto == tag_protocol) { found = true; break; } } if (found) { if (!try_module_get(dsa_tag_driver->owner)) ops = ERR_PTR(-ENOPROTOOPT); } else { ops = ERR_PTR(-ENOPROTOOPT); } mutex_unlock(&dsa_tag_drivers_lock); return ops; } void dsa_tag_driver_put(const struct dsa_device_ops *ops) { struct dsa_tag_driver *dsa_tag_driver; mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { if (dsa_tag_driver->ops == ops) { module_put(dsa_tag_driver->owner); break; } } mutex_unlock(&dsa_tag_drivers_lock); }
2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* */ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include "rds.h" #include "ib.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", "ib_evt_handler_call", "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", "ib_tx_throttle", "ib_tx_sg_mapping_failure", "ib_tx_stalled", "ib_tx_credit_updates", "ib_rx_cq_event", "ib_rx_ring_empty", "ib_rx_refill_from_cq", "ib_rx_refill_from_thread", "ib_rx_alloc_limit", "ib_rx_total_frags", "ib_rx_total_incs", "ib_rx_credit_updates", "ib_ack_sent", "ib_ack_send_failure", "ib_ack_send_delayed", "ib_ack_send_piggybacked", "ib_ack_received", "ib_rdma_mr_8k_alloc", "ib_rdma_mr_8k_free", "ib_rdma_mr_8k_used", "ib_rdma_mr_8k_pool_flush", "ib_rdma_mr_8k_pool_wait", "ib_rdma_mr_8k_pool_depleted", "ib_rdma_mr_1m_alloc", "ib_rdma_mr_1m_free", "ib_rdma_mr_1m_used", "ib_rdma_mr_1m_pool_flush", "ib_rdma_mr_1m_pool_wait", "ib_rdma_mr_1m_pool_depleted", "ib_rdma_mr_8k_reused", "ib_rdma_mr_1m_reused", "ib_atomic_cswp", "ib_atomic_fadd", }; unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail) { struct rds_ib_statistics stats = {0, }; uint64_t *src; uint64_t *sum; size_t i; int cpu; if (avail < ARRAY_SIZE(rds_ib_stat_names)) goto out; for_each_online_cpu(cpu) { src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu)); sum = (uint64_t *)&stats; for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) *(sum++) += *(src++); } rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names, ARRAY_SIZE(rds_ib_stat_names)); out: return ARRAY_SIZE(rds_ib_stat_names); }
33 1 1 1 1 1 25 3 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 // SPDX-License-Identifier: GPL-2.0-only /* * iptables module to match inet_addr_type() of an ip. * * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/ip.h> #include <net/route.h> #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip6_fib.h> #endif #include <linux/netfilter_ipv6.h> #include <linux/netfilter/xt_addrtype.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: address type match"); MODULE_ALIAS("ipt_addrtype"); MODULE_ALIAS("ip6t_addrtype"); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, const struct in6_addr *addr, u16 mask) { struct flowi6 flow; struct rt6_info *rt; u32 ret = 0; int route_err; memset(&flow, 0, sizeof(flow)); flow.daddr = *addr; if (dev) 
flow.flowi6_oif = dev->ifindex; if (dev && (mask & XT_ADDRTYPE_LOCAL)) { if (nf_ipv6_chk_addr(net, addr, dev, true)) ret = XT_ADDRTYPE_LOCAL; } route_err = nf_ip6_route(net, (struct dst_entry **)&rt, flowi6_to_flowi(&flow), false); if (route_err) return XT_ADDRTYPE_UNREACHABLE; if (rt->rt6i_flags & RTF_REJECT) ret = XT_ADDRTYPE_UNREACHABLE; if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; if (ipv6_anycast_destination((struct dst_entry *)rt, addr)) ret |= XT_ADDRTYPE_ANYCAST; dst_release(&rt->dst); return ret; } static bool match_type6(struct net *net, const struct net_device *dev, const struct in6_addr *addr, u16 mask) { int addr_type = ipv6_addr_type(addr); if ((mask & XT_ADDRTYPE_MULTICAST) && !(addr_type & IPV6_ADDR_MULTICAST)) return false; if ((mask & XT_ADDRTYPE_UNICAST) && !(addr_type & IPV6_ADDR_UNICAST)) return false; if ((mask & XT_ADDRTYPE_UNSPEC) && addr_type != IPV6_ADDR_ANY) return false; if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | XT_ADDRTYPE_UNREACHABLE) & mask) return !!(mask & match_lookup_rt6(net, dev, addr, mask)); return true; } static bool addrtype_mt6(struct net *net, const struct net_device *dev, const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool ret = true; if (info->source) ret &= match_type6(net, dev, &iph->saddr, info->source) ^ (info->flags & XT_ADDRTYPE_INVERT_SOURCE); if (ret && info->dest) ret &= match_type6(net, dev, &iph->daddr, info->dest) ^ !!(info->flags & XT_ADDRTYPE_INVERT_DEST); return ret; } #endif static inline bool match_type(struct net *net, const struct net_device *dev, __be32 addr, u_int16_t mask) { return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); } static bool addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_addrtype_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; if (info->source) ret &= match_type(net, NULL, 
iph->saddr, info->source) ^ info->invert_source; if (info->dest) ret &= match_type(net, NULL, iph->daddr, info->dest) ^ info->invert_dest; return ret; } static bool addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_addrtype_info_v1 *info = par->matchinfo; const struct iphdr *iph; const struct net_device *dev = NULL; bool ret = true; if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) dev = xt_in(par); else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) dev = xt_out(par); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) if (xt_family(par) == NFPROTO_IPV6) return addrtype_mt6(net, dev, skb, info); #endif iph = ip_hdr(skb); if (info->source) ret &= match_type(net, dev, iph->saddr, info->source) ^ (info->flags & XT_ADDRTYPE_INVERT_SOURCE); if (ret && info->dest) ret &= match_type(net, dev, iph->daddr, info->dest) ^ !!(info->flags & XT_ADDRTYPE_INVERT_DEST); return ret; } static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) { const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; struct xt_addrtype_info_v1 *info = par->matchinfo; if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) goto err; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { errmsg = "output interface limitation not valid in PREROUTING and INPUT"; goto err; } if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) { errmsg = "input interface limitation not valid in POSTROUTING and OUTPUT"; goto err; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) if (par->family == NFPROTO_IPV6) { if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { errmsg = "ipv6 BLACKHOLE matching not supported"; goto err; } if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { errmsg = "ipv6 PROHIBIT (THROW, NAT ..) 
matching not supported"; goto err; } if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { errmsg = "ipv6 does not support BROADCAST matching"; goto err; } } #endif return 0; err: pr_info_ratelimited("%s\n", errmsg); return -EINVAL; } static struct xt_match addrtype_mt_reg[] __read_mostly = { { .name = "addrtype", .family = NFPROTO_IPV4, .match = addrtype_mt_v0, .matchsize = sizeof(struct xt_addrtype_info), .me = THIS_MODULE }, { .name = "addrtype", .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "addrtype", .family = NFPROTO_IPV6, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE }, #endif }; static int __init addrtype_mt_init(void) { return xt_register_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); } static void __exit addrtype_mt_exit(void) { xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); } module_init(addrtype_mt_init); module_exit(addrtype_mt_exit);
10 10 10 10 10 6 2 2 4 1 2 1 7 1 1 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2003-2006 Helsinki University of Technology * Copyright (C)2003-2006 USAGI/WIDE Project */ /* * Authors: * Noriaki TAKAMIYA @USAGI * Masahide NAKAMURA @USAGI */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/time.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <net/sock.h> #include 
<net/ipv6.h> #include <net/ip6_checksum.h> #include <net/rawv6.h> #include <net/xfrm.h> #include <net/mip6.h> static inline unsigned int calc_padlen(unsigned int len, unsigned int n) { return (n - len + 16) & 0x7; } static inline void *mip6_padn(__u8 *data, __u8 padlen) { if (!data) return NULL; if (padlen == 1) { data[0] = IPV6_TLV_PAD1; } else if (padlen > 1) { data[0] = IPV6_TLV_PADN; data[1] = padlen - 2; if (padlen > 2) memset(data+2, 0, data[1]); } return data + padlen; } static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos) { icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos); } static int mip6_mh_len(int type) { int len = 0; switch (type) { case IP6_MH_TYPE_BRR: len = 0; break; case IP6_MH_TYPE_HOTI: case IP6_MH_TYPE_COTI: case IP6_MH_TYPE_BU: case IP6_MH_TYPE_BACK: len = 1; break; case IP6_MH_TYPE_HOT: case IP6_MH_TYPE_COT: case IP6_MH_TYPE_BERROR: len = 2; break; } return len; } static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) { struct ip6_mh _hdr; const struct ip6_mh *mh; mh = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_hdr), &_hdr); if (!mh) return -1; if (((mh->ip6mh_hdrlen + 1) << 3) > skb->len) return -1; if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { net_dbg_ratelimited("mip6: MH message too short: %d vs >=%d\n", mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) + skb_network_header_len(skb)); return -1; } if (mh->ip6mh_proto != IPPROTO_NONE) { net_dbg_ratelimited("mip6: MH invalid payload proto = %d\n", mh->ip6mh_proto); mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) + skb_network_header_len(skb)); return -1; } return 0; } struct mip6_report_rate_limiter { spinlock_t lock; ktime_t stamp; int iif; struct in6_addr src; struct in6_addr dst; }; static struct mip6_report_rate_limiter mip6_report_rl = { .lock = __SPIN_LOCK_UNLOCKED(mip6_report_rl.lock) }; static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb) { 
const struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data; int err = destopt->nexthdr; spin_lock(&x->lock); if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) err = -ENOENT; spin_unlock(&x->lock); return err; } /* Destination Option Header is inserted. * IP Header's src address is replaced with Home Address Option in * Destination Option Header. */ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *iph; struct ipv6_destopt_hdr *dstopt; struct ipv6_destopt_hao *hao; u8 nexthdr; int len; skb_push(skb, -skb_network_offset(skb)); iph = ipv6_hdr(skb); nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_DSTOPTS; dstopt = (struct ipv6_destopt_hdr *)skb_transport_header(skb); dstopt->nexthdr = nexthdr; hao = mip6_padn((char *)(dstopt + 1), calc_padlen(sizeof(*dstopt), 6)); hao->type = IPV6_TLV_HAO; BUILD_BUG_ON(sizeof(*hao) != 18); hao->length = sizeof(*hao) - 2; len = ((char *)hao - (char *)dstopt) + sizeof(*hao); memcpy(&hao->addr, &iph->saddr, sizeof(hao->addr)); spin_lock_bh(&x->lock); memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr)); spin_unlock_bh(&x->lock); WARN_ON(len != x->props.header_len); dstopt->hdrlen = (x->props.header_len >> 3) - 1; return 0; } static inline int mip6_report_rl_allow(ktime_t stamp, const struct in6_addr *dst, const struct in6_addr *src, int iif) { int allow = 0; spin_lock_bh(&mip6_report_rl.lock); if (mip6_report_rl.stamp != stamp || mip6_report_rl.iif != iif || !ipv6_addr_equal(&mip6_report_rl.src, src) || !ipv6_addr_equal(&mip6_report_rl.dst, dst)) { mip6_report_rl.stamp = stamp; mip6_report_rl.iif = iif; mip6_report_rl.src = *src; mip6_report_rl.dst = *dst; allow = 1; } spin_unlock_bh(&mip6_report_rl.lock); return allow; } static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, const struct flowi *fl) { struct net *net = xs_net(x); struct 
inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; const struct flowi6 *fl6 = &fl->u.ip6; struct ipv6_destopt_hao *hao = NULL; struct xfrm_selector sel; int offset; ktime_t stamp; int err = 0; if (unlikely(fl6->flowi6_proto == IPPROTO_MH && fl6->fl6_mh_type <= IP6_MH_TYPE_MAX)) goto out; if (likely(opt->dsthao)) { offset = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); if (likely(offset >= 0)) hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + offset); } stamp = skb_get_ktime(skb); if (!mip6_report_rl_allow(stamp, &ipv6_hdr(skb)->daddr, hao ? &hao->addr : &ipv6_hdr(skb)->saddr, opt->iif)) goto out; memset(&sel, 0, sizeof(sel)); memcpy(&sel.daddr, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, sizeof(sel.daddr)); sel.prefixlen_d = 128; memcpy(&sel.saddr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, sizeof(sel.saddr)); sel.prefixlen_s = 128; sel.family = AF_INET6; sel.proto = fl6->flowi6_proto; sel.dport = xfrm_flowi_dport(fl, &fl6->uli); if (sel.dport) sel.dport_mask = htons(~0); sel.sport = xfrm_flowi_sport(fl, &fl6->uli); if (sel.sport) sel.sport_mask = htons(~0); sel.ifindex = fl6->flowi6_oif; err = km_report(net, IPPROTO_DSTOPTS, &sel, (hao ? (xfrm_address_t *)&hao->addr : NULL)); out: return err; } static int mip6_destopt_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->id.spi) { NL_SET_ERR_MSG(extack, "SPI must be 0"); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION"); return -EINVAL; } x->props.header_len = sizeof(struct ipv6_destopt_hdr) + calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) + sizeof(struct ipv6_destopt_hao); WARN_ON(x->props.header_len != 24); return 0; } /* * Do nothing about destroying since it has no specific operation for * destination options header unlike IPsec protocols. 
*/ static void mip6_destopt_destroy(struct xfrm_state *x) { } static const struct xfrm_type mip6_destopt_type = { .owner = THIS_MODULE, .proto = IPPROTO_DSTOPTS, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR, .init_state = mip6_destopt_init_state, .destructor = mip6_destopt_destroy, .input = mip6_destopt_input, .output = mip6_destopt_output, .reject = mip6_destopt_reject, }; static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data; int err = rt2->rt_hdr.nexthdr; spin_lock(&x->lock); if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) err = -ENOENT; spin_unlock(&x->lock); return err; } /* Routing Header type 2 is inserted. * IP Header's dst address is replaced with Routing Header's Home Address. */ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *iph; struct rt2_hdr *rt2; u8 nexthdr; skb_push(skb, -skb_network_offset(skb)); iph = ipv6_hdr(skb); nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_ROUTING; rt2 = (struct rt2_hdr *)skb_transport_header(skb); rt2->rt_hdr.nexthdr = nexthdr; rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1; rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2; rt2->rt_hdr.segments_left = 1; memset(&rt2->reserved, 0, sizeof(rt2->reserved)); WARN_ON(rt2->rt_hdr.hdrlen != 2); memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr)); spin_lock_bh(&x->lock); memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr)); spin_unlock_bh(&x->lock); return 0; } static int mip6_rthdr_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->id.spi) { NL_SET_ERR_MSG(extack, "SPI must be 0"); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION"); return -EINVAL; } x->props.header_len = sizeof(struct rt2_hdr); return 0; } /* * Do nothing about 
destroying since it has no specific operation for routing * header type 2 unlike IPsec protocols. */ static void mip6_rthdr_destroy(struct xfrm_state *x) { } static const struct xfrm_type mip6_rthdr_type = { .owner = THIS_MODULE, .proto = IPPROTO_ROUTING, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR, .init_state = mip6_rthdr_init_state, .destructor = mip6_rthdr_destroy, .input = mip6_rthdr_input, .output = mip6_rthdr_output, }; static int __init mip6_init(void) { pr_info("Mobile IPv6\n"); if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) { pr_info("%s: can't add xfrm type(destopt)\n", __func__); goto mip6_destopt_xfrm_fail; } if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) { pr_info("%s: can't add xfrm type(rthdr)\n", __func__); goto mip6_rthdr_xfrm_fail; } if (rawv6_mh_filter_register(mip6_mh_filter) < 0) { pr_info("%s: can't add rawv6 mh filter\n", __func__); goto mip6_rawv6_mh_fail; } return 0; mip6_rawv6_mh_fail: xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); mip6_rthdr_xfrm_fail: xfrm_unregister_type(&mip6_destopt_type, AF_INET6); mip6_destopt_xfrm_fail: return -EAGAIN; } static void __exit mip6_fini(void) { if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) pr_info("%s: can't remove rawv6 mh filter\n", __func__); xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); xfrm_unregister_type(&mip6_destopt_type, AF_INET6); } module_init(mip6_init); module_exit(mip6_fini); MODULE_DESCRIPTION("IPv6 Mobility driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING);
140 140 4 2 1 1 5 1 1 1 2 3 2 1 499 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 // SPDX-License-Identifier: GPL-2.0 /* * Management Component Transport Protocol (MCTP) - routing * implementation. * * This is currently based on a simple routing table, with no dst cache. The * number of routes should stay fairly small, so the lookup cost is small. 
* * Copyright (c) 2021 Code Construct * Copyright (c) 2021 Google */ #include <linux/idr.h> #include <linux/mctp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/mctp.h> #include <net/mctpdevice.h> #include <net/netlink.h> #include <net/sock.h> static int mctp_neigh_add(struct mctp_dev *mdev, mctp_eid_t eid, enum mctp_neigh_source source, size_t lladdr_len, const void *lladdr) { struct net *net = dev_net(mdev->dev); struct mctp_neigh *neigh; int rc; mutex_lock(&net->mctp.neigh_lock); if (mctp_neigh_lookup(mdev, eid, NULL) == 0) { rc = -EEXIST; goto out; } if (lladdr_len > sizeof(neigh->ha)) { rc = -EINVAL; goto out; } neigh = kzalloc(sizeof(*neigh), GFP_KERNEL); if (!neigh) { rc = -ENOMEM; goto out; } INIT_LIST_HEAD(&neigh->list); neigh->dev = mdev; mctp_dev_hold(neigh->dev); neigh->eid = eid; neigh->source = source; memcpy(neigh->ha, lladdr, lladdr_len); list_add_rcu(&neigh->list, &net->mctp.neighbours); rc = 0; out: mutex_unlock(&net->mctp.neigh_lock); return rc; } static void __mctp_neigh_free(struct rcu_head *rcu) { struct mctp_neigh *neigh = container_of(rcu, struct mctp_neigh, rcu); mctp_dev_put(neigh->dev); kfree(neigh); } /* Removes all neighbour entries referring to a device */ void mctp_neigh_remove_dev(struct mctp_dev *mdev) { struct net *net = dev_net(mdev->dev); struct mctp_neigh *neigh, *tmp; mutex_lock(&net->mctp.neigh_lock); list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) { if (neigh->dev == mdev) { list_del_rcu(&neigh->list); /* TODO: immediate RTM_DELNEIGH */ call_rcu(&neigh->rcu, __mctp_neigh_free); } } mutex_unlock(&net->mctp.neigh_lock); } static int mctp_neigh_remove(struct mctp_dev *mdev, mctp_eid_t eid, enum mctp_neigh_source source) { struct net *net = dev_net(mdev->dev); struct mctp_neigh *neigh, *tmp; bool dropped = false; mutex_lock(&net->mctp.neigh_lock); list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) { if (neigh->dev == mdev && neigh->eid == eid && 
neigh->source == source) { list_del_rcu(&neigh->list); /* TODO: immediate RTM_DELNEIGH */ call_rcu(&neigh->rcu, __mctp_neigh_free); dropped = true; } } mutex_unlock(&net->mctp.neigh_lock); return dropped ? 0 : -ENOENT; } static const struct nla_policy nd_mctp_policy[NDA_MAX + 1] = { [NDA_DST] = { .type = NLA_U8 }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, }; static int mctp_rtm_newneigh(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *dev; struct mctp_dev *mdev; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX + 1]; int rc; mctp_eid_t eid; void *lladdr; int lladdr_len; rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, extack); if (rc < 0) { NL_SET_ERR_MSG(extack, "lladdr too large?"); return rc; } if (!tb[NDA_DST]) { NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); return -EINVAL; } if (!tb[NDA_LLADDR]) { NL_SET_ERR_MSG(extack, "Neighbour lladdr must be specified"); return -EINVAL; } eid = nla_get_u8(tb[NDA_DST]); if (!mctp_address_unicast(eid)) { NL_SET_ERR_MSG(extack, "Invalid neighbour EID"); return -EINVAL; } lladdr = nla_data(tb[NDA_LLADDR]); lladdr_len = nla_len(tb[NDA_LLADDR]); ndm = nlmsg_data(nlh); dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) return -ENODEV; mdev = mctp_dev_get_rtnl(dev); if (!mdev) return -ENODEV; if (lladdr_len != dev->addr_len) { NL_SET_ERR_MSG(extack, "Wrong lladdr length"); return -EINVAL; } return mctp_neigh_add(mdev, eid, MCTP_NEIGH_STATIC, lladdr_len, lladdr); } static int mctp_rtm_delneigh(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NDA_MAX + 1]; struct net_device *dev; struct mctp_dev *mdev; struct ndmsg *ndm; int rc; mctp_eid_t eid; rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, extack); if (rc < 0) { NL_SET_ERR_MSG(extack, "incorrect format"); return rc; } if (!tb[NDA_DST]) { 
NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); return -EINVAL; } eid = nla_get_u8(tb[NDA_DST]); ndm = nlmsg_data(nlh); dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) return -ENODEV; mdev = mctp_dev_get_rtnl(dev); if (!mdev) return -ENODEV; return mctp_neigh_remove(mdev, eid, MCTP_NEIGH_STATIC); } static int mctp_fill_neigh(struct sk_buff *skb, u32 portid, u32 seq, int event, unsigned int flags, struct mctp_neigh *neigh) { struct net_device *dev = neigh->dev->dev; struct nlmsghdr *nlh; struct ndmsg *hdr; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); if (!nlh) return -EMSGSIZE; hdr = nlmsg_data(nlh); hdr->ndm_family = AF_MCTP; hdr->ndm_ifindex = dev->ifindex; hdr->ndm_state = 0; // TODO other state bits? if (neigh->source == MCTP_NEIGH_STATIC) hdr->ndm_state |= NUD_PERMANENT; hdr->ndm_flags = 0; hdr->ndm_type = RTN_UNICAST; // TODO: is loopback RTN_LOCAL? if (nla_put_u8(skb, NDA_DST, neigh->eid)) goto cancel; if (nla_put(skb, NDA_LLADDR, dev->addr_len, neigh->ha)) goto cancel; nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int mctp_rtm_getneigh(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int rc, idx, req_ifindex; struct mctp_neigh *neigh; struct ndmsg *ndmsg; struct { int idx; } *cbctx = (void *)cb->ctx; ndmsg = nlmsg_payload(cb->nlh, sizeof(*ndmsg)); if (!ndmsg) return -EINVAL; req_ifindex = ndmsg->ndm_ifindex; idx = 0; rcu_read_lock(); list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { if (idx < cbctx->idx) goto cont; rc = 0; if (req_ifindex == 0 || req_ifindex == neigh->dev->dev->ifindex) rc = mctp_fill_neigh(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, neigh); if (rc) break; cont: idx++; } rcu_read_unlock(); cbctx->idx = idx; return skb->len; } int mctp_neigh_lookup(struct mctp_dev *mdev, mctp_eid_t eid, void *ret_hwaddr) { struct net *net = dev_net(mdev->dev); struct mctp_neigh *neigh; int 
rc = -EHOSTUNREACH; // TODO: or ENOENT? rcu_read_lock(); list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { if (mdev == neigh->dev && eid == neigh->eid) { if (ret_hwaddr) memcpy(ret_hwaddr, neigh->ha, sizeof(neigh->ha)); rc = 0; break; } } rcu_read_unlock(); return rc; } /* namespace registration */ static int __net_init mctp_neigh_net_init(struct net *net) { struct netns_mctp *ns = &net->mctp; INIT_LIST_HEAD(&ns->neighbours); mutex_init(&ns->neigh_lock); return 0; } static void __net_exit mctp_neigh_net_exit(struct net *net) { struct netns_mctp *ns = &net->mctp; struct mctp_neigh *neigh; list_for_each_entry(neigh, &ns->neighbours, list) call_rcu(&neigh->rcu, __mctp_neigh_free); } /* net namespace implementation */ static struct pernet_operations mctp_net_ops = { .init = mctp_neigh_net_init, .exit = mctp_neigh_net_exit, }; static const struct rtnl_msg_handler mctp_neigh_rtnl_msg_handlers[] = { {THIS_MODULE, PF_MCTP, RTM_NEWNEIGH, mctp_rtm_newneigh, NULL, 0}, {THIS_MODULE, PF_MCTP, RTM_DELNEIGH, mctp_rtm_delneigh, NULL, 0}, {THIS_MODULE, PF_MCTP, RTM_GETNEIGH, NULL, mctp_rtm_getneigh, 0}, }; int __init mctp_neigh_init(void) { int err; err = register_pernet_subsys(&mctp_net_ops); if (err) return err; err = rtnl_register_many(mctp_neigh_rtnl_msg_handlers); if (err) unregister_pernet_subsys(&mctp_net_ops); return err; } void mctp_neigh_exit(void) { rtnl_unregister_many(mctp_neigh_rtnl_msg_handlers); unregister_pernet_subsys(&mctp_net_ops); }
8 3 3 15 6 6 13 7 6 6 6 498 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 // SPDX-License-Identifier: GPL-2.0-or-later /* * SR-IPv6 implementation -- HMAC functions * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #include <linux/errno.h> #include <linux/kernel.h> #include 
<linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> #include <linux/slab.h> #include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <crypto/hash.h> #include <net/seg6.h> #include <net/genetlink.h> #include <net/seg6_hmac.h> #include <linux/random.h> struct hmac_storage { local_lock_t bh_lock; char hmac_ring[SEG6_HMAC_RING_SIZE]; }; static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct seg6_hmac_info *hinfo = obj; return (hinfo->hmackeyid != *(__u32 *)arg->key); } static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo) { kfree_rcu(hinfo, rcu); } static void seg6_free_hi(void *ptr, void *arg) { struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr; if (hinfo) seg6_hinfo_release(hinfo); } static const struct rhashtable_params rht_params = { .head_offset = offsetof(struct seg6_hmac_info, node), .key_offset = offsetof(struct seg6_hmac_info, hmackeyid), .key_len = sizeof(u32), .automatic_shrinking = true, .obj_cmpfn = seg6_hmac_cmpfn, }; static struct seg6_hmac_algo hmac_algos[] = { { .alg_id = SEG6_HMAC_ALGO_SHA1, .name = "hmac(sha1)", }, { .alg_id = SEG6_HMAC_ALGO_SHA256, .name = "hmac(sha256)", }, }; static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) { struct sr6_tlv_hmac *tlv; if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5) return NULL; if (!sr_has_hmac(srh)) return NULL; tlv = (struct sr6_tlv_hmac *) ((char *)srh + ((srh->hdrlen + 1) << 3) - 40); if 
(tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38) return NULL; return tlv; } static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id) { struct seg6_hmac_algo *algo; int i, alg_count; alg_count = ARRAY_SIZE(hmac_algos); for (i = 0; i < alg_count; i++) { algo = &hmac_algos[i]; if (algo->alg_id == alg_id) return algo; } return NULL; } static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize, u8 *output, int outlen) { struct seg6_hmac_algo *algo; struct crypto_shash *tfm; struct shash_desc *shash; int ret, dgsize; algo = __hmac_get_algo(hinfo->alg_id); if (!algo) return -ENOENT; tfm = *this_cpu_ptr(algo->tfms); dgsize = crypto_shash_digestsize(tfm); if (dgsize > outlen) { pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n", dgsize, outlen); return -ENOMEM; } ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen); if (ret < 0) { pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret); goto failed; } shash = *this_cpu_ptr(algo->shashs); shash->tfm = tfm; ret = crypto_shash_digest(shash, text, psize, output); if (ret < 0) { pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret); goto failed; } return dgsize; failed: return ret; } int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output) { __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE]; int plen, i, dgsize, wrsize; char *ring, *off; /* a 160-byte buffer for digest output allows to store highest known * hash function (RadioGatun) with up to 1216 bits */ /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; /* this limit allows for 14 segments */ if (plen >= SEG6_HMAC_RING_SIZE) return -EMSGSIZE; /* Let's build the HMAC text on the ring buffer. The text is composed * as follows, in order: * * 1. Source IPv6 address (128 bits) * 2. first_segment value (8 bits) * 3. Flags (8 bits) * 4. 
HMAC Key ID (32 bits) * 5. All segments in the segments list (n * 128 bits) */ local_bh_disable(); local_lock_nested_bh(&hmac_storage.bh_lock); ring = this_cpu_ptr(hmac_storage.hmac_ring); off = ring; /* source address */ memcpy(off, saddr, 16); off += 16; /* first_segment value */ *off++ = hdr->first_segment; /* flags */ *off++ = hdr->flags; /* HMAC Key ID */ memcpy(off, &hmackeyid, 4); off += 4; /* all segments in the list */ for (i = 0; i < hdr->first_segment + 1; i++) { memcpy(off, hdr->segments + i, 16); off += 16; } dgsize = __do_hmac(hinfo, ring, plen, tmp_out, SEG6_HMAC_MAX_DIGESTSIZE); local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); if (dgsize < 0) return dgsize; wrsize = SEG6_HMAC_FIELD_LEN; if (wrsize > dgsize) wrsize = dgsize; memset(output, 0, SEG6_HMAC_FIELD_LEN); memcpy(output, tmp_out, wrsize); return 0; } EXPORT_SYMBOL(seg6_hmac_compute); /* checks if an incoming SR-enabled packet's HMAC status matches * the incoming policy. * * called with rcu_read_lock() */ bool seg6_hmac_validate_skb(struct sk_buff *skb) { u8 hmac_output[SEG6_HMAC_FIELD_LEN]; struct net *net = dev_net(skb->dev); struct seg6_hmac_info *hinfo; struct sr6_tlv_hmac *tlv; struct ipv6_sr_hdr *srh; struct inet6_dev *idev; int require_hmac; idev = __in6_dev_get(skb->dev); srh = (struct ipv6_sr_hdr *)skb_transport_header(skb); tlv = seg6_get_tlv_hmac(srh); require_hmac = READ_ONCE(idev->cnf.seg6_require_hmac); /* mandatory check but no tlv */ if (require_hmac > 0 && !tlv) return false; /* no check */ if (require_hmac < 0) return true; /* check only if present */ if (require_hmac == 0 && !tlv) return true; /* now, seg6_require_hmac >= 0 && tlv */ hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid)); if (!hinfo) return false; if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output)) return false; if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0) return false; return true; } EXPORT_SYMBOL(seg6_hmac_validate_skb); /* called with 
rcu_read_lock() */ struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key) { struct seg6_pernet_data *sdata = seg6_pernet(net); struct seg6_hmac_info *hinfo; hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params); return hinfo; } EXPORT_SYMBOL(seg6_hmac_info_lookup); int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) { struct seg6_pernet_data *sdata = seg6_pernet(net); int err; err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); return err; } EXPORT_SYMBOL(seg6_hmac_info_add); int seg6_hmac_info_del(struct net *net, u32 key) { struct seg6_pernet_data *sdata = seg6_pernet(net); struct seg6_hmac_info *hinfo; int err = -ENOENT; hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params); if (!hinfo) goto out; err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node, rht_params); if (err) goto out; seg6_hinfo_release(hinfo); out: return err; } EXPORT_SYMBOL(seg6_hmac_info_del); int seg6_push_hmac(struct net *net, struct in6_addr *saddr, struct ipv6_sr_hdr *srh) { struct seg6_hmac_info *hinfo; struct sr6_tlv_hmac *tlv; int err = -ENOENT; tlv = seg6_get_tlv_hmac(srh); if (!tlv) return -EINVAL; rcu_read_lock(); hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid)); if (!hinfo) goto out; memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN); err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac); out: rcu_read_unlock(); return err; } EXPORT_SYMBOL(seg6_push_hmac); static int seg6_hmac_init_algo(void) { struct seg6_hmac_algo *algo; struct crypto_shash *tfm; struct shash_desc *shash; int i, alg_count, cpu; int ret = -ENOMEM; alg_count = ARRAY_SIZE(hmac_algos); for (i = 0; i < alg_count; i++) { struct crypto_shash **p_tfm; int shsize; algo = &hmac_algos[i]; algo->tfms = alloc_percpu(struct crypto_shash *); if (!algo->tfms) goto error_out; for_each_possible_cpu(cpu) { tfm = crypto_alloc_shash(algo->name, 0, 0); if (IS_ERR(tfm)) { ret = PTR_ERR(tfm); goto error_out; } p_tfm = 
per_cpu_ptr(algo->tfms, cpu); *p_tfm = tfm; } p_tfm = raw_cpu_ptr(algo->tfms); tfm = *p_tfm; shsize = sizeof(*shash) + crypto_shash_descsize(tfm); algo->shashs = alloc_percpu(struct shash_desc *); if (!algo->shashs) goto error_out; for_each_possible_cpu(cpu) { shash = kzalloc_node(shsize, GFP_KERNEL, cpu_to_node(cpu)); if (!shash) goto error_out; *per_cpu_ptr(algo->shashs, cpu) = shash; } } return 0; error_out: seg6_hmac_exit(); return ret; } int __init seg6_hmac_init(void) { return seg6_hmac_init_algo(); } int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); return rhashtable_init(&sdata->hmac_infos, &rht_params); } void seg6_hmac_exit(void) { struct seg6_hmac_algo *algo = NULL; struct crypto_shash *tfm; struct shash_desc *shash; int i, alg_count, cpu; alg_count = ARRAY_SIZE(hmac_algos); for (i = 0; i < alg_count; i++) { algo = &hmac_algos[i]; if (algo->shashs) { for_each_possible_cpu(cpu) { shash = *per_cpu_ptr(algo->shashs, cpu); kfree(shash); } free_percpu(algo->shashs); } if (algo->tfms) { for_each_possible_cpu(cpu) { tfm = *per_cpu_ptr(algo->tfms, cpu); crypto_free_shash(tfm); } free_percpu(algo->tfms); } } } EXPORT_SYMBOL(seg6_hmac_exit); void __net_exit seg6_hmac_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL); } EXPORT_SYMBOL(seg6_hmac_net_exit);
6 6 6 41 6 9 52 159 212 3 206 279 82 4 203 191 15 164 11 10 130 7 34 31 3 62 99 68 118 8 109 1 13 3 7 164 164 7 156 1 16 18 2 102 11 8 11 11 170 169 150 10 130 51 140 17 17 12 5 304 2 17 303 21 283 75 41 41 211 212 212 338 316 21 336 41 8 33 115 3 210 15 15 198 198 229 126 237 238 310 40 272 261 261 129 156 13 1 1 3 19 20 7 21 12 31 31 32 32 1 7 1 1 1 4 84 1 1 82 77 23 59 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 
431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 
931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/hashtable.h> #include <linux/io_uring.h> #include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "alloc_cache.h" #include "refs.h" #include "napi.h" #include "opdef.h" #include "kbuf.h" #include "poll.h" #include "cancel.h" struct io_poll_update { struct file *file; u64 old_user_data; u64 new_user_data; __poll_t events; bool update_events; bool update_user_data; }; struct io_poll_table { struct poll_table_struct pt; struct io_kiocb *req; int nr_entries; int error; bool owning; /* output value, set only if arm poll returns >0 */ __poll_t result_mask; }; #define IO_POLL_CANCEL_FLAG BIT(31) #define IO_POLL_RETRY_FLAG BIT(30) #define IO_POLL_REF_MASK GENMASK(29, 0) /* * We usually have 1-2 refs taken, 128 is more than enough and we want to * maximise the margin between this amount and the moment when it overflows. */ #define IO_POLL_REF_BIAS 128 #define IO_WQE_F_DOUBLE 1 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key); static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe) { unsigned long priv = (unsigned long)wqe->private; return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE); } static inline bool wqe_is_double(struct wait_queue_entry *wqe) { unsigned long priv = (unsigned long)wqe->private; return priv & IO_WQE_F_DOUBLE; } static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) { int v; /* * poll_refs are already elevated and we don't have much hope for * grabbing the ownership. Instead of incrementing set a retry flag * to notify the loop that there might have been some change. 
*/ v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs); if (v & IO_POLL_REF_MASK) return false; return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can * bump it and acquire ownership. It's disallowed to modify requests while not * owning it, that prevents from races for enqueueing task_work's and b/w * arming poll and wakeups. */ static inline bool io_poll_get_ownership(struct io_kiocb *req) { if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) return io_poll_get_ownership_slowpath(req); return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } static void io_poll_mark_cancelled(struct io_kiocb *req) { atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); } static struct io_poll *io_poll_get_double(struct io_kiocb *req) { /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ if (req->opcode == IORING_OP_POLL_ADD) return req->async_data; return req->apoll->double_poll; } static struct io_poll *io_poll_get_single(struct io_kiocb *req) { if (req->opcode == IORING_OP_POLL_ADD) return io_kiocb_to_cmd(req, struct io_poll); return &req->apoll->poll; } static void io_poll_req_insert(struct io_kiocb *req) { struct io_hash_table *table = &req->ctx->cancel_table; u32 index = hash_long(req->cqe.user_data, table->hash_bits); lockdep_assert_held(&req->ctx->uring_lock); hlist_add_head(&req->hash_node, &table->hbs[index].list); } static void io_init_poll_iocb(struct io_poll *poll, __poll_t events) { poll->head = NULL; #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) /* mask in events that we always want/need */ poll->events = events | IO_POLL_UNMASK; INIT_LIST_HEAD(&poll->wait.entry); init_waitqueue_func_entry(&poll->wait, io_poll_wake); } static inline void io_poll_remove_entry(struct io_poll *poll) { struct wait_queue_head *head = smp_load_acquire(&poll->head); if (head) { spin_lock_irq(&head->lock); list_del_init(&poll->wait.entry); 
poll->head = NULL; spin_unlock_irq(&head->lock); } } static void io_poll_remove_entries(struct io_kiocb *req) { /* * Nothing to do if neither of those flags are set. Avoid dipping * into the poll/apoll/double cachelines if we can. */ if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) return; /* * While we hold the waitqueue lock and the waitqueue is nonempty, * wake_up_pollfree() will wait for us. However, taking the waitqueue * lock in the first place can race with the waitqueue being freed. * * We solve this as eventpoll does: by taking advantage of the fact that * all users of wake_up_pollfree() will RCU-delay the actual free. If * we enter rcu_read_lock() and see that the pointer to the queue is * non-NULL, we can then lock it without the memory being freed out from * under us. * * Keep holding rcu_read_lock() as long as we hold the queue lock, in * case the caller deletes the entry from the queue, leaving it empty. * In that case, only RCU prevents the queue memory from being freed. */ rcu_read_lock(); if (req->flags & REQ_F_SINGLE_POLL) io_poll_remove_entry(io_poll_get_single(req)); if (req->flags & REQ_F_DOUBLE_POLL) io_poll_remove_entry(io_poll_get_double(req)); rcu_read_unlock(); } enum { IOU_POLL_DONE = 0, IOU_POLL_NO_ACTION = 1, IOU_POLL_REMOVE_POLL_USE_RES = 2, IOU_POLL_REISSUE = 3, IOU_POLL_REQUEUE = 4, }; static void __io_poll_execute(struct io_kiocb *req, int mask) { unsigned flags = 0; io_req_set_res(req, mask, 0); req->io_task_work.func = io_poll_task_func; trace_io_uring_task_add(req, mask); if (!(req->flags & REQ_F_POLL_NO_LAZY)) flags = IOU_F_TWQ_LAZY_WAKE; __io_req_task_work_add(req, flags); } static inline void io_poll_execute(struct io_kiocb *req, int res) { if (io_poll_get_ownership(req)) __io_poll_execute(req, res); } /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. * * Returns a negative error on failure. 
IOU_POLL_NO_ACTION when no action * require, which is either spurious wakeup or multishot CQE is served. * IOU_POLL_DONE when it's done with the request, then the mask is stored in * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. */ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; if (unlikely(io_should_terminate_tw())) return -ECANCELED; do { v = atomic_read(&req->poll_refs); if (unlikely(v != 1)) { /* tw should be the owner and so have some refs */ if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) return IOU_POLL_NO_ACTION; if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; /* * cqe.res contains only events of the first wake up * and all others are to be lost. Redo vfs_poll() to get * up to date state. */ if ((v & IO_POLL_REF_MASK) != 1) req->cqe.res = 0; if (v & IO_POLL_RETRY_FLAG) { req->cqe.res = 0; /* * We won't find new events that came in between * vfs_poll and the ref put unless we clear the * flag in advance. */ atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs); v &= ~IO_POLL_RETRY_FLAG; } } /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; /* * We got woken with a mask, but someone else got to * it first. The above vfs_poll() doesn't add us back * to the waitqueue, so if we get nothing back, we * should be safe and attempt a reissue. 
*/ if (unlikely(!req->cqe.res)) { /* Multishot armed need not reissue */ if (!(req->apoll_events & EPOLLONESHOT)) continue; return IOU_POLL_REISSUE; } } if (unlikely(req->cqe.res & EPOLLERR)) req_set_fail(req); if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; /* multishot, just fill a CQE and proceed */ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } } else { int ret = io_poll_issue(req, tw); if (ret == IOU_COMPLETE) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) return IOU_POLL_REQUEUE; if (ret != IOU_RETRY && ret < 0) return ret; } /* force the next iteration to vfs_poll() */ req->cqe.res = 0; /* * Release all references, retry if someone tried to restart * task_work while we were executing it. */ v &= IO_POLL_REF_MASK; } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); io_napi_add(req); return IOU_POLL_NO_ACTION; } void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) { int ret; ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { io_kbuf_recycle(req, 0); return; } else if (ret == IOU_POLL_REQUEUE) { io_kbuf_recycle(req, 0); __io_poll_execute(req, 0); return; } io_poll_remove_entries(req); /* task_work always has ->uring_lock held */ hash_del(&req->hash_node); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { struct io_poll *poll; poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { io_req_task_submit(req, tw); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; req_set_fail(req); } io_req_set_res(req, req->cqe.res, 0); io_req_task_complete(req, tw); } else { io_tw_lock(req->ctx, tw); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) io_req_task_complete(req, tw); else if (ret == 
IOU_POLL_DONE || ret == IOU_POLL_REISSUE) io_req_task_submit(req, tw); else io_req_defer_failed(req, ret); } } static void io_poll_cancel_req(struct io_kiocb *req) { io_poll_mark_cancelled(req); /* kick tw, which should complete the request */ io_poll_execute(req, 0); } #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll) { io_poll_mark_cancelled(req); /* we have to kick tw in case it's not already */ io_poll_execute(req, 0); /* * If the waitqueue is being freed early but someone is already * holds ownership over it, we have to tear down the request as * best we can. That means immediately removing the request from * its waitqueue and preventing all further accesses to the * waitqueue via the request. */ list_del_init(&poll->wait.entry); /* * Careful: this *must* be the last step, since as soon * as req->head is NULL'ed out, the request can be * completed and freed, since aio_poll_complete_work() * will no longer need to take the waitqueue lock. */ smp_store_release(&poll->head, NULL); return 1; } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wqe_to_req(wait); struct io_poll *poll = container_of(wait, struct io_poll, wait); __poll_t mask = key_to_poll(key); if (unlikely(mask & POLLFREE)) return io_pollfree_wake(req, poll); /* for instances that support it check for an event match first */ if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) return 0; if (io_poll_get_ownership(req)) { /* * If we trigger a multishot poll off our own wakeup path, * disable multishot as there is a circular dependency between * CQ posting and triggering the event. 
*/ if (mask & EPOLL_URING_WAKE) poll->events |= EPOLLONESHOT; /* optional, saves extra locking for removal in tw handler */ if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); poll->head = NULL; if (wqe_is_double(wait)) req->flags &= ~REQ_F_DOUBLE_POLL; else req->flags &= ~REQ_F_SINGLE_POLL; } __io_poll_execute(req, mask); } return 1; } /* fails only when polling is already completing by the first entry */ static bool io_poll_double_prepare(struct io_kiocb *req) { struct wait_queue_head *head; struct io_poll *poll = io_poll_get_single(req); /* head is RCU protected, see io_poll_remove_entries() comments */ rcu_read_lock(); head = smp_load_acquire(&poll->head); /* * poll arm might not hold ownership and so race for req->flags with * io_poll_wake(). There is only one poll entry queued, serialise with * it by taking its head lock. As we're still arming the tw hanlder * is not going to be run, so there are no races with it. */ if (head) { spin_lock_irq(&head->lock); req->flags |= REQ_F_DOUBLE_POLL; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; spin_unlock_irq(&head->lock); } rcu_read_unlock(); return !!head; } static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, struct wait_queue_head *head, struct io_poll **poll_ptr) { struct io_kiocb *req = pt->req; unsigned long wqe_private = (unsigned long) req; /* * The file being polled uses multiple waitqueues for poll handling * (e.g. one for read, one for write). Setup a separate io_poll * if this happens. 
*/ if (unlikely(pt->nr_entries)) { struct io_poll *first = poll; /* double add on the same waitqueue head, ignore */ if (first->head == head) return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { if ((*poll_ptr)->head == head) return; pt->error = -EINVAL; return; } poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; return; } /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; io_init_poll_iocb(poll, first->events); if (!io_poll_double_prepare(req)) { /* the request is completing, just back off */ kfree(poll); return; } *poll_ptr = poll; } else { /* fine to modify, there is no poll queued to race with us */ req->flags |= REQ_F_SINGLE_POLL; } pt->nr_entries++; poll->head = head; poll->wait.private = (void *) wqe_private; if (poll->events & EPOLLEXCLUSIVE) { add_wait_queue_exclusive(head, &poll->wait); } else { add_wait_queue(head, &poll->wait); } } static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); struct io_poll *poll = io_kiocb_to_cmd(pt->req, struct io_poll); __io_queue_proc(poll, pt, head, (struct io_poll **) &pt->req->async_data); } static bool io_poll_can_finish_inline(struct io_kiocb *req, struct io_poll_table *pt) { return pt->owning || io_poll_get_ownership(req); } static void io_poll_add_hash(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; io_ring_submit_lock(ctx, issue_flags); io_poll_req_insert(req); io_ring_submit_unlock(ctx, issue_flags); } /* * Returns 0 when it's handed over for polling. The caller owns the requests if * it returns non-zero, but otherwise should not touch it. Negative values * contain an error code. When the result is >0, the polling has completed * inline and ipt.result_mask is set to the mask. 
*/ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll *poll, struct io_poll_table *ipt, __poll_t mask, unsigned issue_flags) { INIT_HLIST_NODE(&req->hash_node); io_init_poll_iocb(poll, mask); poll->file = req->file; req->apoll_events = poll->events; ipt->pt._key = mask; ipt->req = req; ipt->error = 0; ipt->nr_entries = 0; /* * Polling is either completed here or via task_work, so if we're in the * task context we're naturally serialised with tw by merit of running * the same task. When it's io-wq, take the ownership to prevent tw * from running. However, when we're in the task context, skip taking * it as an optimisation. * * Note: even though the request won't be completed/freed, without * ownership we still can race with io_poll_wake(). * io_poll_can_finish_inline() tries to deal with that. */ ipt->owning = issue_flags & IO_URING_F_UNLOCKED; atomic_set(&req->poll_refs, (int)ipt->owning); /* * Exclusive waits may only wake a limited amount of entries * rather than all of them, this may interfere with lazy * wake if someone does wait(events > 1). Ensure we don't do * lazy wake for those, as we need to process each one as they * come in. 
*/ if (poll->events & EPOLLEXCLUSIVE) req->flags |= REQ_F_POLL_NO_LAZY; mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); if (!io_poll_can_finish_inline(req, ipt)) { io_poll_mark_cancelled(req); return 0; } else if (mask && (poll->events & EPOLLET)) { ipt->result_mask = mask; return 1; } return ipt->error ?: -EINVAL; } if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { if (!io_poll_can_finish_inline(req, ipt)) { io_poll_add_hash(req, issue_flags); return 0; } io_poll_remove_entries(req); ipt->result_mask = mask; /* no one else has access to the req, forget about the ref */ return 1; } io_poll_add_hash(req, issue_flags); if (mask && (poll->events & EPOLLET) && io_poll_can_finish_inline(req, ipt)) { __io_poll_execute(req, mask); return 0; } io_napi_add(req); if (ipt->owning) { /* * Try to release ownership. If we see a change of state, e.g. * poll was waken up, queue up a tw, it'll deal with it. */ if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1) __io_poll_execute(req, 0); } return 0; } static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); struct async_poll *apoll = pt->req->apoll; __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } /* * We can't reliably detect loops in repeated poll triggers and issue * subsequently failing. But rather than fail these immediately, allow a * certain amount of retries before we give up. Given that this condition * should _rarely_ trigger even once, we should be fine with a larger value. 
*/ #define APOLL_MAX_RETRY 128 static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else { if (!(issue_flags & IO_URING_F_UNLOCKED)) apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC); else apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (!apoll) return NULL; apoll->poll.retries = APOLL_MAX_RETRY; } apoll->double_poll = NULL; req->apoll = apoll; if (unlikely(!--apoll->poll.retries)) return NULL; return apoll; } int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; if (!io_file_can_poll(req)) return IO_APOLL_ABORTED; if (!(req->flags & REQ_F_APOLL_MULTISHOT)) mask |= EPOLLONESHOT; if (def->pollin) { mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ if (req->flags & REQ_F_CLEAR_POLLIN) mask &= ~EPOLLIN; } else { mask |= EPOLLOUT | EPOLLWRNORM; } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; apoll = io_req_alloc_apoll(req, issue_flags); if (!apoll) return IO_APOLL_ABORTED; req->flags &= ~(REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL); req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; io_kbuf_recycle(req, issue_flags); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) return ret > 0 ? 
IO_APOLL_READY : IO_APOLL_ABORTED; trace_io_uring_poll_arm(req, mask, apoll->poll.events); return IO_APOLL_OK; } /* * Returns true if we found and killed one or more poll requests */ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits; struct hlist_node *tmp; struct io_kiocb *req; bool found = false; int i; lockdep_assert_held(&ctx->uring_lock); for (i = 0; i < nr_buckets; i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { if (io_match_task_safe(req, tctx, cancel_all)) { hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } } } return found; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd) { struct io_kiocb *req; u32 index = hash_long(cd->data, ctx->cancel_table.hash_bits); struct io_hash_bucket *hb = &ctx->cancel_table.hbs[index]; hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { if (io_cancel_match_sequence(req, cd->seq)) continue; } return req; } return NULL; } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits; struct io_kiocb *req; int i; for (i = 0; i < nr_buckets; i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; hlist_for_each_entry(req, &hb->list, hash_node) { if (io_cancel_req_match(req, cd)) return req; } } return NULL; } static int io_poll_disarm(struct io_kiocb *req) { if (!req) return -ENOENT; if (!io_poll_get_ownership(req)) return -EALREADY; io_poll_remove_entries(req); hash_del(&req->hash_node); return 0; } static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; if (cd->flags & 
(IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd); else req = io_poll_find(ctx, false, cd); if (req) { io_poll_cancel_req(req); return 0; } return -ENOENT; } int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags) { int ret; io_ring_submit_lock(ctx, issue_flags); ret = __io_poll_cancel(ctx, cd); io_ring_submit_unlock(ctx, issue_flags); return ret; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, unsigned int flags) { u32 events; events = READ_ONCE(sqe->poll32_events); #ifdef __BIG_ENDIAN events = swahw32(events); #endif if (!(flags & IORING_POLL_ADD_MULTI)) events |= EPOLLONESHOT; if (!(flags & IORING_POLL_ADD_LEVEL)) events |= EPOLLET; return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET)); } int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_update *upd = io_kiocb_to_cmd(req, struct io_poll_update); u32 flags; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | IORING_POLL_ADD_MULTI)) return -EINVAL; /* meaningless without update */ if (flags == IORING_POLL_ADD_MULTI) return -EINVAL; upd->old_user_data = READ_ONCE(sqe->addr); upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; upd->new_user_data = READ_ONCE(sqe->off); if (!upd->update_user_data && upd->new_user_data) return -EINVAL; if (upd->update_events) upd->events = io_poll_parse_events(sqe, flags); else if (sqe->poll32_events) return -EINVAL; return 0; } int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll); u32 flags; if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; if ((flags & 
IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) return -EINVAL; poll->events = io_poll_parse_events(sqe, flags); return 0; } int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll); struct io_poll_table ipt; int ret; ipt.pt._qproc = io_poll_queue_proc; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); return IOU_COMPLETE; } return ret ?: IOU_ISSUE_SKIP_COMPLETE; } int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); struct io_ring_ctx *ctx = req->ctx; struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; struct io_kiocb *preq; int ret2, ret = 0; io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd); ret2 = io_poll_disarm(preq); if (ret2) { ret = ret2; goto out; } if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) { ret = -EFAULT; goto out; } if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ if (poll_update->update_events) { struct io_poll *poll = io_kiocb_to_cmd(preq, struct io_poll); poll->events &= ~0xffff; poll->events |= poll_update->events & 0xffff; poll->events |= IO_POLL_UNMASK; } if (poll_update->update_user_data) preq->cqe.user_data = poll_update->new_user_data; ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED); /* successfully updated, don't complete poll request */ if (!ret2 || ret2 == -EIOCBQUEUED) goto out; } req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); preq->io_task_work.func = io_req_task_complete; io_req_task_work_add(preq); out: io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) { req_set_fail(req); return ret; } /* complete update request, we're done with it */ io_req_set_res(req, ret, 0); return IOU_COMPLETE; }
111 22 22 22 22 22 22 31 111 97 86 11 49 3 52 4 39 36 100 19 87 2 43 25 56 18 114 120 13 21 22 34 70 36 36 36 48 26 38 26 3 16 38 9 8 1 3 7 7 2 5 7 2 2 47 48 617 604 24 127 84 90 28 62 28 28 51 50 18 18 33 51 51 49 43 7 4 3 47 4 51 15 9 8 5 5 5 55 6 2 1 46 2 2 45 45 4 37 37 1 3 39 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 44 8 2 1 43 1 41 1 1 1 42 42 43 57 57 11 46 52 530 9 523 19 511 530 485 45 32 17 16 16 16 2 8 65 106 83 1 2 1 1 1 80 70 11 2 2 11 65 26 1 12 1 1 7 25 9 3 25 27 3 968 24 21 3 529 12 508 520 520 520 510 10 48 39 9 43 1 33 10 29 13 27 16 38 4 39 3 40 2 38 4 34 8 32 10 40 2 40 40 33 7 10 29 24 8 9 1 5 7 4 6 1 5 696 749 3 7 5 1 2 4 2 7 2 4 5 739 30 965 420 420 418 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 
359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 
859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 
1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 
1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 
2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include 
<linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/icmpv6.h>
#include <linux/init.h>
#include <linux/route.h>
#include <linux/rtnetlink.h>
#include <linux/netfilter_ipv6.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/etherdevice.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>

#include <net/icmp.h>
#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/ip6_tunnel.h>
#include <net/xfrm.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/netdev_lock.h>
#include <net/dst_metadata.h>
#include <net/inet_dscp.h>

MODULE_AUTHOR("Ville Nuorvala");
MODULE_DESCRIPTION("IPv6 tunneling device");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("ip6tnl");
MODULE_ALIAS_NETDEV("ip6tnl0");

/* The endpoint hash table has 1 << IP6_TUNNEL_HASH_SIZE_SHIFT buckets. */
#define IP6_TUNNEL_HASH_SIZE_SHIFT  5
#define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT)

/* Passed as the log_ecn_err argument of __ip6_tnl_rcv() by ipxip6_rcv(). */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

/*
 * Bucket index for an address pair.  The two per-address hashes are XORed,
 * so the result does not depend on the argument order; ip6_tnl_lookup()
 * relies on that when it substitutes the unspecified address for one side.
 */
static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
{
	u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);

	return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT);
}

static int ip6_tnl_dev_init(struct net_device *dev);
static void ip6_tnl_dev_setup(struct net_device *dev);
static struct rtnl_link_ops ip6_link_ops __read_mostly;

/* Key handed to net_generic() to find this module's struct ip6_tnl_net. */
static unsigned int ip6_tnl_net_id __read_mostly;

/* Per network namespace tunnel state. */
struct ip6_tnl_net {
	/* the IPv6 tunnel fallback device */
	struct net_device *fb_tnl_dev;
	/* lists for storing tunnels in use */
	struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE];
	struct ip6_tnl __rcu *tnls_wc[1];
	/*
	 * Indexed as tnls[prio][hash] by ip6_tnl_bucket().
	 * NOTE(review): presumably tnls[0] points at tnls_wc and tnls[1] at
	 * tnls_r_l; the initialisation is outside this part of the file.
	 */
	struct ip6_tnl __rcu **tnls[2];
	/* tunnel whose parms.collect_md is set, see ip6_tnl_link() */
	struct ip6_tnl __rcu *collect_md_tun;
};

/* Non-zero when the kernel is built with CONFIG_MPLS. */
static inline int ip6_tnl_mpls_supported(void)
{
	return IS_ENABLED(CONFIG_MPLS);
}

/*
 * Walk an RCU-protected tunnel list starting at @start.  The iterator is
 * not a macro parameter: a variable named "t" must exist in the scope
 * where the macro is expanded.
 */
#define for_each_ip6_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))

/**
 * ip6_tnl_lookup - fetch tunnel matching the end-point addresses
 *   @net: network namespace
 *   @link: ifindex of underlying interface
 *   @remote: the address of the tunnel exit-point
 *   @local: the address of the tunnel entry-point
 *
 * Description:
 *   Only tunnels whose device has IFF_UP set are considered.  Three passes
 *   are made over the hash table: exact (@local, @remote) match, then
 *   tunnels with a matching local address and an unspecified remote, then
 *   tunnels with a matching remote address and an unspecified local.  In
 *   every pass a tunnel whose parms.link equals @link is returned at once;
 *   otherwise a candidate is remembered (the exact-match pass keeps the
 *   last one seen, the later passes only fill an empty slot).
 *
 * Return:
 *   tunnel matching given end-points if found,
 *   else the collect_md tunnel if its device is up,
 *   else fallback tunnel if its device is up,
 *   else %NULL
 **/

static struct ip6_tnl *
ip6_tnl_lookup(struct net *net, int link,
	       const struct in6_addr *remote, const struct in6_addr *local)
{
	unsigned int hash = HASH(remote, local);
	struct ip6_tnl *t, *cand = NULL;
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
	struct in6_addr any;

	/* pass 1: both end-points match */
	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
		if (!ipv6_addr_equal(local, &t->parms.laddr) ||
		    !ipv6_addr_equal(remote, &t->parms.raddr) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (link == t->parms.link)
			return t;
		else
			cand = t;
	}

	/* pass 2: local matches, tunnel's remote is unspecified */
	memset(&any, 0, sizeof(any));
	hash = HASH(&any, local);
	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
		if (!ipv6_addr_equal(local, &t->parms.laddr) ||
		    !ipv6_addr_any(&t->parms.raddr) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (link == t->parms.link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* pass 3: remote matches, tunnel's local is unspecified */
	hash = HASH(remote, &any);
	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
		if (!ipv6_addr_equal(remote, &t->parms.raddr) ||
		    !ipv6_addr_any(&t->parms.laddr) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (link == t->parms.link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* address match on a different link beats the catch-all tunnels */
	if (cand)
		return cand;

	t = rcu_dereference(ip6n->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	t = rcu_dereference(ip6n->tnls_wc[0]);
	if (t && (t->dev->flags & IFF_UP))
		return t;

	return NULL;
}

/**
 * ip6_tnl_bucket - get head of list matching given tunnel parameters
 *   @ip6n: the private data for ip6_tnl in the netns
 *   @p: parameters containing tunnel end-points
 *
 * Description:
 *   ip6_tnl_bucket() returns the head of the list matching the
 *   &struct in6_addr entries laddr and raddr in @p.
* * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. 
* * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. 
* * Return: * matching tunnel or error pointer **/

/*
 * Note on the return value: when @create is set and a tunnel with the same
 * laddr, raddr and link already exists, ERR_PTR(-EEXIST) is returned; when
 * @create is clear and nothing matches, ERR_PTR(-ENODEV) is returned.
 */
static struct ip6_tnl *ip6_tnl_locate(struct net *net,
		struct __ip6_tnl_parm *p, int create)
{
	const struct in6_addr *remote = &p->raddr;
	const struct in6_addr *local = &p->laddr;
	struct ip6_tnl __rcu **tp;
	struct ip6_tnl *t;
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);

	for (tp = ip6_tnl_bucket(ip6n, p);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next) {
		if (ipv6_addr_equal(local, &t->parms.laddr) &&
		    ipv6_addr_equal(remote, &t->parms.raddr) &&
		    p->link == t->parms.link) {
			if (create)
				return ERR_PTR(-EEXIST);

			return t;
		}
	}
	if (!create)
		return ERR_PTR(-ENODEV);
	return ip6_tnl_create(net, p);
}

/**
 * ip6_tnl_dev_uninit - tunnel device uninitializer
 *   @dev: the device to be destroyed
 *
 * Description:
 *   ip6_tnl_dev_uninit() removes tunnel from its list
 **/

static void
ip6_tnl_dev_uninit(struct net_device *dev)
{
	struct ip6_tnl *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);

	/* the fallback device lives in tnls_wc[0], not in a hash bucket */
	if (dev == ip6n->fb_tnl_dev)
		RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
	else
		ip6_tnl_unlink(ip6n, t);
	dst_cache_reset(&t->dst_cache);
	netdev_put(dev, &t->dev_tracker);
}

/**
 * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option
 *   @skb: received socket buffer
 *   @raw: the ICMPv6 error message data
 *
 * Description:
 *   @raw must point at an IPv6 header inside @skb's data.  The extension
 *   header chain behind it is walked; the walk stops at a fragment header
 *   with a non-zero frag_off or when the packet is too short.  Because
 *   pskb_may_pull() is called, pointers into the skb obtained before the
 *   call may be stale afterwards (see the note at the call site in
 *   ipxip6_tnl_xmit()).
 *
 * Return:
 *   0 if none was found,
 *   else index to encapsulation limit
 *   (the index is relative to @raw)
 **/

__u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
{
	const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw;
	unsigned int nhoff = raw - skb->data;
	unsigned int off = nhoff + sizeof(*ipv6h);
	u8 nexthdr = ipv6h->nexthdr;

	while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
		struct ipv6_opt_hdr *hdr;
		u16 optlen;

		/* first make the fixed part of the header readable ... */
		if (!pskb_may_pull(skb, off + sizeof(*hdr)))
			break;

		hdr = (struct ipv6_opt_hdr *)(skb->data + off);
		if (nexthdr == NEXTHDR_FRAGMENT) {
			optlen = 8;
		} else if (nexthdr == NEXTHDR_AUTH) {
			optlen = ipv6_authlen(hdr);
		} else {
			optlen = ipv6_optlen(hdr);
		}

		/* ... then the whole extension header */
		if (!pskb_may_pull(skb, off + optlen))
			break;

		/* re-derive hdr from skb->data after the second pull */
		hdr = (struct ipv6_opt_hdr *)(skb->data + off);
		if (nexthdr == NEXTHDR_FRAGMENT) {
			struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr;

			if (frag_hdr->frag_off)
				break;
		}
		if (nexthdr == NEXTHDR_DEST) {
			/* options start after the 2-byte header prefix */
			u16 i = 2;

			while (1) {
				struct ipv6_tlv_tnl_enc_lim *tel;

				/* No more room for encapsulation limit */
				if (i + sizeof(*tel) > optlen)
					break;

				tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i);
				/* return index of option if found and valid */
				if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
				    tel->length == 1)
					return i + off - nhoff;
				/* else jump to next option */
				if (tel->type)
					i += tel->length + 2;
				else
					i++;
			}
		}
		nexthdr = hdr->nexthdr;
		off += optlen;
	}
	return 0;
}
EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim);

/* ip6_tnl_err() should handle errors in the tunnel according to the
 * specifications in RFC 2473.
 *
 * @type, @code and @info are in/out parameters; @msg is an output.
 */
static int
ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
	    u8 *type, u8 *code, int *msg, __u32 *info, int offset)
{
	const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data;
	struct net *net = dev_net(skb->dev);
	u8 rel_type = ICMPV6_DEST_UNREACH;
	u8 rel_code = ICMPV6_ADDR_UNREACH;
	__u32 rel_info = 0;
	struct ip6_tnl *t;
	int err = -ENOENT;
	int rel_msg = 0;
	u8 tproto;
	__u16 len;

	/* If the packet doesn't contain the original IPv6 header we are
	   in trouble since we might need the source address for further
	   processing of the error.
*/ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; 
if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, 
GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) 
flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! 
*/
	if (tunnel->dev->type == ARPHRD_ETHER) {
		if (!pskb_may_pull(skb, ETH_HLEN)) {
			DEV_STATS_INC(tunnel->dev, rx_length_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}

		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
		skb_reset_mac_header(skb);
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_reset_network_header(skb);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	/* Get the outer header. */
	ipv6h = (struct ipv6hdr *)(skb->head + nh);

	memset(skb->cb, 0, sizeof(struct inet6_skb_parm));

	__skb_tunnel_rx(skb, tunnel->dev, tunnel->net);

	err = dscp_ecn_decapsulate(tunnel, ipv6h, skb);
	if (unlikely(err)) {
		if (log_ecn_err)
			net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n",
					     &ipv6h->saddr,
					     ipv6_get_dsfield(ipv6h));
		/* a result of 1 is only logged, larger values drop the packet */
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	/* on the drop path both the metadata dst and the skb are released */
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}

/*
 * Exported receive entry point: selects the IPv4 DSCP/ECN decapsulation
 * helper when @tpi->proto is ETH_P_IP and the IPv6 one otherwise, then
 * hands the packet to __ip6_tnl_rcv().
 */
int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
		const struct tnl_ptk_info *tpi,
		struct metadata_dst *tun_dst,
		bool log_ecn_err)
{
	int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
				    const struct ipv6hdr *ipv6h,
				    struct sk_buff *skb);

	dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate;
	if (tpi->proto == htons(ETH_P_IP))
		dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate;

	return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
			     log_ecn_err);
}
EXPORT_SYMBOL(ip6_tnl_rcv);

static const struct tnl_ptk_info tpi_v6 = {
	/* no tunnel info required for ipxip6. */
	.proto = htons(ETH_P_IPV6),
};

static const struct tnl_ptk_info tpi_v4 = {
	/* no tunnel info required for ipxip6. */
	.proto = htons(ETH_P_IP),
};

static const struct tnl_ptk_info tpi_mpls = {
	/* no tunnel info required for mplsip6. */
	.proto = htons(ETH_P_MPLS_UC),
};

/*
 * Common receive path for IPv4, IPv6 and MPLS payloads.
 *
 * Returns -1, leaving @skb untouched, when no tunnel matches the outer
 * addresses.  In every other case the skb is consumed: either it is freed
 * here and 0 is returned, or it is passed to __ip6_tnl_rcv() and that
 * function's result is returned.
 */
static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
		      const struct tnl_ptk_info *tpi,
		      int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
						  const struct ipv6hdr *ipv6h,
						  struct sk_buff *skb))
{
	struct ip6_tnl *t;
	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
	struct metadata_dst *tun_dst = NULL;
	int ret = -1;

	rcu_read_lock();
	t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr);

	if (t) {
		u8 tproto = READ_ONCE(t->parms.proto);

		/* a tunnel proto of 0 accepts every payload protocol */
		if (tproto != ipproto && tproto != 0)
			goto drop;
		if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
			goto drop;
		/* reload the header pointer after the policy check */
		ipv6h = ipv6_hdr(skb);
		if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr))
			goto drop;
		if (iptunnel_pull_header(skb, 0, tpi->proto, false))
			goto drop;
		if (t->parms.collect_md) {
			IP_TUNNEL_DECLARE_FLAGS(flags) = { };

			tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0);
			if (!tun_dst)
				goto drop;
		}
		ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error);
	}

	rcu_read_unlock();

	return ret;

drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return 0;
}

/* Receive handler for IPv4 payloads (IPPROTO_IPIP). */
static int ip4ip6_rcv(struct sk_buff *skb)
{
	return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4,
			  ip4ip6_dscp_ecn_decapsulate);
}

/* Receive handler for IPv6 payloads (IPPROTO_IPV6). */
static int ip6ip6_rcv(struct sk_buff *skb)
{
	return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6,
			  ip6ip6_dscp_ecn_decapsulate);
}

/* Receive handler for MPLS payloads (IPPROTO_MPLS). */
static int mplsip6_rcv(struct sk_buff *skb)
{
	return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls,
			  mplsip6_dscp_ecn_decapsulate);
}

/*
 * Tx options carrying a single 8-byte destination options header, which
 * init_tel_txopt() fills with a tunnel encapsulation limit option.
 */
struct ipv6_tel_txoption {
	struct ipv6_txoptions ops;
	__u8 dst_opt[8];
};

static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit)
{
	memset(opt, 0, sizeof(struct ipv6_tel_txoption));

	opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT;
	opt->dst_opt[3]
= 1;
	opt->dst_opt[4] = encap_limit;
	opt->dst_opt[5] = IPV6_TLV_PADN;
	opt->dst_opt[6] = 1;

	opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt;
	opt->ops.opt_nflen = 8;
}

/**
 * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
 *   @t: the outgoing tunnel device
 *   @hdr: IPv6 header from the incoming packet
 *
 * Description:
 *   Avoid trivial tunneling loop by checking that tunnel exit-point
 *   doesn't match source of incoming packet.
 *
 * Return:
 *   true if the packet's source address equals the tunnel's remote
 *   address (conflict),
 *   false otherwise
 **/

static inline bool
ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr)
{
	return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
}

/*
 * Decide whether @t may transmit from @laddr to @raddr.
 *
 * Returns 1 if transmission is allowed and 0 otherwise.  collect_md
 * tunnels are always allowed.  For other tunnels the XMIT capability is
 * required, the local address must pass ipv6_chk_addr_and_flags(), and,
 * unless IP6_TNL_F_ALLOW_LOCAL_REMOTE is set or @raddr is multicast, the
 * remote address must fail it.  The two refusal cases are logged.
 */
int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
		     const struct in6_addr *laddr,
		     const struct in6_addr *raddr)
{
	struct __ip6_tnl_parm *p = &t->parms;
	int ret = 0;
	struct net *net = t->net;

	if (t->parms.collect_md)
		return 1;

	if ((p->flags & IP6_TNL_F_CAP_XMIT) ||
	    ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
	     (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) {
		struct net_device *ldev = NULL;

		/* ldev is only valid inside this RCU read-side section */
		rcu_read_lock();
		if (p->link)
			ldev = dev_get_by_index_rcu(net, p->link);

		if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false,
						      0, IFA_F_TENTATIVE)))
			pr_warn_ratelimited("%s xmit: Local address not yet configured!\n",
					    p->name);
		else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
			 !ipv6_addr_is_multicast(raddr) &&
			 unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev,
							  true, 0, IFA_F_TENTATIVE)))
			pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n",
					    p->name);
		else
			ret = 1;
		rcu_read_unlock();
	}
	return ret;
}
EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl);

/**
 * ip6_tnl_xmit - encapsulate packet and send
 *   @skb: the outgoing socket buffer
 *   @dev: the outgoing tunnel device
 *   @dsfield: dscp code for outer header
 *   @fl6: flow of tunneled packet
 *   @encap_limit: encapsulation limit
 *   @pmtu: Path MTU is stored if packet is too big
 *   @proto: next header value
 *
 * Description:
 *   Build new header and do some sanity checks on the packet before sending
 *   it.
 *
 *   A negative @encap_limit means that no encapsulation limit option is
 *   added.  @fl6 is modified (flowlabel, and in some paths daddr/saddr).
 *
 * Return:
 *   0 on success
 *   -1 fail
 *   %-EMSGSIZE message too big. return mtu in this case.
 *   Other negative values can be returned as well: the error from
 *   xfrm_lookup() or from ip6_tnl_encap().
 **/

int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
		 struct flowi6 *fl6, int encap_limit, __u32 *pmtu,
		 __u8 proto)
{
	struct ip6_tnl *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ipv6hdr *ipv6h;
	struct ipv6_tel_txoption opt;
	struct dst_entry *dst = NULL, *ndst = NULL;
	struct net_device *tdev;
	int mtu;
	unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0;
	unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
	unsigned int max_headroom = psh_hlen;
	__be16 payload_protocol;
	bool use_cache = false;
	u8 hop_limit;
	int err = -1;

	payload_protocol = skb_protocol(skb, true);

	if (t->parms.collect_md) {
		hop_limit = skb_tunnel_info(skb)->key.ttl;
		/*
		 * Jumps into the "if (!dst)" block below: the NBMA handling,
		 * the dst cache and ip6_tnl_xmit_ctl() are all skipped.
		 */
		goto route_lookup;
	} else {
		hop_limit = t->parms.hop_limit;
	}

	/* NBMA tunnel */
	if (ipv6_addr_any(&t->parms.raddr)) {
		if (payload_protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			struct neighbour *neigh;
			int addr_type;

			if (!skb_dst(skb))
				goto tx_err_link_failure;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_err_link_failure;

			addr6 = (struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			/* no usable next hop: tunnel to the inner destination */
			if (addr_type == IPV6_ADDR_ANY)
				addr6 = &ipv6_hdr(skb)->daddr;

			memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
			neigh_release(neigh);
		} else if (payload_protocol == htons(ETH_P_IP)) {
			const struct rtable *rt = skb_rtable(skb);

			if (!rt)
				goto tx_err_link_failure;

			if (rt->rt_gw_family == AF_INET6)
				memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr));
		}
	} else if (t->parms.proto != 0 && !(t->parms.flags &
					    (IP6_TNL_F_USE_ORIG_TCLASS |
					     IP6_TNL_F_USE_ORIG_FWMARK))) {
		/* enable the cache only if neither the outer protocol nor the
		 * routing decision depends on the current inner header value
		 */
		use_cache = true;
	}

	if (use_cache)
		dst = dst_cache_get(&t->dst_cache);

	if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
		goto tx_err_link_failure;

	if (!dst) {
route_lookup:
		/* add dsfield to flowlabel for route lookup */
		fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel);

		dst = ip6_route_output(net, NULL, fl6);

		if (dst->error)
			goto tx_err_link_failure;
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			/* dst is reset so the error path does not release an ERR_PTR */
			dst = NULL;
			goto tx_err_link_failure;
		}
		if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) &&
		    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
				       &fl6->daddr, 0, &fl6->saddr))
			goto tx_err_link_failure;
		/* remember that this dst is fresh, for the cache update below */
		ndst = dst;
	}

	tdev = dst->dev;

	if (tdev == dev) {
		DEV_STATS_INC(dev, collisions);
		net_warn_ratelimited("%s: Local routing loop detected!\n",
				     t->parms.name);
		goto tx_err_dst_release;
	}
	mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen;
	if (encap_limit >= 0) {
		/* room for the 8-byte destination options header */
		max_headroom += 8;
		mtu -= 8;
	}
	mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
		       IPV6_MIN_MTU : IPV4_MIN_MTU);

	skb_dst_update_pmtu_no_confirm(skb, mtu);
	if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
		*pmtu = mtu;
		err = -EMSGSIZE;
		goto tx_err_dst_release;
	}

	if (t->err_count > 0) {
		if (time_before(jiffies,
				t->err_time + IP6TUNNEL_ERR_TIMEO)) {
			t->err_count--;

			dst_link_failure(skb);
		} else {
			t->err_count = 0;
		}
	}

	skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom += LL_RESERVED_SPACE(tdev);

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb;

		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto tx_err_dst_release;

		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		/*
		 * NOTE(review): from here on the caller's skb pointer refers
		 * to a consumed buffer, yet the error exits further down
		 * (encap type check, ip6_tnl_encap() failure) still return
		 * non-zero, and ip6_tnl_start_xmit() frees its skb on error.
		 * Confirm those exits cannot be taken after a reallocation.
		 */
		consume_skb(skb);
		skb = new_skb;
	}

	if (t->parms.collect_md) {
		if (t->encap.type != TUNNEL_ENCAP_NONE)
			goto tx_err_dst_release;
	} else {
		if (use_cache && ndst)
			dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
	}
	/* the skb now holds the dst; later exits must not release it again */
	skb_dst_set(skb, dst);

	/* a configured hop limit of 0 means: inherit from the inner packet */
	if (hop_limit == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			hop_limit = ip_hdr(skb)->ttl;
		else if (payload_protocol == htons(ETH_P_IPV6))
			hop_limit = ipv6_hdr(skb)->hop_limit;
		else
			hop_limit = ip6_dst_hoplimit(dst);
	}

	/* Calculate max headroom for all the headers and adjust
	 * needed_headroom if necessary.
	 */
	max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr)
			+ dst->header_len + t->hlen;
	if (max_headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, max_headroom);

	err = ip6_tnl_encap(skb, t, &proto, fl6);
	if (err)
		return err;

	if (encap_limit >= 0) {
		init_tel_txopt(&opt, encap_limit);
		ipv6_push_frag_opts(skb, &opt.ops, &proto);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	ipv6h = ipv6_hdr(skb);
	ip6_flow_hdr(ipv6h, dsfield,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
	ipv6h->hop_limit = hop_limit;
	ipv6h->nexthdr = proto;
	ipv6h->saddr = fl6->saddr;
	ipv6h->daddr = fl6->daddr;
	ip6tunnel_xmit(NULL, skb, dev);
	return 0;
tx_err_link_failure:
	DEV_STATS_INC(dev, tx_carrier_errors);
	dst_link_failure(skb);
tx_err_dst_release:
	dst_release(dst);
	return err;
}
EXPORT_SYMBOL(ip6_tnl_xmit);

static inline int
ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev,
		u8 protocol)
{
	struct ip6_tnl *t = netdev_priv(dev);
	struct ipv6hdr *ipv6h;
	const struct iphdr *iph;
	int encap_limit = -1;
	__u16 offset;
	struct flowi6 fl6;
	__u8 dsfield, orig_dsfield;
	__u32 mtu;
	u8 tproto;
	int err;

	tproto = READ_ONCE(t->parms.proto);
	if (tproto != protocol && tproto != 0)
		return -1;

	if (t->parms.collect_md) {
		struct ip_tunnel_info *tun_info;
		const struct ip_tunnel_key *key;

		tun_info = skb_tunnel_info(skb);
		if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
			     ip_tunnel_info_af(tun_info) != AF_INET6))
			return -1;
		key = &tun_info->key;
		memset(&fl6, 0, sizeof(fl6));
		fl6.flowi6_proto = protocol;
		fl6.saddr = key->u.ipv6.src;
		fl6.daddr = key->u.ipv6.dst;
		fl6.flowlabel = key->label;
		dsfield = key->tos;
		switch (protocol) {
		case IPPROTO_IPIP:
			iph = ip_hdr(skb);
			orig_dsfield = ipv4_get_dsfield(iph);
			break;
		case IPPROTO_IPV6:
			ipv6h = ipv6_hdr(skb);
			orig_dsfield = ipv6_get_dsfield(ipv6h);
			break;
		default:
			orig_dsfield = dsfield;
			break;
		}
	} else {
		if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
			encap_limit =
t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. 
*/ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & 
(IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { /* for default tnl0 device allow to change only the proto */ t->parms.proto = p->proto; netdev_state_change(t->dev); } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm 
*u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: 
err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return 
READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = 
true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct 
netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; struct net *net; int err; net = params->link_net ? 
: dev_net(dev); ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) return -EINVAL; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) 
+ /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, 
[IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. 
*/ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); 
goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup);
128 128 107 107 1 1 153 70 31 31 152 26 26 25 469 448 21 73 68 5 475 141 29 29 29 29 29 1 1 107 10 117 117 494 356 72 117 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. 
*/ #include <rdma/rdma_cm.h> #include <rdma/ib_verbs.h> #include <rdma/restrack.h> #include <rdma/rdma_counter.h> #include <linux/mutex.h> #include <linux/sched/task.h> #include <linux/pid_namespace.h> #include "cma_priv.h" #include "restrack.h" /** * rdma_restrack_init() - initialize and allocate resource tracking * @dev: IB device * * Return: 0 on success */ int rdma_restrack_init(struct ib_device *dev) { struct rdma_restrack_root *rt; int i; dev->res = kcalloc(RDMA_RESTRACK_MAX, sizeof(*rt), GFP_KERNEL); if (!dev->res) return -ENOMEM; rt = dev->res; for (i = 0; i < RDMA_RESTRACK_MAX; i++) xa_init_flags(&rt[i].xa, XA_FLAGS_ALLOC); return 0; } /** * rdma_restrack_clean() - clean resource tracking * @dev: IB device */ void rdma_restrack_clean(struct ib_device *dev) { struct rdma_restrack_root *rt = dev->res; int i; for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) { struct xarray *xa = &dev->res[i].xa; WARN_ON(!xa_empty(xa)); xa_destroy(xa); } kfree(rt); } /** * rdma_restrack_count() - the current usage of specific object * @dev: IB device * @type: actual type of object to operate * @show_details: count driver specific objects */ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, bool show_details) { struct rdma_restrack_root *rt = &dev->res[type]; struct rdma_restrack_entry *e; XA_STATE(xas, &rt->xa, 0); u32 cnt = 0; xa_lock(&rt->xa); xas_for_each(&xas, e, U32_MAX) { if (xa_get_mark(&rt->xa, e->id, RESTRACK_DD) && !show_details) continue; cnt++; } xa_unlock(&rt->xa); return cnt; } EXPORT_SYMBOL(rdma_restrack_count); static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) { switch (res->type) { case RDMA_RESTRACK_PD: return container_of(res, struct ib_pd, res)->device; case RDMA_RESTRACK_CQ: return container_of(res, struct ib_cq, res)->device; case RDMA_RESTRACK_QP: return container_of(res, struct ib_qp, res)->device; case RDMA_RESTRACK_CM_ID: return container_of(res, struct rdma_id_private, res)->id.device; case RDMA_RESTRACK_MR: 
return container_of(res, struct ib_mr, res)->device; case RDMA_RESTRACK_CTX: return container_of(res, struct ib_ucontext, res)->device; case RDMA_RESTRACK_COUNTER: return container_of(res, struct rdma_counter, res)->device; case RDMA_RESTRACK_SRQ: return container_of(res, struct ib_srq, res)->device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; } } /** * rdma_restrack_attach_task() - attach the task onto this resource, * valid for user space restrack entries. * @res: resource entry * @task: the task to attach */ static void rdma_restrack_attach_task(struct rdma_restrack_entry *res, struct task_struct *task) { if (WARN_ON_ONCE(!task)) return; if (res->task) put_task_struct(res->task); get_task_struct(task); res->task = task; res->user = true; } /** * rdma_restrack_set_name() - set the task for this resource * @res: resource entry * @caller: kernel name, the current task will be used if the caller is NULL. */ void rdma_restrack_set_name(struct rdma_restrack_entry *res, const char *caller) { if (caller) { res->kern_name = caller; return; } rdma_restrack_attach_task(res, current); } EXPORT_SYMBOL(rdma_restrack_set_name); /** * rdma_restrack_parent_name() - set the restrack name properties based * on parent restrack * @dst: destination resource entry * @parent: parent resource entry */ void rdma_restrack_parent_name(struct rdma_restrack_entry *dst, const struct rdma_restrack_entry *parent) { if (rdma_is_kernel_res(parent)) dst->kern_name = parent->kern_name; else rdma_restrack_attach_task(dst, parent->task); } EXPORT_SYMBOL(rdma_restrack_parent_name); /** * rdma_restrack_new() - Initializes new restrack entry to allow _put() interface * to release memory in fully automatic way. 
* @res: Entry to initialize * @type: REstrack type */ void rdma_restrack_new(struct rdma_restrack_entry *res, enum rdma_restrack_type type) { kref_init(&res->kref); init_completion(&res->comp); res->type = type; } EXPORT_SYMBOL(rdma_restrack_new); /** * rdma_restrack_add() - add object to the reource tracking database * @res: resource entry */ void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); struct rdma_restrack_root *rt; int ret = 0; if (!dev) return; if (res->no_track) goto out; rt = &dev->res[res->type]; if (res->type == RDMA_RESTRACK_QP) { /* Special case to ensure that LQPN points to right QP */ struct ib_qp *qp = container_of(res, struct ib_qp, res); WARN_ONCE(qp->qp_num >> 24 || qp->port >> 8, "QP number 0x%0X and port 0x%0X", qp->qp_num, qp->port); res->id = qp->qp_num; if (qp->qp_type == IB_QPT_SMI || qp->qp_type == IB_QPT_GSI) res->id |= qp->port << 24; ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL); if (ret) res->id = 0; if (qp->qp_type >= IB_QPT_DRIVER) xa_set_mark(&rt->xa, res->id, RESTRACK_DD); } else if (res->type == RDMA_RESTRACK_COUNTER) { /* Special case to ensure that cntn points to right counter */ struct rdma_counter *counter; counter = container_of(res, struct rdma_counter, res); ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL); res->id = ret ? 0 : counter->id; } else { ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, &rt->next_id, GFP_KERNEL); ret = (ret < 0) ? ret : 0; } out: if (!ret) res->valid = true; } EXPORT_SYMBOL(rdma_restrack_add); int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) { return kref_get_unless_zero(&res->kref); } EXPORT_SYMBOL(rdma_restrack_get); /** * rdma_restrack_get_byid() - translate from ID to restrack object * @dev: IB device * @type: resource track type * @id: ID to take a look * * Return: Pointer to restrack entry or -ENOENT in case of error. 
*/ struct rdma_restrack_entry * rdma_restrack_get_byid(struct ib_device *dev, enum rdma_restrack_type type, u32 id) { struct rdma_restrack_root *rt = &dev->res[type]; struct rdma_restrack_entry *res; xa_lock(&rt->xa); res = xa_load(&rt->xa, id); if (!res || !rdma_restrack_get(res)) res = ERR_PTR(-ENOENT); xa_unlock(&rt->xa); return res; } EXPORT_SYMBOL(rdma_restrack_get_byid); static void restrack_release(struct kref *kref) { struct rdma_restrack_entry *res; res = container_of(kref, struct rdma_restrack_entry, kref); if (res->task) { put_task_struct(res->task); res->task = NULL; } complete(&res->comp); } int rdma_restrack_put(struct rdma_restrack_entry *res) { return kref_put(&res->kref, restrack_release); } EXPORT_SYMBOL(rdma_restrack_put); /** * rdma_restrack_del() - delete object from the reource tracking database * @res: resource entry */ void rdma_restrack_del(struct rdma_restrack_entry *res) { struct rdma_restrack_entry *old; struct rdma_restrack_root *rt; struct ib_device *dev; if (!res->valid) { if (res->task) { put_task_struct(res->task); res->task = NULL; } return; } if (res->no_track) goto out; dev = res_to_dev(res); if (WARN_ON(!dev)) return; rt = &dev->res[res->type]; old = xa_erase(&rt->xa, res->id); WARN_ON(old != res); out: res->valid = false; rdma_restrack_put(res); wait_for_completion(&res->comp); } EXPORT_SYMBOL(rdma_restrack_del);
96 8080 9478 6046 96 96 96 96 11129 9642 11003 5777 2184 5671 287 288 96 96 292 95 96 95 23 23 23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Implementation of the access vector table type. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* Updated: Frank Mayer <mayerf@tresys.com> and * Karl MacMillan <kmacmillan@tresys.com> * Added conditional policy language extensions * Copyright (C) 2003 Tresys Technology, LLC * * Updated: Yuichi Nakamura <ynakam@hitachisoft.jp> * Tuned number of hash slots for avtab to reduce memory usage */ #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include "avtab.h" #include "policydb.h" static struct kmem_cache *avtab_node_cachep __ro_after_init; static struct kmem_cache *avtab_xperms_cachep __ro_after_init; /* Based on MurmurHash3, written by Austin Appleby and placed in the * public domain. 
*/ static inline u32 avtab_hash(const struct avtab_key *keyp, u32 mask) { static const u32 c1 = 0xcc9e2d51; static const u32 c2 = 0x1b873593; static const u32 r1 = 15; static const u32 r2 = 13; static const u32 m = 5; static const u32 n = 0xe6546b64; u32 hash = 0; #define mix(input) \ do { \ u32 v = input; \ v *= c1; \ v = (v << r1) | (v >> (32 - r1)); \ v *= c2; \ hash ^= v; \ hash = (hash << r2) | (hash >> (32 - r2)); \ hash = hash * m + n; \ } while (0) mix(keyp->target_class); mix(keyp->target_type); mix(keyp->source_type); #undef mix hash ^= hash >> 16; hash *= 0x85ebca6b; hash ^= hash >> 13; hash *= 0xc2b2ae35; hash ^= hash >> 16; return hash & mask; } static struct avtab_node *avtab_insert_node(struct avtab *h, struct avtab_node **dst, const struct avtab_key *key, const struct avtab_datum *datum) { struct avtab_node *newnode; struct avtab_extended_perms *xperms; newnode = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL); if (newnode == NULL) return NULL; newnode->key = *key; if (key->specified & AVTAB_XPERMS) { xperms = kmem_cache_zalloc(avtab_xperms_cachep, GFP_KERNEL); if (xperms == NULL) { kmem_cache_free(avtab_node_cachep, newnode); return NULL; } *xperms = *(datum->u.xperms); newnode->datum.u.xperms = xperms; } else { newnode->datum.u.data = datum->u.data; } newnode->next = *dst; *dst = newnode; h->nel++; return newnode; } static int avtab_node_cmp(const struct avtab_key *key1, const struct avtab_key *key2) { u16 specified = key1->specified & ~(AVTAB_ENABLED | AVTAB_ENABLED_OLD); if (key1->source_type == key2->source_type && key1->target_type == key2->target_type && key1->target_class == key2->target_class && (specified & key2->specified)) return 0; if (key1->source_type < key2->source_type) return -1; if (key1->source_type == key2->source_type && key1->target_type < key2->target_type) return -1; if (key1->source_type == key2->source_type && key1->target_type == key2->target_type && key1->target_class < key2->target_class) return -1; return 1; } static 
int avtab_insert(struct avtab *h, const struct avtab_key *key, const struct avtab_datum *datum) { u32 hvalue; struct avtab_node *prev, *cur, *newnode; int cmp; if (!h || !h->nslot || h->nel == U32_MAX) return -EINVAL; hvalue = avtab_hash(key, h->mask); for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { cmp = avtab_node_cmp(key, &cur->key); /* extended perms may not be unique */ if (cmp == 0 && !(key->specified & AVTAB_XPERMS)) return -EEXIST; if (cmp <= 0) break; } newnode = avtab_insert_node(h, prev ? &prev->next : &h->htable[hvalue], key, datum); if (!newnode) return -ENOMEM; return 0; } /* Unlike avtab_insert(), this function allow multiple insertions of the same * key/specified mask into the table, as needed by the conditional avtab. * It also returns a pointer to the node inserted. */ struct avtab_node *avtab_insert_nonunique(struct avtab *h, const struct avtab_key *key, const struct avtab_datum *datum) { u32 hvalue; struct avtab_node *prev, *cur; int cmp; if (!h || !h->nslot || h->nel == U32_MAX) return NULL; hvalue = avtab_hash(key, h->mask); for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { cmp = avtab_node_cmp(key, &cur->key); if (cmp <= 0) break; } return avtab_insert_node(h, prev ? 
&prev->next : &h->htable[hvalue], key, datum); } /* This search function returns a node pointer, and can be used in * conjunction with avtab_search_next_node() */ struct avtab_node *avtab_search_node(struct avtab *h, const struct avtab_key *key) { u32 hvalue; struct avtab_node *cur; int cmp; if (!h || !h->nslot) return NULL; hvalue = avtab_hash(key, h->mask); for (cur = h->htable[hvalue]; cur; cur = cur->next) { cmp = avtab_node_cmp(key, &cur->key); if (cmp == 0) return cur; if (cmp < 0) break; } return NULL; } struct avtab_node *avtab_search_node_next(struct avtab_node *node, u16 specified) { struct avtab_key tmp_key; struct avtab_node *cur; int cmp; if (!node) return NULL; tmp_key = node->key; tmp_key.specified = specified; for (cur = node->next; cur; cur = cur->next) { cmp = avtab_node_cmp(&tmp_key, &cur->key); if (cmp == 0) return cur; if (cmp < 0) break; } return NULL; } void avtab_destroy(struct avtab *h) { u32 i; struct avtab_node *cur, *temp; if (!h) return; for (i = 0; i < h->nslot; i++) { cur = h->htable[i]; while (cur) { temp = cur; cur = cur->next; if (temp->key.specified & AVTAB_XPERMS) kmem_cache_free(avtab_xperms_cachep, temp->datum.u.xperms); kmem_cache_free(avtab_node_cachep, temp); } } kvfree(h->htable); h->htable = NULL; h->nel = 0; h->nslot = 0; h->mask = 0; } void avtab_init(struct avtab *h) { h->htable = NULL; h->nel = 0; h->nslot = 0; h->mask = 0; } static int avtab_alloc_common(struct avtab *h, u32 nslot) { if (!nslot) return 0; h->htable = kvcalloc(nslot, sizeof(void *), GFP_KERNEL); if (!h->htable) return -ENOMEM; h->nslot = nslot; h->mask = nslot - 1; return 0; } int avtab_alloc(struct avtab *h, u32 nrules) { int rc; u32 nslot = 0; if (nrules != 0) { nslot = nrules > 3 ? 
rounddown_pow_of_two(nrules / 2) : 2; if (nslot > MAX_AVTAB_HASH_BUCKETS) nslot = MAX_AVTAB_HASH_BUCKETS; rc = avtab_alloc_common(h, nslot); if (rc) return rc; } pr_debug("SELinux: %d avtab hash slots, %d rules.\n", nslot, nrules); return 0; } int avtab_alloc_dup(struct avtab *new, const struct avtab *orig) { return avtab_alloc_common(new, orig->nslot); } #ifdef CONFIG_SECURITY_SELINUX_DEBUG void avtab_hash_eval(struct avtab *h, const char *tag) { u32 i, chain_len, slots_used, max_chain_len; unsigned long long chain2_len_sum; struct avtab_node *cur; slots_used = 0; max_chain_len = 0; chain2_len_sum = 0; for (i = 0; i < h->nslot; i++) { cur = h->htable[i]; if (cur) { slots_used++; chain_len = 0; while (cur) { chain_len++; cur = cur->next; } if (chain_len > max_chain_len) max_chain_len = chain_len; chain2_len_sum += (unsigned long long)chain_len * chain_len; } } pr_debug("SELinux: %s: %d entries and %d/%d buckets used, " "longest chain length %d, sum of chain length^2 %llu\n", tag, h->nel, slots_used, h->nslot, max_chain_len, chain2_len_sum); } #endif /* CONFIG_SECURITY_SELINUX_DEBUG */ /* clang-format off */ static const uint16_t spec_order[] = { AVTAB_ALLOWED, AVTAB_AUDITDENY, AVTAB_AUDITALLOW, AVTAB_TRANSITION, AVTAB_CHANGE, AVTAB_MEMBER, AVTAB_XPERMS_ALLOWED, AVTAB_XPERMS_AUDITALLOW, AVTAB_XPERMS_DONTAUDIT }; /* clang-format on */ int avtab_read_item(struct avtab *a, struct policy_file *fp, struct policydb *pol, int (*insertf)(struct avtab *a, const struct avtab_key *k, const struct avtab_datum *d, void *p), void *p, bool conditional) { __le16 buf16[4]; u16 enabled; u32 items, items2, val, i; struct avtab_key key; struct avtab_datum datum; struct avtab_extended_perms xperms; __le32 buf32[ARRAY_SIZE(xperms.perms.p)]; int rc; unsigned int set, vers = pol->policyvers; memset(&key, 0, sizeof(struct avtab_key)); memset(&datum, 0, sizeof(struct avtab_datum)); if (vers < POLICYDB_VERSION_AVTAB) { rc = next_entry(buf32, fp, sizeof(u32)); if (rc) { pr_err("SELinux: avtab: 
truncated entry\n"); return rc; } items2 = le32_to_cpu(buf32[0]); if (items2 > ARRAY_SIZE(buf32)) { pr_err("SELinux: avtab: entry overflow\n"); return -EINVAL; } rc = next_entry(buf32, fp, sizeof(u32) * items2); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } items = 0; val = le32_to_cpu(buf32[items++]); key.source_type = (u16)val; if (key.source_type != val) { pr_err("SELinux: avtab: truncated source type\n"); return -EINVAL; } val = le32_to_cpu(buf32[items++]); key.target_type = (u16)val; if (key.target_type != val) { pr_err("SELinux: avtab: truncated target type\n"); return -EINVAL; } val = le32_to_cpu(buf32[items++]); key.target_class = (u16)val; if (key.target_class != val) { pr_err("SELinux: avtab: truncated target class\n"); return -EINVAL; } val = le32_to_cpu(buf32[items++]); enabled = (val & AVTAB_ENABLED_OLD) ? AVTAB_ENABLED : 0; if (!(val & (AVTAB_AV | AVTAB_TYPE))) { pr_err("SELinux: avtab: null entry\n"); return -EINVAL; } if ((val & AVTAB_AV) && (val & AVTAB_TYPE)) { pr_err("SELinux: avtab: entry has both access vectors and types\n"); return -EINVAL; } if (val & AVTAB_XPERMS) { pr_err("SELinux: avtab: entry has extended permissions\n"); return -EINVAL; } for (i = 0; i < ARRAY_SIZE(spec_order); i++) { if (val & spec_order[i]) { key.specified = spec_order[i] | enabled; datum.u.data = le32_to_cpu(buf32[items++]); rc = insertf(a, &key, &datum, p); if (rc) return rc; } } if (items != items2) { pr_err("SELinux: avtab: entry only had %d items, expected %d\n", items2, items); return -EINVAL; } return 0; } rc = next_entry(buf16, fp, sizeof(u16) * 4); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } items = 0; key.source_type = le16_to_cpu(buf16[items++]); key.target_type = le16_to_cpu(buf16[items++]); key.target_class = le16_to_cpu(buf16[items++]); key.specified = le16_to_cpu(buf16[items++]); if (!policydb_type_isvalid(pol, key.source_type) || !policydb_type_isvalid(pol, key.target_type) || !policydb_class_isvalid(pol, 
key.target_class)) { pr_err("SELinux: avtab: invalid type or class\n"); return -EINVAL; } set = hweight16(key.specified & (AVTAB_XPERMS | AVTAB_TYPE | AVTAB_AV)); if (!set || set > 1) { pr_err("SELinux: avtab: more than one specifier\n"); return -EINVAL; } if ((vers < POLICYDB_VERSION_XPERMS_IOCTL) && (key.specified & AVTAB_XPERMS)) { pr_err("SELinux: avtab: policy version %u does not " "support extended permissions rules and one " "was specified\n", vers); return -EINVAL; } else if ((vers < POLICYDB_VERSION_COND_XPERMS) && (key.specified & AVTAB_XPERMS) && conditional) { pr_err("SELinux: avtab: policy version %u does not " "support extended permissions rules in conditional " "policies and one was specified\n", vers); return -EINVAL; } else if (key.specified & AVTAB_XPERMS) { memset(&xperms, 0, sizeof(struct avtab_extended_perms)); rc = next_entry(&xperms.specified, fp, sizeof(u8)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } rc = next_entry(&xperms.driver, fp, sizeof(u8)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } rc = next_entry(buf32, fp, sizeof(u32) * ARRAY_SIZE(xperms.perms.p)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } for (i = 0; i < ARRAY_SIZE(xperms.perms.p); i++) xperms.perms.p[i] = le32_to_cpu(buf32[i]); datum.u.xperms = &xperms; } else { rc = next_entry(buf32, fp, sizeof(u32)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } datum.u.data = le32_to_cpu(*buf32); } if ((key.specified & AVTAB_TYPE) && !policydb_type_isvalid(pol, datum.u.data)) { pr_err("SELinux: avtab: invalid type\n"); return -EINVAL; } return insertf(a, &key, &datum, p); } static int avtab_insertf(struct avtab *a, const struct avtab_key *k, const struct avtab_datum *d, void *p) { return avtab_insert(a, k, d); } int avtab_read(struct avtab *a, struct policy_file *fp, struct policydb *pol) { int rc; __le32 buf[1]; u32 nel, i; rc = next_entry(buf, fp, sizeof(u32)); if (rc < 0) { pr_err("SELinux: 
avtab: truncated table\n"); goto bad; } nel = le32_to_cpu(buf[0]); if (!nel) { pr_err("SELinux: avtab: table is empty\n"); rc = -EINVAL; goto bad; } rc = avtab_alloc(a, nel); if (rc) goto bad; for (i = 0; i < nel; i++) { rc = avtab_read_item(a, fp, pol, avtab_insertf, NULL, false); if (rc) { if (rc == -ENOMEM) pr_err("SELinux: avtab: out of memory\n"); else if (rc == -EEXIST) pr_err("SELinux: avtab: duplicate entry\n"); goto bad; } } rc = 0; out: return rc; bad: avtab_destroy(a); goto out; } int avtab_write_item(struct policydb *p, const struct avtab_node *cur, struct policy_file *fp) { __le16 buf16[4]; __le32 buf32[ARRAY_SIZE(cur->datum.u.xperms->perms.p)]; int rc; unsigned int i; buf16[0] = cpu_to_le16(cur->key.source_type); buf16[1] = cpu_to_le16(cur->key.target_type); buf16[2] = cpu_to_le16(cur->key.target_class); buf16[3] = cpu_to_le16(cur->key.specified); rc = put_entry(buf16, sizeof(u16), 4, fp); if (rc) return rc; if (cur->key.specified & AVTAB_XPERMS) { rc = put_entry(&cur->datum.u.xperms->specified, sizeof(u8), 1, fp); if (rc) return rc; rc = put_entry(&cur->datum.u.xperms->driver, sizeof(u8), 1, fp); if (rc) return rc; for (i = 0; i < ARRAY_SIZE(cur->datum.u.xperms->perms.p); i++) buf32[i] = cpu_to_le32(cur->datum.u.xperms->perms.p[i]); rc = put_entry(buf32, sizeof(u32), ARRAY_SIZE(cur->datum.u.xperms->perms.p), fp); } else { buf32[0] = cpu_to_le32(cur->datum.u.data); rc = put_entry(buf32, sizeof(u32), 1, fp); } if (rc) return rc; return 0; } int avtab_write(struct policydb *p, struct avtab *a, struct policy_file *fp) { u32 i; int rc = 0; struct avtab_node *cur; __le32 buf[1]; buf[0] = cpu_to_le32(a->nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (i = 0; i < a->nslot; i++) { for (cur = a->htable[i]; cur; cur = cur->next) { rc = avtab_write_item(p, cur, fp); if (rc) return rc; } } return rc; } void __init avtab_cache_init(void) { avtab_node_cachep = KMEM_CACHE(avtab_node, SLAB_PANIC); avtab_xperms_cachep = 
KMEM_CACHE(avtab_extended_perms, SLAB_PANIC); }
20 20 20 20 20 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 
1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 /* * llc_c_ac.c - actions performed during connection state transition. * * Description: * Functions in this module are implementation of connection component actions * Details of actions can be found in IEEE-802.2 standard document. * All functions have one connection and one event as input argument. All of * them return 0 On success and 1 otherwise. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <linux/slab.h> #include <net/llc_conn.h> #include <net/llc_sap.h> #include <net/sock.h> #include <net/llc_c_ev.h> #include <net/llc_c_ac.h> #include <net/llc_c_st.h> #include <net/llc_pdu.h> #include <net/llc.h> static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb); static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb); static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *ev); static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb); static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk, struct sk_buff *skb); static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb); #define INCORRECT 0 int llc_conn_ac_clear_remote_busy(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); if (llc->remote_busy_flag) { u8 nr; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); llc->remote_busy_flag = 0; timer_delete(&llc->busy_state_timer.timer); nr = 
LLC_I_GET_NR(pdu); llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); } return 0; } int llc_conn_ac_conn_ind(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->ind_prim = LLC_CONN_PRIM; return 0; } int llc_conn_ac_conn_confirm(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->cfm_prim = LLC_CONN_PRIM; return 0; } static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->cfm_prim = LLC_DATA_PRIM; return 0; } int llc_conn_ac_data_ind(struct sock *sk, struct sk_buff *skb) { llc_conn_rtn_pdu(sk, skb); return 0; } int llc_conn_ac_disc_ind(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); u8 reason = 0; int rc = 0; if (ev->type == LLC_CONN_EV_TYPE_PDU) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM) reason = LLC_DISC_REASON_RX_DM_RSP_PDU; else if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC) reason = LLC_DISC_REASON_RX_DISC_CMD_PDU; } else if (ev->type == LLC_CONN_EV_TYPE_ACK_TMR) reason = LLC_DISC_REASON_ACK_TMR_EXP; else rc = -EINVAL; if (!rc) { ev->reason = reason; ev->ind_prim = LLC_DISC_PRIM; } return rc; } int llc_conn_ac_disc_confirm(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->reason = ev->status; ev->cfm_prim = LLC_DISC_PRIM; return 0; } int llc_conn_ac_rst_ind(struct sock *sk, struct sk_buff *skb) { u8 reason = 0; int rc = 1; struct llc_conn_state_ev *ev = llc_conn_ev(skb); struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); struct llc_sock *llc = llc_sk(sk); switch (ev->type) { case LLC_CONN_EV_TYPE_PDU: if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR) { reason = LLC_RESET_REASON_LOCAL; rc = 0; } else if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && 
LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME) { reason = LLC_RESET_REASON_REMOTE; rc = 0; } break; case LLC_CONN_EV_TYPE_ACK_TMR: case LLC_CONN_EV_TYPE_P_TMR: case LLC_CONN_EV_TYPE_REJ_TMR: case LLC_CONN_EV_TYPE_BUSY_TMR: if (llc->retry_count > llc->n2) { reason = LLC_RESET_REASON_LOCAL; rc = 0; } break; } if (!rc) { ev->reason = reason; ev->ind_prim = LLC_RESET_PRIM; } return rc; } int llc_conn_ac_rst_confirm(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->reason = 0; ev->cfm_prim = LLC_RESET_PRIM; return 0; } int llc_conn_ac_clear_remote_busy_if_f_eq_1(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && llc_sk(sk)->ack_pf) llc_conn_ac_clear_remote_busy(sk, skb); return 0; } int llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); if (llc->data_flag == 2) timer_delete(&llc->rej_sent_timer.timer); return 0; } int llc_conn_ac_send_disc_cmd_p_set_x(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_disc_cmd(nskb, 1); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); llc_conn_ac_set_p_flag_1(sk, skb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_dm_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); if (nskb) { struct llc_sap *sap = llc->sap; u8 f_bit; llc_pdu_decode_pf_bit(skb, &f_bit); llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, 
LLC_PDU_RSP); llc_pdu_init_as_dm_rsp(nskb, f_bit); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_dm_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_dm_rsp(nskb, 1); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_frmr_rsp_f_set_x(struct sock *sk, struct sk_buff *skb) { u8 f_bit; int rc = -ENOBUFS; struct sk_buff *nskb; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); struct llc_sock *llc = llc_sk(sk); llc->rx_pdu_hdr = *((u32 *)pdu); if (LLC_PDU_IS_CMD(pdu)) llc_pdu_decode_pf_bit(skb, &f_bit); else f_bit = 0; nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, sizeof(struct llc_frmr_info)); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS, llc->vR, INCORRECT); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_resend_frmr_rsp_f_set_0(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, sizeof(struct llc_frmr_info)); if (nskb) { struct llc_sap *sap = llc->sap; struct llc_pdu_sn *pdu = (struct llc_pdu_sn *)&llc->rx_pdu_hdr; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); 
llc_pdu_init_as_frmr_rsp(nskb, pdu, 0, llc->vS, llc->vR, INCORRECT); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_resend_frmr_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) { u8 f_bit; int rc = -ENOBUFS; struct sk_buff *nskb; struct llc_sock *llc = llc_sk(sk); llc_pdu_decode_pf_bit(skb, &f_bit); nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, sizeof(struct llc_frmr_info)); if (nskb) { struct llc_sap *sap = llc->sap; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS, llc->vR, INCORRECT); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) { int rc; struct llc_sock *llc = llc_sk(sk); struct llc_sap *sap = llc->sap; llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; } static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) { int rc; struct llc_sock *llc = llc_sk(sk); struct llc_sap *sap = llc->sap; llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; } int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) { int rc; struct llc_sock *llc = 
llc_sk(sk); struct llc_sap *sap = llc->sap; llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return 0; } int llc_conn_ac_resend_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); u8 nr = LLC_I_GET_NR(pdu); llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); return 0; } int llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr(struct sock *sk, struct sk_buff *skb) { u8 nr; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) llc_conn_send_pdu(sk, nskb); else kfree_skb(skb); } if (rc) { nr = LLC_I_GET_NR(pdu); rc = 0; llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); } return rc; } int llc_conn_ac_resend_i_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); u8 nr = LLC_I_GET_NR(pdu); llc_conn_resend_i_pdu_as_rsp(sk, nr, 1); return 0; } int llc_conn_ac_send_rej_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_rej_cmd(nskb, 1, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int 
llc_conn_ac_send_rej_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
{
	/*
	 * Shape shared by the sender actions in this part of the file:
	 * allocate a frame, fill in the LLC header and PDU, build the MAC
	 * header, then pass the frame to llc_conn_send_pdu(). They return
	 * -ENOBUFS when allocation fails, otherwise the llc_mac_hdr_init()
	 * result; on that failure the new frame is freed.
	 *
	 * This one builds a REJ response with P/F argument 1 and llc->vR.
	 */
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rej_rsp(nskb, 1, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Send a REJ response with P/F argument 0 and llc->vR. */
int llc_conn_ac_send_rej_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rej_rsp(nskb, 0, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Send an RNR command with P/F argument 1 and llc->vR. */
int llc_conn_ac_send_rnr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_CMD);
		llc_pdu_init_as_rnr_cmd(nskb, 1, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Send an RNR response with P/F argument 1 and llc->vR. */
int llc_conn_ac_send_rnr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rnr_rsp(nskb, 1, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Send an RNR response with P/F argument 0 and llc->vR. */
int llc_conn_ac_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/*
 * Mark the remote side busy. Only on the 0 -> 1 transition of
 * remote_busy_flag is the busy-state timer (re)armed. Always returns 0.
 */
int llc_conn_ac_set_remote_busy(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	if (!llc->remote_busy_flag) {
		llc->remote_busy_flag = 1;
		mod_timer(&llc->busy_state_timer.timer,
			  jiffies + llc->busy_state_timer.expire);
	}
	return 0;
}

/*
 * Send an RNR response with P/F argument 0 and llc->vR. The body is the
 * same as llc_conn_ac_send_rnr_xxx_x_set_0().
 */
int llc_conn_ac_opt_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Send an RR command with P/F argument 1 and llc->vR. */
int llc_conn_ac_send_rr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_CMD);
		llc_pdu_init_as_rr_cmd(nskb, 1, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Send an RR response with P/F argument 1 (held in f_bit) and llc->vR. */
int llc_conn_ac_send_rr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;
		u8 f_bit = 1;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rr_rsp(nskb, f_bit, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Acknowledge by sending an RR response with P/F argument 1 and llc->vR. */
int llc_conn_ac_send_ack_rsp_f_set_1(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rr_rsp(nskb, 1, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Send an RR response with P/F argument 0 and llc->vR. */
int llc_conn_ac_send_rr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Acknowledge by sending an RR response with P/F argument 0 and llc->vR. */
int llc_conn_ac_send_ack_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/*
 * Store @value in the connection's p_flag. sk->sk_state_change() is invoked
 * only when the flag goes from non-zero to zero.
 */
void llc_conn_set_p_flag(struct sock *sk, u8 value)
{
	int state_changed = llc_sk(sk)->p_flag && !value;

	llc_sk(sk)->p_flag = value;

	if (state_changed)
		sk->sk_state_change(sk);
}

/*
 * Send a SABME command with P/F argument 1 and then set p_flag to 1.
 * On a loopback device the destination MAC is the device's own address
 * instead of llc->daddr.mac.
 * Returns -ENOBUFS on allocation failure, otherwise the llc_mac_hdr_init()
 * result; p_flag is left untouched on either failure.
 */
int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;
		const u8 *dmac = llc->daddr.mac;

		if (llc->dev->flags & IFF_LOOPBACK)
			dmac = llc->dev->dev_addr;
		llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_CMD);
		llc_pdu_init_as_sabme_cmd(nskb, 1);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, dmac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
		llc_conn_set_p_flag(sk, 1);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/*
 * Send a UA response whose P/F argument is the value decoded from @skb.
 * The decode happens before the allocation result is checked, and nskb->dev
 * is set explicitly to llc->dev.
 * Returns -ENOBUFS on allocation failure, otherwise the llc_mac_hdr_init()
 * result.
 */
int llc_conn_ac_send_ua_rsp_f_set_p(struct sock *sk, struct sk_buff *skb)
{
	u8 f_bit;
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0);

	llc_pdu_decode_pf_bit(skb, &f_bit);
	if (nskb) {
		struct llc_sap *sap = llc->sap;

		nskb->dev = llc->dev;
		llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_ua_rsp(nskb, f_bit);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/* Clear the connection's s_flag. Always returns 0. */
int llc_conn_ac_set_s_flag_0(struct sock *sk, struct sk_buff *skb)
{
	llc_sk(sk)->s_flag = 0;
	return 0;
}

int
llc_conn_ac_set_s_flag_1(struct sock *sk, struct sk_buff *skb)
{
	/* Set the connection's s_flag. Always returns 0. */
	llc_sk(sk)->s_flag = 1;
	return 0;
}

/*
 * Set p_flag to 1 and (re)arm the P/F cycle timer to expire
 * pf_cycle_timer.expire jiffies from now. Always returns 0.
 */
int llc_conn_ac_start_p_timer(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	llc_conn_set_p_flag(sk, 1);
	mod_timer(&llc->pf_cycle_timer.timer,
		  jiffies + llc->pf_cycle_timer.expire);
	return 0;
}

/**
 *	llc_conn_ac_send_ack_if_needed - check if ack is needed
 *	@sk: current connection structure
 *	@skb: current event
 *
 *	Counts the received PDUs that have not been acknowledged yet. Once
 *	the count reaches "npta" (Number of PDUs To Acknowledge), an RR
 *	response is sent as acknowledgement for them, the pending-ack state
 *	is cleared and llc_conn_ac_inc_npta_value() is called. The result of
 *	sending the RR is not checked. Always returns 0.
 */
int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb)
{
	u8 pf_bit;
	struct llc_sock *llc = llc_sk(sk);

	llc_pdu_decode_pf_bit(skb, &pf_bit);
	llc->ack_pf |= pf_bit & 1;
	if (!llc->ack_must_be_send) {
		/* first unacknowledged PDU: remember where counting starts */
		llc->first_pdu_Ns = llc->vR;
		llc->ack_must_be_send = 1;
		llc->ack_pf = pf_bit & 1;
	}
	/* distance from first_pdu_Ns to vR, inclusive, modulo the sequence space */
	if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO)
			% LLC_2_SEQ_NBR_MODULO) >= llc->npta) {
		llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb);
		llc->ack_must_be_send	= 0;
		llc->ack_pf		= 0;
		llc_conn_ac_inc_npta_value(sk, skb);
	}
	return 0;
}

/**
 *	llc_conn_ac_rst_sendack_flag - resets ack_must_be_send flag
 *	@sk: current connection structure
 *	@skb: current event
 *
 *	Clears both ack_must_be_send (set while some received PDU has not
 *	been acknowledged yet) and ack_pf. Always returns 0.
 */
int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb)
{
	llc_sk(sk)->ack_must_be_send = llc_sk(sk)->ack_pf = 0;
	return 0;
}

/**
 *	llc_conn_ac_send_i_rsp_f_set_ackpf - acknowledge received PDUs
 *	@sk: current connection structure
 *	@skb: current event
 *
 *	Sends @skb as an I response PDU whose P/F argument is the ack_pf
 *	flag, acknowledging all received PDUs that have not been acknowledged
 *	yet. ack_pf is set to one when a PDU with the p-bit set to one has
 *	been received. Returns the llc_mac_hdr_init() result; the PDU is only
 *	sent (and V(S) advanced) when that result is 0.
 */
static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk,
					      struct sk_buff *skb)
{
	int rc;
	struct llc_sock *llc = llc_sk(sk);
	struct llc_sap *sap = llc->sap;

	llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap,
			    llc->daddr.lsap, LLC_PDU_RSP);
	llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR);
	rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
	if (likely(!rc)) {
		skb_get(skb);
		llc_conn_send_pdu(sk, skb);
		llc_conn_ac_inc_vs_by_1(sk, skb);
	}
	return rc;
}

/**
 *	llc_conn_ac_send_i_as_ack - sends an I-format PDU to acknowledge rx PDUs
 *	@sk: current connection structure.
 *	@skb: current event.
 *
 *	When an acknowledgement is pending, the outgoing I-format PDU is sent
 *	as a response carrying ack_pf and the pending-ack state is cleared;
 *	this reduces the number of separate acknowledgements (piggybacking).
 *	Otherwise the PDU is sent as an I command with P/F argument 0.
 *	Returns the result of whichever send action was used.
 */
int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);
	int ret;

	if (llc->ack_must_be_send) {
		ret = llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb);
		llc->ack_must_be_send = 0 ;
		llc->ack_pf = 0;
	} else {
		ret = llc_conn_ac_send_i_cmd_p_set_0(sk, skb);
	}

	return ret;
}

/**
 *	llc_conn_ac_send_rr_rsp_f_set_ackpf - ack all rx PDUs not yet acked
 *	@sk: current connection structure.
 *	@skb: current event.
 *
 *	Sends an RR response whose P/F argument is the ack_pf flag, as
 *	acknowledgement of all received PDUs that have not been acknowledged
 *	yet. ack_pf indicates whether a PDU has been received with the p-bit
 *	set to one. Returns -ENOBUFS when no frame could be allocated,
 *	otherwise the llc_mac_hdr_init() result (the new frame is freed on
 *	failure).
 */
static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk,
					       struct sk_buff *skb)
{
	int rc = -ENOBUFS;
	struct llc_sock *llc = llc_sk(sk);
	struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0);

	if (nskb) {
		struct llc_sap *sap = llc->sap;

		llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap,
				    llc->daddr.lsap, LLC_PDU_RSP);
		llc_pdu_init_as_rr_rsp(nskb, llc->ack_pf, llc->vR);
		rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac);
		if (unlikely(rc))
			goto free;
		llc_conn_send_pdu(sk, nskb);
	}
out:
	return rc;
free:
	kfree_skb(nskb);
	goto out;
}

/**
 *	llc_conn_ac_inc_npta_value - tries to make value of npta greater
 *	@sk: current connection structure.
 *	@skb: current event.
 *
 *	Each call decrements "inc_cntr"; when it has reached zero, "npta" is
 *	increased by one (capped at (u8)~LLC_2_SEQ_NBR_MODULO) and both
 *	counters are reloaded with 2. A larger "npta" means fewer
 *	acknowledgements are sent. Always returns 0.
 */
static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	if (!llc->inc_cntr) {
		llc->dec_step = 0;
		llc->dec_cntr = llc->inc_cntr = 2;
		++llc->npta;
		if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO)
			llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO;
	} else
		--llc->inc_cntr;
	return 0;
}

/**
 *	llc_conn_ac_adjust_npta_by_rr - decreases "npta" by one
 *	@sk: current connection structure.
 *	@skb: current event.
 *
 *	While connect_step and remote_busy_flag are both clear and dec_step
 *	is zero, each call decrements "dec_cntr"; when it has reached zero,
 *	"npta" is decreased by one (never below 0) and both counters are
 *	reloaded with 2. If connect_step or remote_busy_flag is set, the call
 *	only clears connect_step. Always returns 0.
 */
int llc_conn_ac_adjust_npta_by_rr(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	if (!llc->connect_step && !llc->remote_busy_flag) {
		if (!llc->dec_step) {
			if (!llc->dec_cntr) {
				llc->inc_cntr = llc->dec_cntr = 2;
				if (llc->npta > 0)
					llc->npta = llc->npta - 1;
			} else
				llc->dec_cntr -=1;
		}
	} else
		llc->connect_step = 0 ;
	return 0;
}

/**
 *	llc_conn_ac_adjust_npta_by_rnr - decreases "npta" by one
 *	@sk: current connection structure.
 *	@skb: current event.
 *
 *	Acts only while remote_busy_flag is set and dec_step is zero: each
 *	call decrements "dec_cntr"; when it has reached zero, "npta" is
 *	decreased by one (never below 0) and both counters are reloaded with
 *	2. Always returns 0.
 */
int llc_conn_ac_adjust_npta_by_rnr(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	if (llc->remote_busy_flag)
		if (!llc->dec_step) {
			if (!llc->dec_cntr) {
				llc->inc_cntr = llc->dec_cntr = 2;
				if (llc->npta > 0)
					--llc->npta;
			} else
				--llc->dec_cntr;
		}
	return 0;
}

/**
 *	llc_conn_ac_dec_tx_win_size - decreases tx window size
 *	@sk: current connection structure.
 *	@skb: current event.
 *
 *	Decreases the transmit window "k" by the current length of the
 *	unacknowledged-PDU queue, never letting it drop below 1.
 *	Always returns 0.
 */
int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);
	u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q);

	if (llc->k - unacked_pdu < 1)
		llc->k = 1;
	else
		llc->k -= unacked_pdu;
	return 0;
}

/**
 *	llc_conn_ac_inc_tx_win_size - tx window size is inc by 1
 *	@sk: current connection structure.
 *	@skb: current event.
 *
 *	Increases the transmit window "k" by one, capped at
 *	(u8)~LLC_2_SEQ_NBR_MODULO. Always returns 0.
 */
int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	llc->k += 1;
	if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO)
		llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO;
	return 0;
}

/* Delegate to llc_sk_stop_all_timers(sk, false). Always returns 0. */
int llc_conn_ac_stop_all_timers(struct sock *sk, struct sk_buff *skb)
{
	llc_sk_stop_all_timers(sk, false);
	return 0;
}

/*
 * Delete the REJ-sent, P/F-cycle and busy-state timers (the ack timer is
 * left alone) and clear the pending-ack state. Always returns 0.
 */
int llc_conn_ac_stop_other_timers(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	timer_delete(&llc->rej_sent_timer.timer);
	timer_delete(&llc->pf_cycle_timer.timer);
	timer_delete(&llc->busy_state_timer.timer);
	llc->ack_must_be_send = 0;
	llc->ack_pf = 0;
	return 0;
}

/* (Re)arm the ack timer ack_timer.expire jiffies from now. Returns 0. */
int llc_conn_ac_start_ack_timer(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire);
	return 0;
}

/* (Re)arm the REJ-sent timer rej_sent_timer.expire jiffies from now. Returns 0. */
int llc_conn_ac_start_rej_timer(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	mod_timer(&llc->rej_sent_timer.timer,
		  jiffies + llc->rej_sent_timer.expire);
	return 0;
}

/* Arm the ack timer only if it is not already pending. Returns 0. */
int llc_conn_ac_start_ack_tmr_if_not_running(struct sock *sk,
					     struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	if (!timer_pending(&llc->ack_timer.timer))
		mod_timer(&llc->ack_timer.timer,
			  jiffies + llc->ack_timer.expire);
	return 0;
}

/* Delete the ack timer. Always returns 0. */
int llc_conn_ac_stop_ack_timer(struct sock *sk, struct sk_buff *skb)
{
	timer_delete(&llc_sk(sk)->ack_timer.timer);
	return 0;
}

/* Delete the P/F cycle timer and set p_flag to 0. Always returns 0. */
int llc_conn_ac_stop_p_timer(struct sock *sk, struct sk_buff *skb)
{
	struct llc_sock *llc = llc_sk(sk);

	timer_delete(&llc->pf_cycle_timer.timer);
	llc_conn_set_p_flag(sk, 0);
	return 0;
}

/* Delete the REJ-sent timer. Always returns 0. */
int llc_conn_ac_stop_rej_timer(struct sock *sk, struct sk_buff *skb)
{
	timer_delete(&llc_sk(sk)->rej_sent_timer.timer);
	return 0;
}

/*
 * Record the N(R) carried by @skb in llc->last_nr and drop the PDUs it
 * acknowledges via llc_conn_remove_acked_pdus(). When something was
 * acknowledged (or the device is loopback), the retry count is reset, the
 * ack timer is stopped and restarted only if PDUs remain unacknowledged,
 * and a previously refused data request is confirmed. When nothing was
 * acknowledged, a refused data request is confirmed only if the decoded
 * P/F bit is 1. Always returns 0.
 */
int llc_conn_ac_upd_nr_received(struct sock *sk, struct sk_buff *skb)
{
	int acked;
	u16 unacked = 0;
	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
	struct llc_sock *llc = llc_sk(sk);

	llc->last_nr = PDU_SUPV_GET_Nr(pdu);
	acked = llc_conn_remove_acked_pdus(sk, llc->last_nr, &unacked);
	/* On loopback we don't queue I frames in unack_pdu_q queue. */
	if (acked > 0 || (llc->dev->flags & IFF_LOOPBACK)) {
		llc->retry_count = 0;
		timer_delete(&llc->ack_timer.timer);
		if (llc->failed_data_req) {
			/* Data from the upper layer was refused earlier
			 * (tx_window full or unacceptable state). Now we
			 * can send data again and must inform the upper layer.
			 */
			llc->failed_data_req = 0;
			llc_conn_ac_data_confirm(sk, skb);
		}
		if (unacked)
			mod_timer(&llc->ack_timer.timer,
				  jiffies + llc->ack_timer.expire);
	} else if (llc->failed_data_req) {
		u8 f_bit;

		llc_pdu_decode_pf_bit(skb, &f_bit);
		if (f_bit == 1) {
			llc->failed_data_req = 0;
			llc_conn_ac_data_confirm(sk, skb);
		}
	}
	return 0;
}

/*
 * If @skb holds a response PDU whose decoded P/F bit is non-zero, set
 * p_flag to 0 and stop the P/F cycle timer. Always returns 0.
 */
int llc_conn_ac_upd_p_flag(struct sock *sk, struct sk_buff *skb)
{
	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);

	if (LLC_PDU_IS_RSP(pdu)) {
		u8 f_bit;

		llc_pdu_decode_pf_bit(skb, &f_bit);
		if (f_bit) {
			llc_conn_set_p_flag(sk, 0);
			llc_conn_ac_stop_p_timer(sk, skb);
		}
	}
	return 0;
}

/* Set data_flag to 2. Always returns 0. */
int llc_conn_ac_set_data_flag_2(struct sock *sk, struct sk_buff *skb)
{
	llc_sk(sk)->data_flag = 2;
	return 0;
}

/* Set data_flag to 0. Always returns 0. */
int llc_conn_ac_set_data_flag_0(struct sock *sk, struct sk_buff *skb)
{
	llc_sk(sk)->data_flag = 0;
	return 0;
}

/* Set data_flag to 1. Always returns 0. */
int llc_conn_ac_set_data_flag_1(struct sock *sk, struct sk_buff *skb)
{
	llc_sk(sk)->data_flag = 1;
	return 0;
}

/* Set data_flag to 1 only when it is currently 0. Always returns 0. */
int llc_conn_ac_set_data_flag_1_if_data_flag_eq_0(struct sock *sk,
						  struct sk_buff *skb)
{
	if (!llc_sk(sk)->data_flag)
		llc_sk(sk)->data_flag = 1;
	return 0;
}

/* Set p_flag to 0 through llc_conn_set_p_flag(). Always returns 0. */
int llc_conn_ac_set_p_flag_0(struct sock *sk, struct sk_buff *skb)
{
	llc_conn_set_p_flag(sk, 0);
	return 0;
}

/* Set p_flag to 1 through llc_conn_set_p_flag(). Always returns 0. */
static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb)
{
	llc_conn_set_p_flag(sk, 1);
	return 0;
}

/* Clear remote_busy_flag. Always returns 0. */
int llc_conn_ac_set_remote_busy_0(struct sock *sk, struct sk_buff *skb)
{
	llc_sk(sk)->remote_busy_flag = 0;
	return 0;
}

/* Set cause_flag to 0. Always returns 0. */
int llc_conn_ac_set_cause_flag_0(struct sock *sk, struct sk_buff *skb)
{
	llc_sk(sk)->cause_flag = 0;
	return 0;
}

/* Set cause_flag to 1. Always returns 0. */
int llc_conn_ac_set_cause_flag_1(struct sock *sk, struct sk_buff *skb)
{
	llc_sk(sk)->cause_flag = 1;
	return 0;
}

int
llc_conn_ac_set_retry_cnt_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->retry_count = 0; return 0; } int llc_conn_ac_inc_retry_cnt_by_1(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->retry_count++; return 0; } int llc_conn_ac_set_vr_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vR = 0; return 0; } int llc_conn_ac_inc_vr_by_1(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vR = PDU_GET_NEXT_Vr(llc_sk(sk)->vR); return 0; } int llc_conn_ac_set_vs_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vS = 0; return 0; } int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vS = llc_sk(sk)->last_nr; return 0; } static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO; return 0; } static void llc_conn_tmr_common_cb(struct sock *sk, u8 type) { struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); bh_lock_sock(sk); if (skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); skb_set_owner_r(skb, sk); ev->type = type; llc_process_tmr_ev(sk, skb); } bh_unlock_sock(sk); } void llc_conn_pf_cycle_tmr_cb(struct timer_list *t) { struct llc_sock *llc = timer_container_of(llc, t, pf_cycle_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_P_TMR); } void llc_conn_busy_tmr_cb(struct timer_list *t) { struct llc_sock *llc = timer_container_of(llc, t, busy_state_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_BUSY_TMR); } void llc_conn_ack_tmr_cb(struct timer_list *t) { struct llc_sock *llc = timer_container_of(llc, t, ack_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_ACK_TMR); } void llc_conn_rej_tmr_cb(struct timer_list *t) { struct llc_sock *llc = timer_container_of(llc, t, rej_sent_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_REJ_TMR); } int llc_conn_ac_rst_vs(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->X = llc_sk(sk)->vS; llc_conn_ac_set_vs_nr(sk, skb); return 0; } int 
llc_conn_ac_upd_vs(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); u8 nr = PDU_SUPV_GET_Nr(pdu); if (llc_circular_between(llc_sk(sk)->vS, nr, llc_sk(sk)->X)) llc_conn_ac_set_vs_nr(sk, skb); return 0; } /* * Non-standard actions; these not contained in IEEE specification; for * our own usage */ /** * llc_conn_disc - removes connection from SAP list and frees it * @sk: closed connection * @skb: occurred event */ int llc_conn_disc(struct sock *sk, struct sk_buff *skb) { /* FIXME: this thing seems to want to die */ return 0; } /** * llc_conn_reset - resets connection * @sk : reseting connection. * @skb: occurred event. * * Stop all timers, empty all queues and reset all flags. */ int llc_conn_reset(struct sock *sk, struct sk_buff *skb) { llc_sk_reset(sk); return 0; } /** * llc_circular_between - designates that b is between a and c or not * @a: lower bound * @b: element to see if is between a and b * @c: upper bound * * This function designates that b is between a and c or not (for example, * 0 is between 127 and 1). Returns 1 if b is between a and c, 0 * otherwise. */ u8 llc_circular_between(u8 a, u8 b, u8 c) { b = b - a; c = c - a; return b <= c; } /** * llc_process_tmr_ev - timer backend * @sk: active connection * @skb: occurred event * * This function is called from timer callback functions. When connection * is busy (during sending a data frame) timer expiration event must be * queued. Otherwise this event can be sent to connection state machine. * Queued events will process by llc_backlog_rcv function after sending * data frame. */ static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb) { if (llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) { printk(KERN_WARNING "%s: timer called on closed connection\n", __func__); kfree_skb(skb); } else { if (!sock_owned_by_user(sk)) llc_conn_state_process(sk, skb); else { llc_set_backlog_type(skb, LLC_EVENT); __sk_add_backlog(sk, skb); } } }
9 9 5 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for logging packets. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <net/ipv6.h> #include <net/icmp.h> #include <net/udp.h> #include <net/tcp.h> #include <net/route.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_LOG.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/netfilter/nf_log.h> static unsigned int log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_log_info *loginfo = par->targinfo; struct net *net = xt_net(par); struct nf_loginfo li; li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par), xt_out(par), &li, "%s", loginfo->prefix); return XT_CONTINUE; } static int log_tg_check(const struct xt_tgchk_param *par) { const struct xt_log_info *loginfo = par->targinfo; int ret; if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) return -EINVAL; if (loginfo->level >= 8) { pr_debug("level %u >= 8\n", loginfo->level); return -EINVAL; } if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { pr_debug("prefix is not null-terminated\n"); return -EINVAL; } ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); if (ret != 0 && !par->nft_compat) { request_module("%s", 
"nf_log_syslog"); ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); } return ret; } static void log_tg_destroy(const struct xt_tgdtor_param *par) { nf_logger_put(par->family, NF_LOG_TYPE_LOG); } static struct xt_target log_tg_regs[] __read_mostly = { { .name = "LOG", .family = NFPROTO_IPV4, .target = log_tg, .targetsize = sizeof(struct xt_log_info), .checkentry = log_tg_check, .destroy = log_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "LOG", .family = NFPROTO_IPV6, .target = log_tg, .targetsize = sizeof(struct xt_log_info), .checkentry = log_tg_check, .destroy = log_tg_destroy, .me = THIS_MODULE, }, #endif }; static int __init log_tg_init(void) { return xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); } static void __exit log_tg_exit(void) { xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); } module_init(log_tg_init); module_exit(log_tg_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging"); MODULE_ALIAS("ipt_LOG"); MODULE_ALIAS("ip6t_LOG"); MODULE_SOFTDEP("pre: nf_log_syslog");
710 710 710 710 3082 3080 710 4017 4018 2449 2449 1411 180 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 
513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 
1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 // SPDX-License-Identifier: GPL-2.0-only #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/busy_poll.h> #include <net/net_namespace.h> #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> #include <net/page_pool/memory_provider.h> #include "dev.h" #include "devmem.h" #include "netdev-genl-gen.h" struct netdev_nl_dump_ctx { unsigned long ifindex; unsigned int rxq_idx; unsigned int txq_idx; unsigned int napi_id; }; static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! 
*/ hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; #define XDP_METADATA_KFUNC(_, flag, __, xmo) \ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ xdp_rx_meta |= flag; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC if (netdev->xsk_tx_metadata_ops) { if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time) xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, xsk_features, NETDEV_A_DEV_PAD)) goto err_cancel_msg; if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, netdev->xdp_zc_max_segs)) goto err_cancel_msg; } genlmsg_end(rsp, hdr); return 0; err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); } int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_device *netdev; struct sk_buff *rsp; u32 ifindex; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); rsp = 
genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if (!netdev) { err = -ENODEV; goto err_free_msg; } err = netdev_nl_dev_fill(netdev, rsp, info); netdev_unlock(netdev); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); int err; for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) return err; } return 0; } static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { unsigned long irq_suspend_timeout; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; pid_t pid; if (!napi->dev->up) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) goto nla_put_failure; if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) goto nla_put_failure; } napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, napi_defer_hard_irqs)) goto nla_put_failure; irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, irq_suspend_timeout)) goto nla_put_failure; gro_flush_timeout = napi_get_gro_flush_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, gro_flush_timeout)) goto nla_put_failure; genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } int netdev_nl_napi_get_doit(struct sk_buff 
*skb, struct genl_info *info) { struct napi_struct *napi; struct sk_buff *rsp; u32 napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } if (err) { goto err_free_msg; } else if (!rsp->len) { err = -ENOENT; goto err_free_msg; } return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; unsigned int prev_id; int err = 0; if (!netdev->up) return err; prev_id = UINT_MAX; list_for_each_entry(napi, &netdev->napi_list, dev_list) { if (!napi_id_valid(napi->napi_id)) continue; /* Dump continuation below depends on the list being sorted */ WARN_ON_ONCE(napi->napi_id >= prev_id); prev_id = napi->napi_id; if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; err = netdev_nl_napi_fill_one(rsp, napi, info); if (err) return err; ctx->napi_id = napi->napi_id; } return err; } int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_NAPI_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock(net, ifindex); if (netdev) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); netdev_unlock(netdev); } else { err = -ENODEV; } } else { for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = 
netdev_nl_napi_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->napi_id = 0; } } return err; } static int netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; u32 defer = 0; if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); napi_set_defer_hard_irqs(napi, defer); } if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); } if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); napi_set_gro_flush_timeout(napi, gro_flush_timeout); } return 0; } int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; unsigned int napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } return err; } static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) { if (napi && napi_id_valid(napi->napi_id)) return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); return 0; } static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) goto nla_put_failure; 
switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; params = &rxq->mp_params; if (params->mp_ops && params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) goto nla_put_failure; #ifdef CONFIG_XDP_SOCKETS if (rxq->pool) if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) goto nla_put_failure; #endif break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); if (nla_put_napi_id(rsp, txq->napi)) goto nla_put_failure; #ifdef CONFIG_XDP_SOCKETS if (txq->pool) if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) goto nla_put_failure; #endif break; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, u32 q_type) { switch (q_type) { case NETDEV_QUEUE_TYPE_RX: if (q_id >= netdev->real_num_rx_queues) return -EINVAL; return 0; case NETDEV_QUEUE_TYPE_TX: if (q_id >= netdev->real_num_tx_queues) return -EINVAL; } return 0; } static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { int err; if (!netdev->up) return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) return err; return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); } int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 q_id, q_type, ifindex; struct net_device *netdev; struct sk_buff *rsp; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) return -EINVAL; q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; netdev = 
netdev_get_by_index_lock_ops_compat(genl_info_net(info), ifindex); if (netdev) { err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { int err = 0; if (!netdev->up) return err; for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; } for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; } return err; } int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); if (netdev) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } } else { for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->rxq_idx = 0; ctx->txq_idx = 0; } } return err; } #define NETDEV_STAT_NOT_SET (~0ULL) static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) { const u64 *add = _add; u64 *sum = _sum; while (size) { if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) *sum += *add; sum++; add++; size -= 8; } } static int netdev_stat_put(struct sk_buff *rsp, unsigned int 
attr_id, u64 value) { if (value == NETDEV_STAT_NOT_SET) return 0; return nla_put_uint(rsp, attr_id, value); } static int netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || netdev_stat_put(rsp, 
NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, u32 q_type, int i, const struct genl_info *info) { const struct netdev_stat_ops *ops = netdev->stat_ops; struct netdev_queue_stats_rx rx; struct netdev_queue_stats_tx tx; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: memset(&rx, 0xff, sizeof(rx)); ops->get_queue_stats_rx(netdev, i, &rx); if (!memchr_inv(&rx, 0xff, sizeof(rx))) goto nla_cancel; if (netdev_nl_stats_write_rx(rsp, &rx)) goto nla_put_failure; break; case NETDEV_QUEUE_TYPE_TX: memset(&tx, 0xff, sizeof(tx)); ops->get_queue_stats_tx(netdev, i, &tx); if (!memchr_inv(&tx, 0xff, sizeof(tx))) goto nla_cancel; if (netdev_nl_stats_write_tx(rsp, &tx)) goto nla_put_failure; break; } genlmsg_end(rsp, hdr); return 0; nla_cancel: genlmsg_cancel(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { const struct netdev_stat_ops *ops = netdev->stat_ops; int i, err; if (!(netdev->flags & IFF_UP)) return 0; i = ctx->rxq_idx; while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, i, info); if (err) return err; ctx->rxq_idx = ++i; } i = ctx->txq_idx; while 
(ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, i, info); if (err) return err; ctx->txq_idx = ++i; } ctx->rxq_idx = 0; ctx->txq_idx = 0; return 0; } /** * netdev_stat_queue_sum() - add up queue stats from range of queues * @netdev: net_device * @rx_start: index of the first Rx queue to query * @rx_end: index after the last Rx queue (first *not* to query) * @rx_sum: output Rx stats, should be already initialized * @tx_start: index of the first Tx queue to query * @tx_end: index after the last Tx queue (first *not* to query) * @tx_sum: output Tx stats, should be already initialized * * Add stats from [start, end) range of queue IDs to *x_sum structs. * The sum structs must be already initialized. Usually this * helper is invoked from the .get_base_stats callbacks of drivers * to account for stats of disabled queues. In that case the ranges * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues). */ void netdev_stat_queue_sum(struct net_device *netdev, int rx_start, int rx_end, struct netdev_queue_stats_rx *rx_sum, int tx_start, int tx_end, struct netdev_queue_stats_tx *tx_sum) { const struct netdev_stat_ops *ops; struct netdev_queue_stats_rx rx; struct netdev_queue_stats_tx tx; int i; ops = netdev->stat_ops; for (i = rx_start; i < rx_end; i++) { memset(&rx, 0xff, sizeof(rx)); if (ops->get_queue_stats_rx) ops->get_queue_stats_rx(netdev, i, &rx); netdev_nl_stats_add(rx_sum, &rx, sizeof(rx)); } for (i = tx_start; i < tx_end; i++) { memset(&tx, 0xff, sizeof(tx)); if (ops->get_queue_stats_tx) ops->get_queue_stats_tx(netdev, i, &tx); netdev_nl_stats_add(tx_sum, &tx, sizeof(tx)); } } EXPORT_SYMBOL(netdev_stat_queue_sum); static int netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { struct netdev_queue_stats_rx rx_sum; struct netdev_queue_stats_tx tx_sum; void *hdr; /* Netdev can't guarantee any complete counters */ if 
(!netdev->stat_ops->get_base_stats) return 0; memset(&rx_sum, 0xff, sizeof(rx_sum)); memset(&tx_sum, 0xff, sizeof(tx_sum)); netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum); /* The op was there, but nothing reported, don't bother */ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) goto nla_put_failure; netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum, 0, netdev->real_num_tx_queues, &tx_sum); if (netdev_nl_stats_write_rx(rsp, &rx_sum) || netdev_nl_stats_write_tx(rsp, &tx_sum)) goto nla_put_failure; genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, struct sk_buff *skb, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { if (!netdev->stat_ops) return 0; switch (scope) { case 0: return netdev_nl_stats_by_netdev(netdev, skb, info); case NETDEV_QSTATS_SCOPE_QUEUE: return netdev_nl_stats_by_queue(netdev, skb, info, ctx); } return -EINVAL; /* Should not happen, per netlink policy */ } int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; unsigned int ifindex; unsigned int scope; int err = 0; scope = 0; if (info->attrs[NETDEV_A_QSTATS_SCOPE]) scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); ifindex = 0; if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); if (!netdev) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); return -ENODEV; } if (netdev->stat_ops) { err = 
netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); err = -EOPNOTSUPP; } netdev_unlock_ops_compat(netdev); return err; } for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); if (err < 0) break; } return err; } int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; struct net_devmem_dmabuf_binding *binding; u32 ifindex, dmabuf_fd, rxq_idx; struct netdev_nl_sock *priv; struct net_device *netdev; struct sk_buff *rsp; struct nlattr *attr; int rem, err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); if (IS_ERR(priv)) return PTR_ERR(priv); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; hdr = genlmsg_iput(rsp, info); if (!hdr) { err = -EMSGSIZE; goto err_genlmsg_free; } mutex_lock(&priv->lock); err = 0; netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if (!netdev) { err = -ENODEV; goto err_unlock_sock; } if (!netif_device_present(netdev)) err = -ENODEV; else if (!netdev_need_ops_lock(netdev)) err = -EOPNOTSUPP; if (err) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_DEV_IFINDEX]); goto err_unlock; } binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; } nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, genlmsg_data(info->genlhdr), genlmsg_len(info->genlhdr), rem) { err = nla_parse_nested( tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr, 
netdev_queue_id_nl_policy, info->extack); if (err < 0) goto err_unbind; if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) { err = -EINVAL; goto err_unbind; } if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); err = -EINVAL; goto err_unbind; } rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, info->extack); if (err) goto err_unbind; } nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); err = genlmsg_reply(rsp, info); if (err) goto err_unbind; netdev_unlock(netdev); mutex_unlock(&priv->lock); return 0; err_unbind: net_devmem_unbind_dmabuf(binding); err_unlock: netdev_unlock(netdev); err_unlock_sock: mutex_unlock(&priv->lock); err_genlmsg_free: nlmsg_free(rsp); return err; } int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) { struct net_devmem_dmabuf_binding *binding; struct netdev_nl_sock *priv; struct net_device *netdev; u32 ifindex, dmabuf_fd; struct sk_buff *rsp; int err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); if (IS_ERR(priv)) return PTR_ERR(priv); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; hdr = genlmsg_iput(rsp, info); if (!hdr) { err = -EMSGSIZE; goto err_genlmsg_free; } mutex_lock(&priv->lock); netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if (!netdev) { err = -ENODEV; goto err_unlock_sock; } if (!netif_device_present(netdev)) { err = -ENODEV; goto err_unlock_netdev; } if (!netdev->netmem_tx) { err = -EOPNOTSUPP; NL_SET_ERR_MSG(info->extack, "Driver does not support netmem TX"); 
goto err_unlock_netdev; } binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock_netdev; } nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); netdev_unlock(netdev); mutex_unlock(&priv->lock); return genlmsg_reply(rsp, info); err_unlock_netdev: netdev_unlock(netdev); err_unlock_sock: mutex_unlock(&priv->lock); err_genlmsg_free: nlmsg_free(rsp); return err; } void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); mutex_init(&priv->lock); } void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) { struct net_devmem_dmabuf_binding *binding; struct net_devmem_dmabuf_binding *temp; netdevice_tracker dev_tracker; struct net_device *dev; mutex_lock(&priv->lock); list_for_each_entry_safe(binding, temp, &priv->bindings, list) { mutex_lock(&binding->lock); dev = binding->dev; if (!dev) { mutex_unlock(&binding->lock); net_devmem_unbind_dmabuf(binding); continue; } netdev_hold(dev, &dev_tracker, GFP_KERNEL); mutex_unlock(&binding->lock); netdev_lock(dev); net_devmem_unbind_dmabuf(binding); netdev_unlock(dev); netdev_put(dev, &dev_tracker); } mutex_unlock(&priv->lock); } static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: netdev_lock_ops_to_full(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); netdev_unlock_full_to_ops(netdev); break; case NETDEV_UNREGISTER: netdev_lock(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); netdev_unlock(netdev); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); break; } return NOTIFY_OK; } static struct notifier_block netdev_genl_nb = { .notifier_call = netdev_genl_netdevice_event, }; static int __init netdev_genl_init(void) { int err; err = 
register_netdevice_notifier(&netdev_genl_nb); if (err) return err; err = genl_register_family(&netdev_nl_family); if (err) goto err_unreg_ntf; return 0; err_unreg_ntf: unregister_netdevice_notifier(&netdev_genl_nb); return err; } subsys_initcall(netdev_genl_init);
44 11 9 2 9 6 2 2 2 6 27 27 5 2 2 1073 1074 2 1075 4017 4016 1075 803 803 802 4 502 396 108 108 3 4 108 10 3 3 3 9 6 4 2 2 8 6 2 2 1 27 12 18 18 2 2 4 13 12 9 2 1 9 1 2 2 6 8 10 5 13 1 498 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 
980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 
1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 
1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for queueing packets and communicating with * userspace via nfnetlink. * * (C) 2005 by Harald Welte <laforge@netfilter.org> * (C) 2007 by Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ip_queue.c: * (C) 2000-2002 James Morris <jmorris@intercode.com.au> * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/proc_fs.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_queue.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> #include <linux/cgroup-defs.h> #include <net/gso.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> #include <net/netns/generic.h> #include <linux/atomic.h> #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #define NFQNL_QMAX_DEFAULT 1024 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len * includes the header length. Thus, the maximum packet length that we * support is 65531 bytes. We send truncated packets if the specified length * is larger than that. Userspace can check for presence of NFQA_CAP_LEN * attribute to detect truncation. 
*/
#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)

/* State of a single queue, identified by queue_num and linked into the
 * per-netns instance_table.
 */
struct nfqnl_instance {
	struct hlist_node hlist;		/* global list of queues */
	struct rcu_head rcu;			/* used by call_rcu() on destroy */

	u32 peer_portid;			/* netlink portid packets are unicast to */
	unsigned int queue_maxlen;		/* queue_total limit checked on enqueue */
	unsigned int copy_range;
	unsigned int queue_dropped;		/* bumped when the queue is full */
	unsigned int queue_user_dropped;	/* bumped when the unicast fails */


	u_int16_t queue_num;			/* number of this queue */
	u_int8_t copy_mode;
	u_int32_t flags;			/* Set using NFQA_CFG_FLAGS */
/*
 * Following fields are dirtied for each queued packet,
 * keep them in same cache line if possible.
 */
	spinlock_t	lock	____cacheline_aligned_in_smp;
	unsigned int	queue_total;
	unsigned int	id_sequence;		/* 'sequence' of pkt ids */
	struct list_head queue_list;		/* packets in queue */
};

/* Predicate used by nfqnl_flush() to select entries; non-zero means match. */
typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);

static unsigned int nfnl_queue_net_id __read_mostly;

#define INSTANCE_BUCKETS	16
/* Per network namespace state: hash table of queue instances and its lock. */
struct nfnl_queue_net {
	spinlock_t instances_lock;
	struct hlist_head instance_table[INSTANCE_BUCKETS];
};

/* Return the nfnl_queue_net registered for @net under nfnl_queue_net_id. */
static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
{
	return net_generic(net, nfnl_queue_net_id);
}

/* Fold the high byte of @queue_num into the low one and reduce the result
 * to a bucket index in [0, INSTANCE_BUCKETS).
 */
static inline u_int8_t instance_hashfn(u_int16_t queue_num)
{
	return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
}

/* Walk the hash bucket of @queue_num and return the matching instance,
 * or NULL when no instance with that number exists.
 */
static struct nfqnl_instance *
instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
{
	struct hlist_head *head;
	struct nfqnl_instance *inst;

	head = &q->instance_table[instance_hashfn(queue_num)];
	hlist_for_each_entry_rcu(inst, head, hlist) {
		if (inst->queue_num == queue_num)
			return inst;
	}
	return NULL;
}

/* Allocate the instance for @queue_num, bound to @portid, and add it to the
 * hash table while holding instances_lock.
 *
 * Returns the new instance, or an ERR_PTR(): -EEXIST when the queue number
 * is already in use, -ENOMEM when the allocation fails, -EAGAIN when no
 * module reference could be taken.
 */
static struct nfqnl_instance *
instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
{
	struct nfqnl_instance *inst;
	unsigned int h;
	int err;

	spin_lock(&q->instances_lock);
	if (instance_lookup(q, queue_num)) {
		err = -EEXIST;
		goto out_unlock;
	}

	/* GFP_ATOMIC: instances_lock (a spinlock) is held here */
	inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
	if (!inst) {
		err = -ENOMEM;
		goto out_unlock;
	}

	inst->queue_num = queue_num;
	inst->peer_portid = portid;
	inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
	inst->copy_range = NFQNL_MAX_COPY_RANGE;
	inst->copy_mode =
NFQNL_COPY_NONE;
	spin_lock_init(&inst->lock);
	INIT_LIST_HEAD(&inst->queue_list);

	/* reference is dropped again in instance_destroy_rcu() */
	if (!try_module_get(THIS_MODULE)) {
		err = -EAGAIN;
		goto out_free;
	}

	h = instance_hashfn(queue_num);
	hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]);

	spin_unlock(&q->instances_lock);

	return inst;

out_free:
	kfree(inst);
out_unlock:
	spin_unlock(&q->instances_lock);
	return ERR_PTR(err);
}

static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
			unsigned long data);

/* RCU callback scheduled by __instance_destroy(): flush every entry still
 * queued, free the instance and release the module reference.
 */
static void
instance_destroy_rcu(struct rcu_head *head)
{
	struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance,
						   rcu);

	rcu_read_lock();
	nfqnl_flush(inst, NULL, 0);
	rcu_read_unlock();
	kfree(inst);
	module_put(THIS_MODULE);
}

/* Unlink @inst from the hash table and defer its teardown to an RCU
 * callback.  The callers in this file hold instances_lock.
 */
static void
__instance_destroy(struct nfqnl_instance *inst)
{
	hlist_del_rcu(&inst->hlist);
	call_rcu(&inst->rcu, instance_destroy_rcu);
}

/* Locked wrapper around __instance_destroy(). */
static void
instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
{
	spin_lock(&q->instances_lock);
	__instance_destroy(inst);
	spin_unlock(&q->instances_lock);
}

/* Append @entry to the queue list and account for it in queue_total. */
static inline void
__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
       list_add_tail(&entry->list, &queue->queue_list);
       queue->queue_total++;
}

/* Remove @entry from the queue list and update queue_total. */
static void
__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
	list_del(&entry->list);
	queue->queue_total--;
}

/* Find the queued entry with packet id @id and unlink it from @queue, all
 * under queue->lock.  Returns the entry, or NULL when no entry has that id.
 */
static struct nf_queue_entry *
find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
{
	struct nf_queue_entry *entry = NULL, *i;

	spin_lock_bh(&queue->lock);

	list_for_each_entry(i, &queue->queue_list, list) {
		if (i->id == id) {
			entry = i;
			break;
		}
	}

	if (entry)
		__dequeue_entry(queue, entry);

	spin_unlock_bh(&queue->lock);

	return entry;
}

/* Call the hooks in @hooks starting at *index.  The first verdict other
 * than NF_ACCEPT ends the walk, except NF_REPEAT which calls the same hook
 * again.  *index is updated to the position reached.
 */
static unsigned int nf_iterate(struct sk_buff *skb,
			       struct nf_hook_state *state,
			       const struct nf_hook_entries *hooks,
			       unsigned int *index)
{
	const struct nf_hook_entry *hook;
	unsigned int verdict, i = *index;

	while (i < hooks->num_hook_entries) {
		hook = &hooks->hooks[i];
repeat:
		verdict = nf_hook_entry_hookfn(hook,
skb, state);
		if (verdict != NF_ACCEPT) {
			*index = i;
			if (verdict != NF_REPEAT)
				return verdict;
			goto repeat;
		}
		i++;
	}

	*index = i;
	return NF_ACCEPT;
}

/* Return the hook entries of @net for protocol family @pf and @hooknum.
 * Unknown families trigger a one-time warning and yield NULL.
 */
static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
{
	switch (pf) {
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
	case NFPROTO_BRIDGE:
		return rcu_dereference(net->nf.hooks_bridge[hooknum]);
#endif
	case NFPROTO_IPV4:
		return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
	case NFPROTO_IPV6:
		return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
	default:
		WARN_ON_ONCE(1);
		return NULL;
	}

	return NULL;
}

/* For packets queued at NF_INET_LOCAL_OUT, re-route the skb when tos, mark,
 * daddr or saddr no longer match the values saved in the entry's reroute
 * info.  Returns 0 when nothing had to be done (or CONFIG_INET is off),
 * otherwise the result of ip_route_me_harder().
 */
static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
{
#ifdef CONFIG_INET
	const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);

	if (entry->state.hook == NF_INET_LOCAL_OUT) {
		const struct iphdr *iph = ip_hdr(skb);

		if (!(iph->tos == rt_info->tos &&
		      skb->mark == rt_info->mark &&
		      iph->daddr == rt_info->daddr &&
		      iph->saddr == rt_info->saddr))
			return ip_route_me_harder(entry->state.net, entry->state.sk,
						  skb, RTN_UNSPEC);
	}
#endif
	return 0;
}

/* Dispatch the reroute check by protocol family; families other than
 * AF_INET/AF_INET6 return 0.
 */
static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
{
	const struct nf_ipv6_ops *v6ops;
	int ret = 0;

	switch (entry->state.pf) {
	case AF_INET:
		ret = nf_ip_reroute(skb, entry);
		break;
	case AF_INET6:
		v6ops = rcu_dereference(nf_ipv6_ops);
		if (v6ops)
			ret = v6ops->reroute(skb, entry);
		break;
	}
	return ret;
}

/* Resume hook traversal for @entry with @verdict and release the entry.
 * If the hook list is gone or the saved hook index is out of range, the
 * skb is dropped.
 *
 * caller must hold rcu read-side lock
 */
static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
	const struct nf_hook_entry *hook_entry;
	const struct nf_hook_entries *hooks;
	struct sk_buff *skb = entry->skb;
	const struct net *net;
	unsigned int i;
	int err;
	u8 pf;

	net = entry->state.net;
	pf = entry->state.pf;

	hooks = nf_hook_entries_head(net, pf, entry->state.hook);

	i = entry->hook_index;
	if (!hooks || i >= hooks->num_hook_entries) {
		kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP);
		nf_queue_entry_free(entry);
		return;
	}

	hook_entry = &hooks->hooks[i];

	/* Continue traversal iff 
userspace said ok... */
	if (verdict == NF_REPEAT)
		verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);

	if (verdict == NF_ACCEPT) {
		if (nf_reroute(skb, entry) < 0)
			verdict = NF_DROP;
	}

	if (verdict == NF_ACCEPT) {
next_hook:
		/* continue with the hook after the one that queued the skb */
		++i;
		verdict = nf_iterate(skb, &entry->state, hooks, &i);
	}

	switch (verdict & NF_VERDICT_MASK) {
	case NF_ACCEPT:
	case NF_STOP:
		local_bh_disable();
		entry->state.okfn(entry->state.net, entry->state.sk, skb);
		local_bh_enable();
		break;
	case NF_QUEUE:
		err = nf_queue(skb, &entry->state, i, verdict);
		/* a return of 1 sends the traversal on to the next hook */
		if (err == 1)
			goto next_hook;
		break;
	case NF_STOLEN:
		break;
	default:
		kfree_skb(skb);
	}

	nf_queue_entry_free(entry);
}

/* Reinject @entry with @verdict.  For ACCEPT/REPEAT/STOP the conntrack
 * update hook, if registered, runs first: NF_STOLEN frees the entry
 * without reinjecting, any other non-ACCEPT result replaces the verdict.
 */
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
	const struct nf_ct_hook *ct_hook;

	if (verdict == NF_ACCEPT ||
	    verdict == NF_REPEAT ||
	    verdict == NF_STOP) {
		unsigned int ct_verdict = verdict;

		rcu_read_lock();
		ct_hook = rcu_dereference(nf_ct_hook);
		if (ct_hook)
			ct_verdict = ct_hook->update(entry->state.net, entry->skb);
		rcu_read_unlock();

		switch (ct_verdict & NF_VERDICT_MASK) {
		case NF_ACCEPT:
			/* follow userspace verdict, could be REPEAT */
			break;
		case NF_STOLEN:
			nf_queue_entry_free(entry);
			return;
		default:
			verdict = ct_verdict & NF_VERDICT_MASK;
			break;
		}
	}
	nf_reinject(entry, verdict);
}

/* Unlink every entry of @queue for which @cmpfn(entry, @data) is non-zero
 * (all entries when @cmpfn is NULL) and reinject it with NF_DROP.  Runs
 * under queue->lock.
 */
static void
nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
{
	struct nf_queue_entry *entry, *next;

	spin_lock_bh(&queue->lock);
	list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
		if (!cmpfn || cmpfn(entry, data)) {
			list_del(&entry->list);
			queue->queue_total--;
			nfqnl_reinject(entry, NF_DROP);
		}
	}
	spin_unlock_bh(&queue->lock);
}

/* Emit NFQA_SKB_INFO describing checksum and GSO state of @packet; the
 * attribute is omitted when no flag applies.  Returns 0 or the
 * nla_put_be32() result.
 */
static int
nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
		      bool csum_verify)
{
	__u32 flags = 0;

	if (packet->ip_summed == CHECKSUM_PARTIAL)
		flags = NFQA_SKB_CSUMNOTREADY;
	else if (csum_verify)
		flags = NFQA_SKB_CSUM_NOTVERIFIED;

	if (skb_is_gso(packet))
		flags |= NFQA_SKB_GSO;

	return flags ?
nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0;
}

/* Add NFQA_UID/NFQA_GID taken from the credentials of the file behind
 * @sk's socket, read under sk_callback_lock.  Nothing is added for
 * non-full sockets or sockets without a file.  Returns 0, or -1 when an
 * attribute does not fit.
 */
static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk)
{
	const struct cred *cred;

	if (!sk_fullsock(sk))
		return 0;

	read_lock_bh(&sk->sk_callback_lock);
	if (sk->sk_socket && sk->sk_socket->file) {
		cred = sk->sk_socket->file->f_cred;
		if (nla_put_be32(skb, NFQA_UID,
		    htonl(from_kuid_munged(&init_user_ns, cred->fsuid))))
			goto nla_put_failure;
		if (nla_put_be32(skb, NFQA_GID,
		    htonl(from_kgid_munged(&init_user_ns, cred->fsgid))))
			goto nla_put_failure;
	}
	read_unlock_bh(&sk->sk_callback_lock);
	return 0;

nla_put_failure:
	read_unlock_bh(&sk->sk_callback_lock);
	return -1;
}

/* Add NFQA_CGROUP_CLASSID for full sockets with a non-zero classid.
 * Returns 0, or -1 when the attribute does not fit.  Without
 * CONFIG_CGROUP_NET_CLASSID this is a no-op returning 0.
 */
static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk)
{
#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
	if (sk && sk_fullsock(sk)) {
		u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data);

		if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid)))
			return -1;
	}
#endif
	return 0;
}

/* Translate skb->secmark into a security context stored in @ctx.
 * Returns the value of security_secid_to_secctx(), or 0 when there is no
 * secmark, no full socket, or CONFIG_NETWORK_SECMARK is disabled.
 */
static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx)
{
	int seclen = 0;
#if IS_ENABLED(CONFIG_NETWORK_SECMARK)

	if (!skb || !sk_fullsock(skb->sk))
		return 0;

	read_lock_bh(&skb->sk->sk_callback_lock);

	if (skb->secmark)
		seclen = security_secid_to_secctx(skb->secmark, ctx);
	read_unlock_bh(&skb->sk->sk_callback_lock);
#endif
	return seclen;
}

/* Number of netlink bytes nfqnl_put_bridge() needs for @entry: the nested
 * VLAN attribute when a tag is present plus the L2 header bytes between
 * mac_header and network_header.  0 for non-bridge entries.
 */
static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry)
{
	struct sk_buff *entskb = entry->skb;
	u32 nlalen = 0;

	if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb))
		return 0;

	if (skb_vlan_tag_present(entskb))
		nlalen += nla_total_size(nla_total_size(sizeof(__be16)) +
					 nla_total_size(sizeof(__be16)));

	if (entskb->network_header > entskb->mac_header)
		nlalen += nla_total_size((entskb->network_header -
					  entskb->mac_header));

	return nlalen;
}

/* Add the bridge specific attributes (NFQA_VLAN nest, NFQA_L2HDR) for
 * @entry to @skb.  Returns 0 on success or when not applicable, -1 when an
 * attribute does not fit.
 */
static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb)
{
	struct sk_buff *entskb = entry->skb;

	if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb))
		return 0;

	if (skb_vlan_tag_present(entskb)) {
		struct
nlattr *nest;

		nest = nla_nest_start(skb, NFQA_VLAN);
		if (!nest)
			goto nla_put_failure;

		/* vlan_tci is converted with htons(); vlan_proto is passed on as stored */
		if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)) ||
		    nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto))
			goto nla_put_failure;

		nla_nest_end(skb, nest);
	}

	if (entskb->mac_header < entskb->network_header) {
		int len = (int)(entskb->network_header - entskb->mac_header);

		if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb)))
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -1;
}

/* Complete a pending checksum on @entskb, using the CRC32c helper for
 * SCTP and the generic helper otherwise.  Returns the helper's result.
 */
static int nf_queue_checksum_help(struct sk_buff *entskb)
{
	if (skb_csum_is_sctp(entskb))
		return skb_crc32c_csum_help(entskb);

	return skb_checksum_help(entskb);
}

/* Build the NFQNL_MSG_PACKET netlink message that describes @entry for
 * @queue.
 *
 * On success returns the new skb and stores in *packet_id_ptr the address
 * of the packet_id field inside the message.  Returns NULL on failure.
 */
static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
			   struct nf_queue_entry *entry,
			   __be32 **packet_id_ptr)
{
	size_t size;
	size_t data_len = 0, cap_len = 0;
	unsigned int hlen = 0;
	struct sk_buff *skb;
	struct nlattr *nla;
	struct nfqnl_msg_packet_hdr *pmsg;
	struct nlmsghdr *nlh;
	struct sk_buff *entskb = entry->skb;
	struct net_device *indev;
	struct net_device *outdev;
	struct nf_conn *ct = NULL;
	enum ip_conntrack_info ctinfo = 0;
	const struct nfnl_ct_hook *nfnl_ct;
	bool csum_verify;
	struct lsm_context ctx = { NULL, 0, 0 };
	int seclen = 0;
	ktime_t tstamp;

	/* Space for the attributes that do not depend on queue flags or
	 * copy mode; optional attributes are added to the total below.
	 */
	size = nlmsg_total_size(sizeof(struct nfgenmsg))
		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
		+ nla_total_size(sizeof(u_int32_t))	/* ifindex */
#endif
		+ nla_total_size(sizeof(u_int32_t))	/* mark */
		+ nla_total_size(sizeof(u_int32_t))	/* priority */
		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
		+ nla_total_size(sizeof(u_int32_t))	/* skbinfo */
#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
		+ nla_total_size(sizeof(u_int32_t))	/* classid */
#endif
		+ nla_total_size(sizeof(u_int32_t));	/* cap_len */

	tstamp = skb_tstamp_cond(entskb, false);
if (tstamp)
		size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));

	size += nfqnl_get_bridge_size(entry);

	/* csum_verify later selects NFQA_SKB_CSUM_NOTVERIFIED in
	 * nfqnl_put_packet_info()
	 */
	if (entry->state.hook <= NF_INET_FORWARD ||
	   (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
		csum_verify = !skb_csum_unnecessary(entskb);
	else
		csum_verify = false;

	outdev = entry->state.out;

	switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) {
	case NFQNL_COPY_META:
	case NFQNL_COPY_NONE:
		break;

	case NFQNL_COPY_PACKET:
		/* Without NFQA_CFG_F_GSO a pending partial checksum is
		 * completed before the payload is copied.
		 */
		if (!(queue->flags & NFQA_CFG_F_GSO) &&
		    entskb->ip_summed == CHECKSUM_PARTIAL &&
		    nf_queue_checksum_help(entskb))
			return NULL;

		data_len = READ_ONCE(queue->copy_range);
		if (data_len > entskb->len)
			data_len = entskb->len;

		hlen = skb_zerocopy_headlen(entskb);
		hlen = min_t(unsigned int, hlen, data_len);
		size += sizeof(struct nlattr) + hlen;
		cap_len = entskb->len;
		break;
	}

	nfnl_ct = rcu_dereference(nfnl_ct_hook);

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (queue->flags & NFQA_CFG_F_CONNTRACK) {
		if (nfnl_ct != NULL) {
			ct = nf_ct_get(entskb, &ctinfo);
			if (ct != NULL)
				size += nfnl_ct->build_size(ct);
		}
	}
#endif

	if (queue->flags & NFQA_CFG_F_UID_GID) {
		size += (nla_total_size(sizeof(u_int32_t))	/* uid */
			+ nla_total_size(sizeof(u_int32_t)));	/* gid */
	}

	if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) {
		seclen = nfqnl_get_sk_secctx(entskb, &ctx);
		if (seclen < 0)
			return NULL;
		if (seclen)
			size += nla_total_size(seclen);
	}

	skb = alloc_skb(size, GFP_ATOMIC);
	if (!skb) {
		skb_tx_error(entskb);
		goto nlmsg_failure;
	}

	nlh = nfnl_msg_put(skb, 0, 0,
			   nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
			   0, entry->state.pf, NFNETLINK_V0,
			   htons(queue->queue_num));
	if (!nlh) {
		skb_tx_error(entskb);
		kfree_skb(skb);
		goto nlmsg_failure;
	}

	nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
	pmsg = nla_data(nla);
	pmsg->hw_protocol	= entskb->protocol;
	pmsg->hook		= entry->state.hook;
	/* packet_id itself is not written here, only its address is returned */
	*packet_id_ptr		= &pmsg->packet_id;

	indev = entry->state.in;
	if (indev) {
#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
		if (nla_put_be32(skb, NFQA_IFINDEX_INDEV,
htonl(indev->ifindex)))
			goto nla_put_failure;
#else
		if (entry->state.pf == PF_BRIDGE) {
			/* Case 1: indev is physical input device, we need to
			 * look for bridge group (when called from
			 * netfilter_bridge) */
			if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
					 htonl(indev->ifindex)) ||
			/* this is the bridge group "brX" */
			/* rcu_read_lock()ed by __nf_queue */
			    nla_put_be32(skb, NFQA_IFINDEX_INDEV,
					 htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
				goto nla_put_failure;
		} else {
			int physinif;

			/* Case 2: indev is bridge group, we need to look for
			 * physical device (when called from ipv4) */
			if (nla_put_be32(skb, NFQA_IFINDEX_INDEV,
					 htonl(indev->ifindex)))
				goto nla_put_failure;

			/* a zero physinif means there is nothing to report */
			physinif = nf_bridge_get_physinif(entskb);
			if (physinif &&
			    nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
					 htonl(physinif)))
				goto nla_put_failure;
		}
#endif
	}

	/* The output device is reported the same way as the input device. */
	if (outdev) {
#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
		if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)))
			goto nla_put_failure;
#else
		if (entry->state.pf == PF_BRIDGE) {
			/* Case 1: outdev is physical output device, we need to
			 * look for bridge group (when called from
			 * netfilter_bridge) */
			if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
					 htonl(outdev->ifindex)) ||
			/* this is the bridge group "brX" */
			/* rcu_read_lock()ed by __nf_queue */
			    nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
					 htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
				goto nla_put_failure;
		} else {
			int physoutif;

			/* Case 2: outdev is bridge group, we need to look for
			 * physical output device (when called from ipv4) */
			if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
					 htonl(outdev->ifindex)))
				goto nla_put_failure;

			physoutif = nf_bridge_get_physoutif(entskb);
			if (physoutif &&
			    nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
					 htonl(physoutif)))
				goto nla_put_failure;
		}
#endif
	}

	/* mark and priority are only reported when non-zero */
	if (entskb->mark &&
	    nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
		goto nla_put_failure;

	if (entskb->priority &&
	    nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority)))
		goto nla_put_failure;

	if (indev && entskb->dev &&
skb_mac_header_was_set(entskb) &&
	    skb_mac_header_len(entskb) != 0) {
		struct nfqnl_msg_packet_hw phw;
		int len;

		/* zero the whole struct before it is copied into the message */
		memset(&phw, 0, sizeof(phw));
		len = dev_parse_header(entskb, phw.hw_addr);
		if (len) {
			phw.hw_addrlen = htons(len);
			if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw))
				goto nla_put_failure;
		}
	}

	if (nfqnl_put_bridge(entry, skb) < 0)
		goto nla_put_failure;

	if (entry->state.hook <= NF_INET_FORWARD && tstamp) {
		struct nfqnl_msg_packet_timestamp ts;
		struct timespec64 kts = ktime_to_timespec64(tstamp);

		ts.sec = cpu_to_be64(kts.tv_sec);
		ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);

		if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts))
			goto nla_put_failure;
	}

	if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk &&
	    nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
		goto nla_put_failure;

	if (nfqnl_put_sk_classid(skb, entskb->sk) < 0)
		goto nla_put_failure;

	if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context))
		goto nla_put_failure;

	if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
		goto nla_put_failure;

	/* NFQA_CAP_LEN is only sent when the copied payload is truncated */
	if (cap_len > data_len &&
	    nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
		goto nla_put_failure;

	if (nfqnl_put_packet_info(skb, entskb, csum_verify))
		goto nla_put_failure;

	if (data_len) {
		struct nlattr *nla;

		if (skb_tailroom(skb) < sizeof(*nla) + hlen)
			goto nla_put_failure;

		/* The payload attribute header is written by hand and the
		 * packet data is attached with skb_zerocopy().
		 */
		nla = skb_put(skb, sizeof(*nla));
		nla->nla_type = NFQA_PAYLOAD;
		nla->nla_len = nla_attr_size(data_len);

		if (skb_zerocopy(skb, entskb, data_len, hlen))
			goto nla_put_failure;
	}

	nlh->nlmsg_len = skb->len;
	if (seclen >= 0)
		security_release_secctx(&ctx);
	return skb;

nla_put_failure:
	skb_tx_error(entskb);
	kfree_skb(skb);
	net_err_ratelimited("nf_queue: error creating packet message\n");
nlmsg_failure:
	if (seclen >= 0)
		security_release_secctx(&ctx);
	return NULL;
}

/* Decide whether @entry must not be queued because of the state of its
 * conntrack entry.  Returns true when the packet should be dropped by the
 * caller, false otherwise (always false without CONFIG_NF_CONNTRACK).
 */
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
	struct nf_conn *ct = (void *)skb_nfct(entry->skb);
unsigned long status;
	unsigned int use;

	if (!ct)
		return false;

	status = READ_ONCE(ct->status);
	/* dying and never confirmed: drop */
	if ((status & flags) == IPS_DYING)
		return true;

	if (status & IPS_CONFIRMED)
		return false;

	/* in some cases skb_clone() can occur after initial conntrack
	 * pickup, but conntrack assumes exclusive skb->_nfct ownership for
	 * unconfirmed entries.
	 *
	 * This happens for br_netfilter and with ip multicast routing.
	 * This can't be solved with serialization here because one clone
	 * could have been queued for local delivery.
	 */
	use = refcount_read(&ct->ct_general.use);
	if (likely(use == 1))
		return false;

	/* Can't decrement further? Exclusive ownership. */
	if (!refcount_dec_not_one(&ct->ct_general.use))
		return false;

	skb_set_nfct(entry->skb, 0);
	/* No nf_ct_put(): we already decremented .use and it cannot
	 * drop down to 0.
	 */
	return true;
#endif
	return false;
}

/* Build the netlink message for @entry, send it to the queue's peer and
 * add the entry to the queue list.
 *
 * Returns 0 when the entry was queued, or when it could not be queued but
 * NFQA_CFG_F_FAIL_OPEN is set (the entry is then reinjected with
 * NF_ACCEPT).  Otherwise returns a negative error: -ENOMEM when the
 * message could not be built, -ENOBUFS when the queue is full or the
 * conntrack check asks for a drop, or the error from nfnetlink_unicast().
 */
static int
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
			struct nf_queue_entry *entry)
{
	struct sk_buff *nskb;
	int err = -ENOBUFS;
	__be32 *packet_id_ptr;
	int failopen = 0;

	nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr);
	if (nskb == NULL) {
		err = -ENOMEM;
		goto err_out;
	}
	spin_lock_bh(&queue->lock);

	if (nf_ct_drop_unconfirmed(entry))
		goto err_out_free_nskb;

	if (queue->queue_total >= queue->queue_maxlen) {
		if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
			failopen = 1;
			err = 0;
		} else {
			queue->queue_dropped++;
			net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
					     queue->queue_total);
		}
		goto err_out_free_nskb;
	}
	/* the id is assigned under queue->lock and patched into the message */
	entry->id = ++queue->id_sequence;
	*packet_id_ptr = htonl(entry->id);

	/* nfnetlink_unicast will either free the nskb or add it to a socket */
	err = nfnetlink_unicast(nskb, net, queue->peer_portid);
	if (err < 0) {
		if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
			failopen = 1;
			err = 0;
		} else {
			queue->queue_user_dropped++;
		}
		goto err_out_unlock;
	}

	__enqueue_entry(queue, entry);

	spin_unlock_bh(&queue->lock);
	return 0;

err_out_free_nskb:
	kfree_skb(nskb);
err_out_unlock:
spin_unlock_bh(&queue->lock);
	/* fail-open: let the packet continue, after the lock is released */
	if (failopen)
		nfqnl_reinject(entry, NF_ACCEPT);
err_out:
	return err;
}

/* Duplicate @e (e->size bytes) and take the references the copy needs.
 * Returns the copy, or NULL when the allocation fails or the references
 * cannot be taken.
 */
static struct nf_queue_entry *
nf_queue_entry_dup(struct nf_queue_entry *e)
{
	struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);

	if (!entry)
		return NULL;

	if (nf_queue_entry_get_refs(entry))
		return entry;

	kfree(entry);
	return NULL;
}

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* When called from bridge netfilter, skb->data must point to MAC header
 * before calling skb_gso_segment(). Else, original MAC header is lost
 * and segmented skbs will be sent to wrong destination.
 */
static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
{
	if (nf_bridge_info_get(skb))
		__skb_push(skb, skb->network_header - skb->mac_header);
}

/* Undo nf_bridge_adjust_skb_data(): pull the same number of bytes again. */
static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
{
	if (nf_bridge_info_get(skb))
		__skb_pull(skb, skb->network_header - skb->mac_header);
}
#else
#define nf_bridge_adjust_skb_data(s) do {} while (0)
#define nf_bridge_adjust_segmented_data(s) do {} while (0)
#endif

/* Enqueue one GSO segment @skb.  The last segment (skb->next == NULL)
 * reuses @entry itself; every other segment gets its own copy of the
 * entry.  Returns 0 on success or a negative error; on failure the copy
 * is freed, or @entry->skb is restored to the original GSO skb.
 */
static int
__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
			   struct sk_buff *skb, struct nf_queue_entry *entry)
{
	int ret = -ENOMEM;
	struct nf_queue_entry *entry_seg;

	nf_bridge_adjust_segmented_data(skb);

	if (skb->next == NULL) { /* last packet, no need to copy entry */
		struct sk_buff *gso_skb = entry->skb;
		entry->skb = skb;
		ret = __nfqnl_enqueue_packet(net, queue, entry);
		if (ret)
			entry->skb = gso_skb;
		return ret;
	}

	skb_mark_not_on_list(skb);

	entry_seg = nf_queue_entry_dup(entry);
	if (entry_seg) {
		entry_seg->skb = skb;
		ret = __nfqnl_enqueue_packet(net, queue, entry_seg);
		if (ret)
			nf_queue_entry_free(entry_seg);
	}
	return ret;
}

/* Queue handler entry point (installed as nfqh.outfn): hand @entry to the
 * queue instance @queuenum.  GSO packets are segmented first unless the
 * queue set NFQA_CFG_F_GSO and the packet is not SCTP GSO.
 *
 * Returns 0 when at least one packet/segment was queued, -ESRCH when the
 * queue does not exist, -EINVAL when its copy mode is NFQNL_COPY_NONE,
 * otherwise a negative error.
 */
static int
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
	unsigned int queued;
	struct nfqnl_instance *queue;
	struct sk_buff *skb, *segs, *nskb;
	int err = -ENOBUFS;
	struct net *net = entry->state.net;
	struct nfnl_queue_net *q = nfnl_queue_pernet(net);

	/* rcu_read_lock()ed by nf_hook_thresh */
queue = instance_lookup(q, queuenum);
	if (!queue)
		return -ESRCH;

	if (queue->copy_mode == NFQNL_COPY_NONE)
		return -EINVAL;

	skb = entry->skb;

	switch (entry->state.pf) {
	case NFPROTO_IPV4:
		skb->protocol = htons(ETH_P_IP);
		break;
	case NFPROTO_IPV6:
		skb->protocol = htons(ETH_P_IPV6);
		break;
	}

	if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
		return __nfqnl_enqueue_packet(net, queue, entry);

	nf_bridge_adjust_skb_data(skb);
	segs = skb_gso_segment(skb, 0);
	/* Does not use PTR_ERR to limit the number of error codes that can be
	 * returned by nf_queue.  For instance, callers rely on -ESRCH to
	 * mean 'ignore this hook'.
	 */
	if (IS_ERR_OR_NULL(segs))
		goto out_err;
	queued = 0;
	err = 0;
	/* after the first failure the remaining segments are just freed */
	skb_list_walk_safe(segs, segs, nskb) {
		if (err == 0)
			err = __nfqnl_enqueue_packet_gso(net, queue,
							segs, entry);
		if (err == 0)
			queued++;
		else
			kfree_skb(segs);
	}

	if (queued) {
		if (err) /* some segments are already queued */
			nf_queue_entry_free(entry);
		kfree_skb(skb);
		return 0;
	}
 out_err:
	nf_bridge_adjust_segmented_data(skb);
	return err;
}

/* Replace the contents of e->skb with the @data_len bytes at @data.
 * @diff is the length change relative to the current skb: the skb is
 * trimmed when negative and grown (reallocated if the tailroom is too
 * small) when positive.  The checksum state is reset to CHECKSUM_NONE.
 *
 * Returns 0 on success, -EINVAL for a length that is shorter than the
 * transport offset or larger than 0xFFFF, -ENOMEM when the skb cannot be
 * trimmed, expanded or made writable.
 */
static int
nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
	struct sk_buff *nskb;

	if (diff < 0) {
		unsigned int min_len = skb_transport_offset(e->skb);

		if (data_len < min_len)
			return -EINVAL;

		if (pskb_trim(e->skb, data_len))
			return -ENOMEM;
	} else if (diff > 0) {
		if (data_len > 0xFFFF)
			return -EINVAL;
		if (diff > skb_tailroom(e->skb)) {
			nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
					       diff, GFP_ATOMIC);
			if (!nskb)
				return -ENOMEM;
			kfree_skb(e->skb);
			e->skb = nskb;
		}
		skb_put(e->skb, diff);
	}
	if (skb_ensure_writable(e->skb, data_len))
		return -ENOMEM;
	skb_copy_to_linear_data(e->skb, data, data_len);
	e->skb->ip_summed = CHECKSUM_NONE;
	return 0;
}

/* Set copy mode and copy range of @queue under queue->lock.  Returns 0, or
 * -EINVAL for an unknown @mode.
 */
static int
nfqnl_set_mode(struct nfqnl_instance *queue,
	       unsigned char mode, unsigned int range)
{
	int status = 0;

	spin_lock_bh(&queue->lock);
	switch (mode) {
	case NFQNL_COPY_NONE:
	case NFQNL_COPY_META:
		queue->copy_mode = mode;
		queue->copy_range = 0;
		break;

case NFQNL_COPY_PACKET:
		queue->copy_mode = mode;
		/* 0 or an oversized range both select the maximum */
		if (range == 0 || range > NFQNL_MAX_COPY_RANGE)
			queue->copy_range = NFQNL_MAX_COPY_RANGE;
		else
			queue->copy_range = range;
		break;

	default:
		status = -EINVAL;

	}
	spin_unlock_bh(&queue->lock);

	return status;
}

/* nfqnl_cmpfn: return 1 when @entry's input or output device (or, with
 * CONFIG_BRIDGE_NETFILTER, its physical in/out interface) has index
 * @ifindex, 0 otherwise.
 */
static int
dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	int physinif, physoutif;

	physinif = nf_bridge_get_physinif(entry->skb);
	physoutif = nf_bridge_get_physoutif(entry->skb);

	if (physinif == ifindex || physoutif == ifindex)
		return 1;
#endif
	if (entry->state.in)
		if (entry->state.in->ifindex == ifindex)
			return 1;
	if (entry->state.out)
		if (entry->state.out->ifindex == ifindex)
			return 1;

	return 0;
}

/* drop all packets with either indev or outdev == ifindex from all queue
 * instances */
static void
nfqnl_dev_drop(struct net *net, int ifindex)
{
	int i;
	struct nfnl_queue_net *q = nfnl_queue_pernet(net);

	rcu_read_lock();

	for (i = 0; i < INSTANCE_BUCKETS; i++) {
		struct nfqnl_instance *inst;
		struct hlist_head *head = &q->instance_table[i];

		hlist_for_each_entry_rcu(inst, head, hlist)
			nfqnl_flush(inst, dev_cmp, ifindex);
	}

	rcu_read_unlock();
}

/* Netdevice notifier: on NETDEV_DOWN flush the packets that reference the
 * device.  Always returns NOTIFY_DONE.
 */
static int
nfqnl_rcv_dev_event(struct notifier_block *this,
		    unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	/* Drop any packets associated with the downed device */
	if (event == NETDEV_DOWN)
		nfqnl_dev_drop(dev_net(dev), dev->ifindex);
	return NOTIFY_DONE;
}

static struct notifier_block nfqnl_dev_notifier = {
	.notifier_call	= nfqnl_rcv_dev_event,
};

/* nf_queue_handler callback (nfqh.nf_hook_drop): flush all queued entries
 * of every instance in @net.
 */
static void nfqnl_nf_hook_drop(struct net *net)
{
	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
	int i;

	/* This function is also called on net namespace error unwind,
	 * when pernet_ops->init() failed and ->exit() functions of the
	 * previous pernet_ops gets called.
	 *
	 * This may result in a call to nfqnl_nf_hook_drop() before
	 * struct nfnl_queue_net was allocated.
	 
*/
	if (!q)
		return;

	for (i = 0; i < INSTANCE_BUCKETS; i++) {
		struct nfqnl_instance *inst;
		struct hlist_head *head = &q->instance_table[i];

		hlist_for_each_entry_rcu(inst, head, hlist)
			nfqnl_flush(inst, NULL, 0);
	}
}

/* Netlink notifier: when a NETLINK_NETFILTER socket is released, destroy
 * every queue instance bound to its portid.  Always returns NOTIFY_DONE.
 */
static int
nfqnl_rcv_nl_event(struct notifier_block *this,
		   unsigned long event, void *ptr)
{
	struct netlink_notify *n = ptr;
	struct nfnl_queue_net *q = nfnl_queue_pernet(n->net);

	if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
		int i;

		/* destroy all instances for this portid */
		spin_lock(&q->instances_lock);
		for (i = 0; i < INSTANCE_BUCKETS; i++) {
			struct hlist_node *t2;
			struct nfqnl_instance *inst;
			struct hlist_head *head = &q->instance_table[i];

			/* _safe variant: __instance_destroy() unlinks inst */
			hlist_for_each_entry_safe(inst, t2, head, hlist) {
				if (n->portid == inst->peer_portid)
					__instance_destroy(inst);
			}
		}
		spin_unlock(&q->instances_lock);
	}
	return NOTIFY_DONE;
}

static struct notifier_block nfqnl_rtnl_notifier = {
	.notifier_call	= nfqnl_rcv_nl_event,
};

/* Policy for the attributes nested inside NFQA_VLAN. */
static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = {
	[NFQA_VLAN_TCI]		= { .type = NLA_U16},
	[NFQA_VLAN_PROTO]	= { .type = NLA_U16},
};

/* Attribute policy for verdict messages. */
static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
	[NFQA_VERDICT_HDR]	= { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
	[NFQA_MARK]		= { .type = NLA_U32 },
	[NFQA_PAYLOAD]		= { .type = NLA_UNSPEC },
	[NFQA_CT]		= { .type = NLA_UNSPEC },
	[NFQA_EXP]		= { .type = NLA_UNSPEC },
	[NFQA_VLAN]		= { .type = NLA_NESTED },
	[NFQA_PRIORITY]		= { .type = NLA_U32 },
};

/* Attribute policy for batch verdict messages. */
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
	[NFQA_VERDICT_HDR]	= { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
	[NFQA_MARK]		= { .type = NLA_U32 },
	[NFQA_PRIORITY]		= { .type = NLA_U32 },
};

/* Look up queue @queue_num and check that it belongs to @nlportid.
 * Returns the instance, ERR_PTR(-ENODEV) when it does not exist or
 * ERR_PTR(-EPERM) when it is bound to a different portid.
 */
static struct nfqnl_instance *
verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid)
{
	struct nfqnl_instance *queue;

	queue = instance_lookup(q, queue_num);
	if (!queue)
		return ERR_PTR(-ENODEV);

	if (queue->peer_portid != nlportid)
		return ERR_PTR(-EPERM);

	return queue;
}

/* Return the verdict header carried in @nfqa, or NULL when it is missing
 * or its verdict is out of range or NF_STOLEN.
 */
static struct
nfqnl_msg_verdict_hdr*
verdicthdr_get(const struct nlattr * const nfqa[])
{
	struct nfqnl_msg_verdict_hdr *vhdr;
	unsigned int verdict;

	if (!nfqa[NFQA_VERDICT_HDR])
		return NULL;

	vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]);
	/* only the bits selected by NF_VERDICT_MASK are range checked */
	verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK;
	if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN)
		return NULL;
	return vhdr;
}

/* Return non-zero when @id is after @max; the signed difference keeps the
 * comparison correct across wrap-around of the unsigned id counter.
 */
static int nfq_id_after(unsigned int id, unsigned int max)
{
	return (int)(id - max) > 0;
}

/* Handle a batch verdict: every queued entry up to and including the id
 * in the verdict header receives the same verdict and, if present, the
 * same mark and priority.
 *
 * Returns 0 on success, -EINVAL for a bad verdict header, -ENOENT when no
 * entry matched, or the error from verdict_instance_lookup().
 */
static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
				    const struct nfnl_info *info,
				    const struct nlattr * const nfqa[])
{
	struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
	u16 queue_num = ntohs(info->nfmsg->res_id);
	struct nf_queue_entry *entry, *tmp;
	struct nfqnl_msg_verdict_hdr *vhdr;
	struct nfqnl_instance *queue;
	unsigned int verdict, maxid;
	LIST_HEAD(batch_list);

	queue = verdict_instance_lookup(q, queue_num,
					NETLINK_CB(skb).portid);
	if (IS_ERR(queue))
		return PTR_ERR(queue);

	vhdr = verdicthdr_get(nfqa);
	if (!vhdr)
		return -EINVAL;

	verdict = ntohl(vhdr->verdict);
	maxid = ntohl(vhdr->id);

	/* Move the matching entries to a private list under the lock; they
	 * are reinjected after the lock has been dropped.
	 */
	spin_lock_bh(&queue->lock);

	list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) {
		if (nfq_id_after(entry->id, maxid))
			break;
		__dequeue_entry(queue, entry);
		list_add_tail(&entry->list, &batch_list);
	}

	spin_unlock_bh(&queue->lock);

	if (list_empty(&batch_list))
		return -ENOENT;

	list_for_each_entry_safe(entry, tmp, &batch_list, list) {
		if (nfqa[NFQA_MARK])
			entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));

		if (nfqa[NFQA_PRIORITY])
			entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));

		nfqnl_reinject(entry, verdict);
	}
	return 0;
}

/* Apply the NFQA_CT attribute (and NFQA_EXP, if present) to the conntrack
 * entry attached to @entry's skb.  Returns the conntrack entry, or NULL
 * when there is none, parsing fails, or CONFIG_NF_CONNTRACK is disabled.
 */
static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct,
				      const struct nlmsghdr *nlh,
				      const struct nlattr * const nfqa[],
				      struct nf_queue_entry *entry,
				      enum ip_conntrack_info *ctinfo)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	struct nf_conn *ct;

	ct = nf_ct_get(entry->skb, ctinfo);
	if (ct == NULL)
		return NULL;

	if (nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0)
		return NULL;

	if
(nfqa[NFQA_EXP]) nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct, NETLINK_CB(entry->skb).portid, nlmsg_report(nlh)); return ct; #else return NULL; #endif } static int nfqa_parse_bridge(struct nf_queue_entry *entry, const struct nlattr * const nfqa[]) { if (nfqa[NFQA_VLAN]) { struct nlattr *tb[NFQA_VLAN_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN], nfqa_vlan_policy, NULL); if (err < 0) return err; if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) return -EINVAL; __vlan_hwaccel_put_tag(entry->skb, nla_get_be16(tb[NFQA_VLAN_PROTO]), ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]))); } if (nfqa[NFQA_L2HDR]) { int mac_header_len = entry->skb->network_header - entry->skb->mac_header; if (mac_header_len != nla_len(nfqa[NFQA_L2HDR])) return -EINVAL; else if (mac_header_len > 0) memcpy(skb_mac_header(entry->skb), nla_data(nfqa[NFQA_L2HDR]), mac_header_len); } return 0; } static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); const struct nfnl_ct_hook *nfnl_ct; struct nfqnl_msg_verdict_hdr *vhdr; enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; struct nf_queue_entry *entry; struct nf_conn *ct = NULL; unsigned int verdict; int err; queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); vhdr = verdicthdr_get(nfqa); if (!vhdr) return -EINVAL; verdict = ntohl(vhdr->verdict); entry = find_dequeue_entry(queue, ntohl(vhdr->id)); if (entry == NULL) return -ENOENT; /* rcu lock already held from nfnl->call_rcu. 
*/ nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfqa[NFQA_CT]) { if (nfnl_ct != NULL) ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry, &ctinfo); } if (entry->state.pf == PF_BRIDGE) { err = nfqa_parse_bridge(entry, nfqa); if (err < 0) return err; } if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); int diff = payload_len - entry->skb->len; if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), payload_len, entry, diff) < 0) verdict = NF_DROP; if (ct && diff) nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff); } if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); if (nfqa[NFQA_PRIORITY]) entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY])); nfqnl_reinject(entry, verdict); return 0; } static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { return -ENOTSUPP; } static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 }, [NFQA_CFG_MASK] = { .type = NLA_U32 }, [NFQA_CFG_FLAGS] = { .type = NLA_U32 }, }; static const struct nf_queue_handler nfqh = { .outfn = nfqnl_enqueue_packet, .nf_hook_drop = nfqnl_nf_hook_drop, }; static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; int ret = 0; if (nfqa[NFQA_CFG_CMD]) { cmd = nla_data(nfqa[NFQA_CFG_CMD]); /* Obsolete commands without queue context */ switch (cmd->command) { case NFQNL_CFG_CMD_PF_BIND: return 0; case NFQNL_CFG_CMD_PF_UNBIND: return 0; } } /* Check if we support these flags in first place, dependencies should * be there too not to break atomicity. 
*/ if (nfqa[NFQA_CFG_FLAGS]) { if (!nfqa[NFQA_CFG_MASK]) { /* A mask is needed to specify which flags are being * changed. */ return -EINVAL; } flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); if (flags >= NFQA_CFG_F_MAX) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NETWORK_SECMARK) if (flags & mask & NFQA_CFG_F_SECCTX) return -EOPNOTSUPP; #endif if ((flags & mask & NFQA_CFG_F_CONNTRACK) && !rcu_access_pointer(nfnl_ct_hook)) { #ifdef CONFIG_MODULES nfnl_unlock(NFNL_SUBSYS_QUEUE); request_module("ip_conntrack_netlink"); nfnl_lock(NFNL_SUBSYS_QUEUE); if (rcu_access_pointer(nfnl_ct_hook)) return -EAGAIN; #endif return -EOPNOTSUPP; } } rcu_read_lock(); queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto err_out_unlock; } if (cmd != NULL) { switch (cmd->command) { case NFQNL_CFG_CMD_BIND: if (queue) { ret = -EBUSY; goto err_out_unlock; } queue = instance_create(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) { ret = PTR_ERR(queue); goto err_out_unlock; } break; case NFQNL_CFG_CMD_UNBIND: if (!queue) { ret = -ENODEV; goto err_out_unlock; } instance_destroy(q, queue); goto err_out_unlock; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: break; default: ret = -ENOTSUPP; goto err_out_unlock; } } if (!queue) { ret = -ENODEV; goto err_out_unlock; } if (nfqa[NFQA_CFG_PARAMS]) { struct nfqnl_msg_config_params *params = nla_data(nfqa[NFQA_CFG_PARAMS]); nfqnl_set_mode(queue, params->copy_mode, ntohl(params->copy_range)); } if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { __be32 *queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); spin_lock_bh(&queue->lock); queue->queue_maxlen = ntohl(*queue_maxlen); spin_unlock_bh(&queue->lock); } if (nfqa[NFQA_CFG_FLAGS]) { spin_lock_bh(&queue->lock); queue->flags &= ~mask; queue->flags |= flags & mask; spin_unlock_bh(&queue->lock); } err_out_unlock: rcu_read_unlock(); return ret; } static const struct nfnl_callback 
nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_policy }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, .type = NFNL_CB_MUTEX, .attr_count = NFQA_CFG_MAX, .policy = nfqa_cfg_policy }, [NFQNL_MSG_VERDICT_BATCH] = { .call = nfqnl_recv_verdict_batch, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_batch_policy }, }; static const struct nfnetlink_subsystem nfqnl_subsys = { .name = "nf_queue", .subsys_id = NFNL_SUBSYS_QUEUE, .cb_count = NFQNL_MSG_MAX, .cb = nfqnl_cb, }; #ifdef CONFIG_PROC_FS struct iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct seq_file *seq) { struct iter_state *st = seq->private; struct net *net; struct nfnl_queue_net *q; if (!st) return NULL; net = seq_file_net(seq); q = nfnl_queue_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { if (!hlist_empty(&q->instance_table[st->bucket])) return q->instance_table[st->bucket].first; } return NULL; } static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) { struct iter_state *st = seq->private; struct net *net = seq_file_net(seq); h = h->next; while (!h) { struct nfnl_queue_net *q; if (++st->bucket >= INSTANCE_BUCKETS) return NULL; q = nfnl_queue_pernet(net); h = q->instance_table[st->bucket].first; } return h; } static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head; head = get_first(seq); if (head) while (pos && (head = get_next(seq, head))) pos--; return pos ? 
NULL : head; } static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); return get_idx(s, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return get_next(s, v); } static void seq_stop(struct seq_file *s, void *v) __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); } static int seq_show(struct seq_file *s, void *v) { const struct nfqnl_instance *inst = v; seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", inst->queue_num, inst->peer_portid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, inst->id_sequence, 1); return 0; } static const struct seq_operations nfqnl_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, .show = seq_show, }; #endif /* PROC_FS */ static int __net_init nfnl_queue_net_init(struct net *net) { unsigned int i; struct nfnl_queue_net *q = nfnl_queue_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&q->instance_table[i]); spin_lock_init(&q->instances_lock); #ifdef CONFIG_PROC_FS if (!proc_create_net("nfnetlink_queue", 0440, net->nf.proc_netfilter, &nfqnl_seq_ops, sizeof(struct iter_state))) return -ENOMEM; #endif return 0; } static void __net_exit nfnl_queue_net_exit(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); unsigned int i; #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif for (i = 0; i < INSTANCE_BUCKETS; i++) WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } static struct pernet_operations nfnl_queue_net_ops = { .init = nfnl_queue_net_init, .exit = nfnl_queue_net_exit, .id = &nfnl_queue_net_id, .size = sizeof(struct nfnl_queue_net), }; static int __init nfnetlink_queue_init(void) { int status; status = register_pernet_subsys(&nfnl_queue_net_ops); 
if (status < 0) { pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = register_netdevice_notifier(&nfqnl_dev_notifier); if (status < 0) { pr_err("failed to register netdevice notifier\n"); goto cleanup_netlink_subsys; } nf_register_queue_handler(&nfqh); return status; cleanup_netlink_subsys: nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); out: return status; } static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_DESCRIPTION("netfilter packet queue handler"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); module_init(nfnetlink_queue_init); module_exit(nfnetlink_queue_fini);
74 5 73 9 50 74 5 74 21 17 3 1 1 5 21 21 27 30 30 9 20 27 16 1 8 2 3 3 8 2 4 17 17 12 3 10 3 11 8 1 6 445 442 423 15 429 423 18 1 14 3 12 5 10 7 9 8 8 9 14 3 13 13 13 5 5 604 666 4 1 1 1 664 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux NET3:	IP/IP protocol decoder modified to support
 *			virtual tunnel interface
 *
 *	Authors:
 *		Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
 */

/*
   This version of net/ipv4/ip_vti.c is a clone of net/ipv4/ipip.c

   For comments look at net/ipv4/ip_gre.c --ANK
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
#include <linux/icmpv6.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* Forward declaration; the full initialiser is near the end of the file. */
static struct rtnl_link_ops vti_link_ops __read_mostly;

static unsigned int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);

/* Common receive path.
 *
 * Looks up the tunnel matching the receiving device and the outer
 * saddr/daddr (with the NO_KEY flag set).  When one is found and the xfrm
 * policy check passes, the tunnel is recorded in the skb control block,
 * skb->dev is optionally switched to the tunnel device, and the skb is
 * passed to xfrm_input().
 *
 * Returns the xfrm_input() result, 0 after freeing the skb on a failed
 * policy check, or -EINVAL when no tunnel matches (skb left untouched).
 */
static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
		     int encap_type, bool update_skb_dev)
{
	struct ip_tunnel *tunnel;
	const struct iphdr *iph = ip_hdr(skb);
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };

	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);

	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
				  iph->saddr, iph->daddr, 0);
	if (tunnel) {
		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
			goto drop;

		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;

		if (update_skb_dev)
			skb->dev = tunnel->dev;

		return xfrm_input(skb, nexthdr, spi, encap_type);
	}

	return -EINVAL;
drop:
	kfree_skb(skb);
	return 0;
}

/* xfrm4_protocol ->input_handler: vti_input() without switching skb->dev. */
static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
			   int encap_type)
{
	return vti_input(skb, nexthdr, spi, encap_type, false);
}

/* Fill in the xfrm SPI control block for IPv4 and feed the skb to
 * vti_input() using the protocol from the outer IP header.
 */
static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev)
{
	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);

	return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev);
}

/* xfrm4_protocol ->handler: vti_rcv() with spi 0 and no skb->dev switch. */
static int vti_rcv_proto(struct sk_buff *skb)
{
	return vti_rcv(skb, 0, false);
}

/* Callback run with the xfrm input result @err.
 *
 * Returns 1 when the skb was not tagged with a tunnel by vti_input().
 * On @err the device rx_errors and rx_dropped counters are bumped and 0 is
 * returned.  Otherwise the inner family is determined, the policy check is
 * run with skb->mark temporarily replaced by the tunnel's input key, and on
 * success the skb is scrubbed, moved to the tunnel device and accounted.
 *
 * Returns 0 on success, -EINVAL when no inner mode can be determined, or
 * -EPERM when the policy check fails.
 */
static int vti_rcv_cb(struct sk_buff *skb, int err)
{
	unsigned short family;
	struct net_device *dev;
	struct xfrm_state *x;
	const struct xfrm_mode *inner_mode;
	struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
	u32 orig_mark = skb->mark;
	int ret;

	if (!tunnel)
		return 1;

	dev = tunnel->dev;

	if (err) {
		DEV_STATS_INC(dev, rx_errors);
		DEV_STATS_INC(dev, rx_dropped);

		return 0;
	}

	x = xfrm_input_state(skb);

	inner_mode = &x->inner_mode;

	/* Selector family unspecified: derive the mode from the packet. */
	if (x->sel.family == AF_UNSPEC) {
		inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
		if (inner_mode == NULL) {
			XFRM_INC_STATS(dev_net(skb->dev),
				       LINUX_MIB_XFRMINSTATEMODEERROR);
			return -EINVAL;
		}
	}

	family = inner_mode->family;

	/* Policy lookup uses the tunnel input key as mark; restore after. */
	skb->mark = be32_to_cpu(tunnel->parms.i_key);
	ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
	skb->mark = orig_mark;

	if (!ret)
		return -EPERM;

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
	skb->dev = dev;
	dev_sw_netstats_rx_add(dev, skb->len);

	return 0;
}

/* Check that xfrm state @x is usable for a tunnel with endpoints @dst/@src.
 *
 * With @dst == 0 only the source address is compared against the state's
 * saddr; otherwise both addresses go through xfrm_state_addr_check().
 */
static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
{
	xfrm_address_t *daddr = (xfrm_address_t *)&dst;
	xfrm_address_t *saddr = (xfrm_address_t *)&src;

	/* if there is no transform then this tunnel is not functional.
	 * Or if the xfrm is not mode tunnel.
	 */
	if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
	    x->props.family != AF_INET)
		return false;

	if (!dst)
		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);

	if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
		return false;

	return true;
}

/* Transmit @skb through the tunnel using flow @fl.
 *
 * If the skb has no dst yet, one is obtained by an IPv4 or (when enabled)
 * IPv6 route lookup bound to the tunnel device.  The dst is then run through
 * xfrm_lookup_route(), validated with vti_state_check(), checked against
 * looping back onto this device and against the path MTU, and finally sent
 * with dst_output().
 *
 * Always returns NETDEV_TX_OK; on the error paths the skb is freed here and
 * tx_errors is incremented.
 */
static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
			    struct flowi *fl)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_parm_kern *parms = &tunnel->parms;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *tdev;	/* Device to other host */
	int pkt_len = skb->len;
	int err;
	int mtu;

	if (!dst) {
		switch (skb->protocol) {
		case htons(ETH_P_IP): {
			struct rtable *rt;

			fl->u.ip4.flowi4_oif = dev->ifindex;
			fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
			rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
			if (IS_ERR(rt)) {
				DEV_STATS_INC(dev, tx_carrier_errors);
				goto tx_error_icmp;
			}
			dst = &rt->dst;
			skb_dst_set(skb, dst);
			break;
		}
#if IS_ENABLED(CONFIG_IPV6)
		case htons(ETH_P_IPV6):
			fl->u.ip6.flowi6_oif = dev->ifindex;
			fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
			dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
			/* Failure is reported via dst->error, not a NULL. */
			if (dst->error) {
				dst_release(dst);
				dst = NULL;
				DEV_STATS_INC(dev, tx_carrier_errors);
				goto tx_error_icmp;
			}
			skb_dst_set(skb, dst);
			break;
#endif
		default:
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error_icmp;
		}
	}

	/* Take an extra reference before handing dst to the xfrm lookup. */
	dst_hold(dst);
	dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0);
	if (IS_ERR(dst)) {
		DEV_STATS_INC(dev, tx_carrier_errors);
		goto tx_error_icmp;
	}

	/* Queueing dst: skip the state, loop and MTU checks below. */
	if (dst->flags & DST_XFRM_QUEUE)
		goto xmit;

	if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
		DEV_STATS_INC(dev, tx_carrier_errors);
		dst_release(dst);
		goto tx_error_icmp;
	}

	tdev = dst->dev;

	/* The route points back at this very device. */
	if (tdev == dev) {
		dst_release(dst);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	mtu = dst_mtu(dst);
	if (skb->len > mtu) {
		skb_dst_update_pmtu_no_confirm(skb, mtu);
		if (skb->protocol == htons(ETH_P_IP)) {
			/* DF not set: transmit anyway. */
			if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
				goto xmit;
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				      htonl(mtu));
		} else {
			if (mtu < IPV6_MIN_MTU)
				mtu = IPV6_MIN_MTU;

			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		}

		dst_release(dst);
		goto tx_error;
	}

xmit:
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
	skb_dst_set(skb, dst);
	skb->dev = skb_dst(skb)->dev;

	err = dst_output(tunnel->net, skb->sk, skb);
	/* On success report the original packet length to the stats. */
	if (net_xmit_eval(err) == 0)
		err = pkt_len;
	iptunnel_xmit_stats(dev, err);
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

/* This function assumes it is being called from dev_queue_xmit()
 * and that skb is filled properly by that function.
 *
 * ndo_start_xmit handler: builds the flow from the packet (IPv4 or IPv6
 * only), overrides its mark with the tunnel output key and calls vti_xmit().
 * Packets that fail the header pull or use another protocol are freed and
 * counted in tx_errors.
 */
static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct flowi fl;

	if (!pskb_inet_may_pull(skb))
		goto tx_err;

	memset(&fl, 0, sizeof(fl));

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
		xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET);
		break;
	case htons(ETH_P_IPV6):
		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
		xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6);
		break;
	default:
		goto tx_err;
	}

	/* override mark with tunnel output key */
	fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key);

	return vti_xmit(skb, dev, &fl);

tx_err:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

/* ICMP error handler for ESP/AH/IPCOMP packets belonging to a vti tunnel.
 *
 * skb->data is read as the IP header of the offending packet.  Only
 * ICMP_DEST_UNREACH/ICMP_FRAG_NEEDED and ICMP_REDIRECT are acted upon, and
 * only if an xfrm state exists for the tunnel's output key, the packet's
 * destination, SPI and protocol.
 *
 * Returns -1 when no tunnel matches, 0 otherwise.
 */
static int vti4_err(struct sk_buff *skb, u32 info)
{
	__be32 spi;
	__u32 mark;
	struct xfrm_state *x;
	struct ip_tunnel *tunnel;
	struct ip_esp_hdr *esph;
	struct ip_auth_hdr *ah ;
	struct ip_comp_hdr *ipch;
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	int protocol = iph->protocol;
	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };

	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);

	/* Addresses are passed in the opposite order to vti_input(). */
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
				  iph->daddr, iph->saddr, 0);
	if (!tunnel)
		return -1;

	mark = be32_to_cpu(tunnel->parms.o_key);

	/* The transform header follows the IP header (ihl is in 32-bit words). */
	switch (protocol) {
	case IPPROTO_ESP:
		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
		spi = esph->spi;
		break;
	case IPPROTO_AH:
		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
		spi = ah->spi;
		break;
	case IPPROTO_COMP:
		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
		spi = htonl(ntohs(ipch->cpi));
		break;
	default:
		return 0;
	}

	switch (icmp_hdr(skb)->type) {
	case ICMP_DEST_UNREACH:
		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
			return 0;
		break;
	case ICMP_REDIRECT:
		break;
	default:
		return 0;
	}

	x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
			      spi, protocol, AF_INET);
	if (!x)
		return 0;

	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
		ipv4_update_pmtu(skb, net, info, 0, protocol);
	else
		ipv4_redirect(skb, net, 0, protocol);
	xfrm_state_put(x);

	return 0;
}

/* ndo_tunnel_ctl handler.
 *
 * For add/change the IP header template must describe IPv4/IPIP with
 * ihl == 5.  Flags must fit the legacy be16 representation.  Keys are
 * cleared unless GRE_KEY is set in the matching flags, i_flags is replaced
 * by the VTI flag for the ip_tunnel_ctl() call, and afterwards (except on
 * delete) GRE_KEY is OR-ed into both flag sets returned in @p.
 *
 * Returns 0, -EINVAL, -EOVERFLOW or the ip_tunnel_ctl() error.
 */
static int
vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
{
	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
	int err = 0;

	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP ||
		    p->iph.ihl != 5)
			return -EINVAL;
	}

	if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
	    !ip_tunnel_flags_is_be16_compat(p->o_flags))
		return -EOVERFLOW;

	if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY))
		p->i_key = 0;
	if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY))
		p->o_key = 0;

	__set_bit(IP_TUNNEL_VTI_BIT, flags);
	ip_tunnel_flags_copy(p->i_flags, flags);

	err = ip_tunnel_ctl(dev, p, cmd);
	if (err)
		return err;

	if (cmd != SIOCDELTUNNEL) {
		ip_tunnel_flags_from_be16(flags, GRE_KEY);
		ip_tunnel_flags_or(p->i_flags, p->i_flags, flags);
		ip_tunnel_flags_or(p->o_flags, p->o_flags, flags);
	}

	return 0;
}

static const struct net_device_ops vti_netdev_ops = {
	.ndo_init	= vti_tunnel_init,
	.ndo_uninit	= ip_tunnel_uninit,
	.ndo_start_xmit	= vti_tunnel_xmit,
	.ndo_siocdevprivate = ip_tunnel_siocdevprivate,
	.ndo_change_mtu	= ip_tunnel_change_mtu,
	.ndo_get_stats64 = dev_get_tstats64,
	.ndo_get_iflink = ip_tunnel_get_iflink,
	.ndo_tunnel_ctl	= vti_tunnel_ctl,
};

/* rtnl_link_ops ->setup: install the vti ops and generic tunnel setup. */
static void vti_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &vti_netdev_ops;
	dev->header_ops		= &ip_tunnel_header_ops;
	dev->type		= ARPHRD_TUNNEL;
	ip_tunnel_setup(dev, vti_net_id);
}

/* ndo_init: the device address and broadcast address are taken from the
 * tunnel's local and remote IPv4 addresses (4 bytes each).
 */
static int vti_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__dev_addr_set(dev, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	dev->addr_len		= 4;
	dev->lltx		= true;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

/* Fill the IP header template of the per-netns fallback device. */
static void __net_init vti_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	iph->version		= 4;
	iph->protocol		= IPPROTO_IPIP;
	iph->ihl		= 5;
}

/* The ESP, AH and IPCOMP hooks share the same handlers and priority. */
static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
	.handler	=	vti_rcv_proto,
	.input_handler	=	vti_input_proto,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
	.handler	=	vti_rcv_proto,
	.input_handler	=	vti_input_proto,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
	.handler	=	vti_rcv_proto,
	.input_handler	=	vti_input_proto,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
/* xfrm_tunnel ->handler: like vti_rcv(), but with next header IPPROTO_IPIP
 * and the outer source address passed in the spi argument.
 */
static int vti_rcv_tunnel(struct sk_buff *skb)
{
	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);

	return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false);
}

static struct xfrm_tunnel vti_ipip_handler __read_mostly = {
	.handler	=	vti_rcv_tunnel,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	0,
};

#if IS_ENABLED(CONFIG_IPV6)
static struct xfrm_tunnel vti_ipip6_handler __read_mostly = {
	.handler	=	vti_rcv_tunnel,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	0,
};
#endif
#endif

/* Per-netns init: set up the tunnel table with fallback name "ip_vti0" and,
 * if a fallback device was created, initialise its header template.
 */
static int __net_init vti_init_net(struct net *net)
{
	int err;
	struct ip_tunnel_net *itn;

	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
	if (err)
		return err;
	itn = net_generic(net, vti_net_id);
	if (itn->fb_tunnel_dev)
		vti_fb_tunnel_init(itn->fb_tunnel_dev);
	return 0;
}

/* Per-netns exit: delegate to ip_tunnel_delete_net(). */
static void __net_exit vti_exit_rtnl(struct net *net,
				     struct list_head *dev_to_kill)
{
	ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill);
}

static struct pernet_operations vti_net_ops = {
	.init = vti_init_net,
	.exit_rtnl = vti_exit_rtnl,
	.id   = &vti_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

/* rtnl_link_ops ->validate: accepts everything. */
static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
			       struct netlink_ext_ack *extack)
{
	return 0;
}

/* Translate IFLA_VTI_* attributes into tunnel parameters.
 *
 * @parms is zeroed and its protocol set to IPPROTO_IPIP even when @data is
 * NULL.  *@fwmark is only written when IFLA_VTI_FWMARK is present, so the
 * caller's initial value acts as the default.
 */
static void vti_netlink_parms(struct nlattr *data[],
			      struct ip_tunnel_parm_kern *parms,
			      __u32 *fwmark)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_IPIP;

	if (!data)
		return;

	__set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags);

	if (data[IFLA_VTI_LINK])
		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);

	if (data[IFLA_VTI_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);

	if (data[IFLA_VTI_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);

	if (data[IFLA_VTI_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]);

	if (data[IFLA_VTI_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]);

	if (data[IFLA_VTI_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
}

/* rtnl_link_ops ->newlink: parse the attributes (fwmark defaults to 0) and
 * create the tunnel in params->link_net, or in the device's netns when that
 * is NULL.
 */
static int vti_newlink(struct net_device *dev,
		       struct rtnl_newlink_params *params,
		       struct netlink_ext_ack *extack)
{
	struct nlattr **data = params->data;
	struct ip_tunnel_parm_kern parms;
	struct nlattr **tb = params->tb;
	__u32 fwmark = 0;

	vti_netlink_parms(data, &parms, &fwmark);
	return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb,
				 &parms, fwmark);
}

/* rtnl_link_ops ->changelink: parse the attributes (fwmark defaults to the
 * tunnel's current value) and apply them.
 */
static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
			  struct nlattr *data[],
			  struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm_kern p;
	__u32 fwmark = t->fwmark;

	vti_netlink_parms(data, &p, &fwmark);
	return ip_tunnel_changelink(dev, tb, &p, fwmark);
}

/* Space needed by vti_fill_info(): six 4-byte attributes. */
static size_t vti_get_size(const struct net_device *dev)
{
	return
		/* IFLA_VTI_LINK */
		nla_total_size(4) +
		/* IFLA_VTI_IKEY */
		nla_total_size(4) +
		/* IFLA_VTI_OKEY */
		nla_total_size(4) +
		/* IFLA_VTI_LOCAL */
		nla_total_size(4) +
		/* IFLA_VTI_REMOTE */
		nla_total_size(4) +
		/* IFLA_VTI_FWMARK */
		nla_total_size(4) +
		0;
}

/* Dump the tunnel parameters as IFLA_VTI_* attributes.
 * Returns 0, or -EMSGSIZE when any attribute does not fit.
 */
static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm_kern *p = &t->parms;

	if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) ||
	    nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr) ||
	    nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark))
		return -EMSGSIZE;

	return 0;
}

static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
	[IFLA_VTI_LINK]		= { .type = NLA_U32 },
	[IFLA_VTI_IKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_OKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
	[IFLA_VTI_REMOTE]	= { .len = sizeof_field(struct iphdr, daddr) },
	[IFLA_VTI_FWMARK]	= { .type = NLA_U32 },
};

static struct rtnl_link_ops vti_link_ops __read_mostly = {
	.kind		= "vti",
	.maxtype	= IFLA_VTI_MAX,
	.policy		= vti_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= vti_tunnel_setup,
	.validate	= vti_tunnel_validate,
	.newlink	= vti_newlink,
	.changelink	= vti_changelink,
	.dellink        = ip_tunnel_dellink,
	.get_size	= vti_get_size,
	.fill_info	= vti_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

/* Module init: register the pernet device, the ESP/AH/IPCOMP protocol
 * hooks, the optional IPIP tunnel handlers and the rtnl link ops.
 *
 * @msg tracks what is being registered so the single pr_err() at the end of
 * the unwind chain can name it.  Each label undoes the registrations that
 * had succeeded before the failing step.
 */
static int __init vti_init(void)
{
	const char *msg;
	int err;

	pr_info("IPv4 over IPsec tunneling driver\n");

	msg = "tunnel device";
	err = register_pernet_device(&vti_net_ops);
	if (err < 0)
		goto pernet_dev_failed;

	msg = "tunnel protocols";
	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
	if (err < 0)
		goto xfrm_proto_esp_failed;
	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
	if (err < 0)
		goto xfrm_proto_ah_failed;
	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
	if (err < 0)
		goto xfrm_proto_comp_failed;

#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
	msg = "ipip tunnel";
	err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET);
	if (err < 0)
		goto xfrm_tunnel_ipip_failed;
#if IS_ENABLED(CONFIG_IPV6)
	err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6);
	if (err < 0)
		goto xfrm_tunnel_ipip6_failed;
#endif
#endif

	msg = "netlink interface";
	err = rtnl_link_register(&vti_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	return err;

rtnl_link_failed:
#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
#if IS_ENABLED(CONFIG_IPV6)
	xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
xfrm_tunnel_ipip6_failed:
#endif
	xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
xfrm_tunnel_ipip_failed:
#endif
	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
xfrm_proto_comp_failed:
	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm_proto_ah_failed:
	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
xfrm_proto_esp_failed:
	unregister_pernet_device(&vti_net_ops);
pernet_dev_failed:
	pr_err("vti init: failed to register %s\n", msg);
	return err;
}

/* Module exit: undo vti_init() in reverse order. */
static void __exit vti_fini(void)
{
	rtnl_link_unregister(&vti_link_ops);
#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
#if IS_ENABLED(CONFIG_IPV6)
	xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
#endif
	xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
#endif
	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
	unregister_pernet_device(&vti_net_ops);
}

module_init(vti_init);
module_exit(vti_fini);
MODULE_DESCRIPTION("Virtual (secure) IP tunneling library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("vti");
MODULE_ALIAS_NETDEV("ip_vti0");
131 1 130 83 82 82 71 70 82 82 71 70 82 2 1 5 5 7 6385 6386 193 193 193 193 193 6683 4916 1904 7 16 23 86 2 21 84 90 1 5 83 14 82 15 1 69 73 71 69 72 71 70 71 2 70 69 69 70 27 70 82 3 1 5 71 9 70 2 74 11 48 2 1 69 69 5 5 5 197 1 31 2 1 1 24 28 121 3 6 5 7 2 111 5 89 1 1 109 7 101 107 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 
909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 
1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 
1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 
2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 
2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 
2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 
3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 
3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the policy database.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *          Support for enhanced MLS infrastructure.
 *          Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 *
 * Updated: Frank Mayer <mayerf@tresys.com> and
 *          Karl MacMillan <kmacmillan@tresys.com>
 *          Added conditional policy language extensions
 *          Copyright (C) 2003-2004 Tresys Technology, LLC
 *
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *          Added support for the policy capability bitmap
 *          Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
 *
 * Update: Mellanox Techonologies
 *         Added Infiniband support
 *         Copyright (C) 2016 Mellanox Techonologies
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/audit.h>
#include "security.h"

#include "policydb.h"
#include "conditional.h"
#include "mls.h"
#include "services.h"

#ifdef CONFIG_SECURITY_SELINUX_DEBUG
/*
 * Human-readable label for each of the SYM_NUM symbol tables, in table
 * order. Only built for debug configurations; symtab_hash_eval() passes
 * these to hash_eval() as the hash name.
 */
/* clang-format off */
static const char *const symtab_name[SYM_NUM] = {
	"common prefixes",
	"classes",
	"roles",
	"types",
	"users",
	"bools",
	"levels",
	"categories",
};
/* clang-format on */
#endif

/*
 * One row of the version compatibility table below.
 *
 * @version:  policy version this row applies to
 * @sym_num:  symbol-table count recorded for that version
 * @ocon_num: object-context count recorded for that version
 *
 * NOTE(review): presumably these are the counts actually present in a
 * binary policy of that version; confirm against the policy reader.
 */
struct policydb_compat_info {
	unsigned int version;
	unsigned int sym_num;
	unsigned int ocon_num;
};

/* These need to be updated if SYM_NUM or OCON_NUM changes */
static const struct policydb_compat_info policydb_compat[] = {
	{
		.version = POLICYDB_VERSION_BASE,
		.sym_num = SYM_NUM - 3,
		.ocon_num = OCON_NUM - 3,
	},
	{
		.version = POLICYDB_VERSION_BOOL,
		.sym_num = SYM_NUM - 2,
		.ocon_num = OCON_NUM - 3,
	},
	{
		.version = POLICYDB_VERSION_IPV6,
		.sym_num = SYM_NUM - 2,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_NLCLASS,
		.sym_num = SYM_NUM - 2,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_MLS,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_AVTAB,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_RANGETRANS,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_POLCAP,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_PERMISSIVE,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_BOUNDARY,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_FILENAME_TRANS,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_ROLETRANS,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_NEW_OBJECT_DEFAULTS,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_DEFAULT_TYPE,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_CONSTRAINT_NAMES,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_XPERMS_IOCTL,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM - 2,
	},
	{
		.version = POLICYDB_VERSION_INFINIBAND,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM,
	},
	{
		.version = POLICYDB_VERSION_GLBLUB,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM,
	},
	{
		.version = POLICYDB_VERSION_COMP_FTRANS,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM,
	},
	{
		.version = POLICYDB_VERSION_COND_XPERMS,
		.sym_num = SYM_NUM,
		.ocon_num = OCON_NUM,
	},
};

/*
 * Linear search of policydb_compat[] for the row whose version field
 * equals @version. Returns a pointer to that row, or NULL when no row
 * matches.
 */
static const struct policydb_compat_info *
policydb_lookup_compat(unsigned int version)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(policydb_compat); i++) {
		if (policydb_compat[i].version == version)
			return &policydb_compat[i];
	}

	return NULL;
}

/*
 * The following *_destroy functions are used to
 * free any memory allocated for each kind of
 * symbol data in the policy database.
*/ static int perm_destroy(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } static int common_destroy(void *key, void *datum, void *p) { struct common_datum *comdatum; kfree(key); if (datum) { comdatum = datum; hashtab_map(&comdatum->permissions.table, perm_destroy, NULL); hashtab_destroy(&comdatum->permissions.table); } kfree(datum); return 0; } static void constraint_expr_destroy(struct constraint_expr *expr) { if (expr) { ebitmap_destroy(&expr->names); if (expr->type_names) { ebitmap_destroy(&expr->type_names->types); ebitmap_destroy(&expr->type_names->negset); kfree(expr->type_names); } kfree(expr); } } static int cls_destroy(void *key, void *datum, void *p) { struct class_datum *cladatum; struct constraint_node *constraint, *ctemp; struct constraint_expr *e, *etmp; kfree(key); if (datum) { cladatum = datum; hashtab_map(&cladatum->permissions.table, perm_destroy, NULL); hashtab_destroy(&cladatum->permissions.table); constraint = cladatum->constraints; while (constraint) { e = constraint->expr; while (e) { etmp = e; e = e->next; constraint_expr_destroy(etmp); } ctemp = constraint; constraint = constraint->next; kfree(ctemp); } constraint = cladatum->validatetrans; while (constraint) { e = constraint->expr; while (e) { etmp = e; e = e->next; constraint_expr_destroy(etmp); } ctemp = constraint; constraint = constraint->next; kfree(ctemp); } kfree(cladatum->comkey); } kfree(datum); return 0; } static int role_destroy(void *key, void *datum, void *p) { struct role_datum *role; kfree(key); if (datum) { role = datum; ebitmap_destroy(&role->dominates); ebitmap_destroy(&role->types); } kfree(datum); return 0; } static int type_destroy(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } static int user_destroy(void *key, void *datum, void *p) { struct user_datum *usrdatum; kfree(key); if (datum) { usrdatum = datum; ebitmap_destroy(&usrdatum->roles); ebitmap_destroy(&usrdatum->range.level[0].cat); 
ebitmap_destroy(&usrdatum->range.level[1].cat); ebitmap_destroy(&usrdatum->dfltlevel.cat); } kfree(datum); return 0; } static int sens_destroy(void *key, void *datum, void *p) { struct level_datum *levdatum; kfree(key); if (datum) { levdatum = datum; ebitmap_destroy(&levdatum->level.cat); } kfree(datum); return 0; } static int cat_destroy(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } /* clang-format off */ static int (*const destroy_f[SYM_NUM])(void *key, void *datum, void *datap) = { common_destroy, cls_destroy, role_destroy, type_destroy, user_destroy, cond_destroy_bool, sens_destroy, cat_destroy, }; /* clang-format on */ static int filenametr_destroy(void *key, void *datum, void *p) { struct filename_trans_key *ft = key; struct filename_trans_datum *next, *d = datum; kfree(ft->name); kfree(key); do { ebitmap_destroy(&d->stypes); next = d->next; kfree(d); d = next; } while (unlikely(d)); cond_resched(); return 0; } static int range_tr_destroy(void *key, void *datum, void *p) { struct mls_range *rt = datum; kfree(key); ebitmap_destroy(&rt->level[0].cat); ebitmap_destroy(&rt->level[1].cat); kfree(datum); cond_resched(); return 0; } static int role_tr_destroy(void *key, void *datum, void *p) { kfree(key); kfree(datum); return 0; } static void ocontext_destroy(struct ocontext *c, unsigned int i) { if (!c) return; context_destroy(&c->context[0]); context_destroy(&c->context[1]); if (i == OCON_ISID || i == OCON_FS || i == OCON_NETIF || i == OCON_FSUSE) kfree(c->u.name); kfree(c); } /* * Initialize the role table. 
*/ static int roles_init(struct policydb *p) { char *key = NULL; int rc; struct role_datum *role; role = kzalloc(sizeof(*role), GFP_KERNEL); if (!role) return -ENOMEM; rc = -EINVAL; role->value = ++p->p_roles.nprim; if (role->value != OBJECT_R_VAL) goto out; rc = -ENOMEM; key = kstrdup(OBJECT_R, GFP_KERNEL); if (!key) goto out; rc = symtab_insert(&p->p_roles, key, role); if (rc) goto out; return 0; out: kfree(key); kfree(role); return rc; } static u32 filenametr_hash(const void *k) { const struct filename_trans_key *ft = k; unsigned long salt = ft->ttype ^ ft->tclass; return full_name_hash((void *)salt, ft->name, strlen(ft->name)); } static int filenametr_cmp(const void *k1, const void *k2) { const struct filename_trans_key *ft1 = k1; const struct filename_trans_key *ft2 = k2; int v; v = ft1->ttype - ft2->ttype; if (v) return v; v = ft1->tclass - ft2->tclass; if (v) return v; return strcmp(ft1->name, ft2->name); } static const struct hashtab_key_params filenametr_key_params = { .hash = filenametr_hash, .cmp = filenametr_cmp, }; struct filename_trans_datum * policydb_filenametr_search(struct policydb *p, struct filename_trans_key *key) { return hashtab_search(&p->filename_trans, key, filenametr_key_params); } static u32 rangetr_hash(const void *k) { const struct range_trans *key = k; return key->source_type + (key->target_type << 3) + (key->target_class << 5); } static int rangetr_cmp(const void *k1, const void *k2) { const struct range_trans *key1 = k1, *key2 = k2; int v; v = key1->source_type - key2->source_type; if (v) return v; v = key1->target_type - key2->target_type; if (v) return v; v = key1->target_class - key2->target_class; return v; } static const struct hashtab_key_params rangetr_key_params = { .hash = rangetr_hash, .cmp = rangetr_cmp, }; struct mls_range *policydb_rangetr_search(struct policydb *p, struct range_trans *key) { return hashtab_search(&p->range_tr, key, rangetr_key_params); } static u32 role_trans_hash(const void *k) { const struct 
role_trans_key *key = k; return jhash_3words(key->role, key->type, (u32)key->tclass << 16 | key->tclass, 0); } static int role_trans_cmp(const void *k1, const void *k2) { const struct role_trans_key *key1 = k1, *key2 = k2; int v; v = key1->role - key2->role; if (v) return v; v = key1->type - key2->type; if (v) return v; return key1->tclass - key2->tclass; } static const struct hashtab_key_params roletr_key_params = { .hash = role_trans_hash, .cmp = role_trans_cmp, }; struct role_trans_datum *policydb_roletr_search(struct policydb *p, struct role_trans_key *key) { return hashtab_search(&p->role_tr, key, roletr_key_params); } /* * Initialize a policy database structure. */ static void policydb_init(struct policydb *p) { memset(p, 0, sizeof(*p)); avtab_init(&p->te_avtab); cond_policydb_init(p); ebitmap_init(&p->filename_trans_ttypes); ebitmap_init(&p->policycaps); ebitmap_init(&p->permissive_map); } /* * The following *_index functions are used to * define the val_to_name and val_to_struct arrays * in a policy database structure. The val_to_name * arrays are used when converting security context * structures into string representations. The * val_to_struct arrays are used when the attributes * of a class, role, or user are needed. 
*/ static int common_index(void *key, void *datum, void *datap) { struct policydb *p; struct common_datum *comdatum; comdatum = datum; p = datap; if (!comdatum->value || comdatum->value > p->p_commons.nprim) return -EINVAL; p->sym_val_to_name[SYM_COMMONS][comdatum->value - 1] = key; return 0; } static int class_index(void *key, void *datum, void *datap) { struct policydb *p; struct class_datum *cladatum; cladatum = datum; p = datap; if (!cladatum->value || cladatum->value > p->p_classes.nprim) return -EINVAL; p->sym_val_to_name[SYM_CLASSES][cladatum->value - 1] = key; p->class_val_to_struct[cladatum->value - 1] = cladatum; return 0; } static int role_index(void *key, void *datum, void *datap) { struct policydb *p; struct role_datum *role; role = datum; p = datap; if (!role->value || role->value > p->p_roles.nprim || role->bounds > p->p_roles.nprim) return -EINVAL; p->sym_val_to_name[SYM_ROLES][role->value - 1] = key; p->role_val_to_struct[role->value - 1] = role; return 0; } static int type_index(void *key, void *datum, void *datap) { struct policydb *p; struct type_datum *typdatum; typdatum = datum; p = datap; if (typdatum->primary) { if (!typdatum->value || typdatum->value > p->p_types.nprim || typdatum->bounds > p->p_types.nprim) return -EINVAL; p->sym_val_to_name[SYM_TYPES][typdatum->value - 1] = key; p->type_val_to_struct[typdatum->value - 1] = typdatum; } return 0; } static int user_index(void *key, void *datum, void *datap) { struct policydb *p; struct user_datum *usrdatum; usrdatum = datum; p = datap; if (!usrdatum->value || usrdatum->value > p->p_users.nprim || usrdatum->bounds > p->p_users.nprim) return -EINVAL; p->sym_val_to_name[SYM_USERS][usrdatum->value - 1] = key; p->user_val_to_struct[usrdatum->value - 1] = usrdatum; return 0; } static int sens_index(void *key, void *datum, void *datap) { struct policydb *p; struct level_datum *levdatum; levdatum = datum; p = datap; if (!levdatum->isalias) { if (!levdatum->level.sens || levdatum->level.sens > 
p->p_levels.nprim) return -EINVAL; p->sym_val_to_name[SYM_LEVELS][levdatum->level.sens - 1] = key; } return 0; } static int cat_index(void *key, void *datum, void *datap) { struct policydb *p; struct cat_datum *catdatum; catdatum = datum; p = datap; if (!catdatum->isalias) { if (!catdatum->value || catdatum->value > p->p_cats.nprim) return -EINVAL; p->sym_val_to_name[SYM_CATS][catdatum->value - 1] = key; } return 0; } /* clang-format off */ static int (*const index_f[SYM_NUM])(void *key, void *datum, void *datap) = { common_index, class_index, role_index, type_index, user_index, cond_index_bool, sens_index, cat_index, }; /* clang-format on */ #ifdef CONFIG_SECURITY_SELINUX_DEBUG static void hash_eval(struct hashtab *h, const char *hash_name, const char *hash_details) { struct hashtab_info info; hashtab_stat(h, &info); pr_debug( "SELinux: %s%s%s: %d entries and %d/%d buckets used, longest chain length %d, sum of chain length^2 %llu\n", hash_name, hash_details ? "@" : "", hash_details ?: "", h->nel, info.slots_used, h->size, info.max_chain_len, info.chain2_len_sum); } static void symtab_hash_eval(struct symtab *s) { int i; for (i = 0; i < SYM_NUM; i++) hash_eval(&s[i].table, symtab_name[i], NULL); } #else static inline void hash_eval(struct hashtab *h, const char *hash_name, const char *hash_details) { } static inline void symtab_hash_eval(struct symtab *s) { } #endif /* CONFIG_SECURITY_SELINUX_DEBUG */ /* * Define the other val_to_name and val_to_struct arrays * in a policy database structure. * * Caller must clean up on failure. 
*/
static int policydb_index(struct policydb *p)
{
	int sym, err;

	/* Summary counters; the MLS variant also reports sens and cats. */
	if (!p->mls_enabled)
		pr_debug("SELinux: %d users, %d roles, %d types, %d bools\n",
			 p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
			 p->p_bools.nprim);
	else
		pr_debug(
			"SELinux: %d users, %d roles, %d types, %d bools, %d sens, %d cats\n",
			p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
			p->p_bools.nprim, p->p_levels.nprim, p->p_cats.nprim);

	pr_debug("SELinux: %d classes, %d rules\n", p->p_classes.nprim,
		 p->te_avtab.nel);

	avtab_hash_eval(&p->te_avtab, "rules");
	symtab_hash_eval(p->symtab);

	/*
	 * Nothing is released here on failure: arrays allocated so far stay
	 * attached to @p for the caller to clean up.
	 */
	p->class_val_to_struct = kcalloc(p->p_classes.nprim,
					 sizeof(*p->class_val_to_struct),
					 GFP_KERNEL);
	if (!p->class_val_to_struct)
		return -ENOMEM;

	p->role_val_to_struct = kcalloc(p->p_roles.nprim,
					sizeof(*p->role_val_to_struct),
					GFP_KERNEL);
	if (!p->role_val_to_struct)
		return -ENOMEM;

	p->user_val_to_struct = kcalloc(p->p_users.nprim,
					sizeof(*p->user_val_to_struct),
					GFP_KERNEL);
	if (!p->user_val_to_struct)
		return -ENOMEM;

	p->type_val_to_struct = kvcalloc(p->p_types.nprim,
					 sizeof(*p->type_val_to_struct),
					 GFP_KERNEL);
	if (!p->type_val_to_struct)
		return -ENOMEM;

	err = cond_init_bool_indexes(p);
	if (err)
		return err;

	/* One name array per symbol table, filled by the matching indexer. */
	for (sym = 0; sym < SYM_NUM; sym++) {
		p->sym_val_to_name[sym] = kvcalloc(p->symtab[sym].nprim,
						   sizeof(char *), GFP_KERNEL);
		if (!p->sym_val_to_name[sym])
			return -ENOMEM;

		err = hashtab_map(&p->symtab[sym].table, index_f[sym], p);
		if (err)
			return err;
	}

	return 0;
}

/*
 * Free any memory allocated by a policy database structure.
*/
void policydb_destroy(struct policydb *p)
{
	struct ocontext *oc, *oc_next;
	struct genfs *fs, *fs_next;
	struct role_allow *allow, *allow_next;
	u32 i;

	/* Symbol tables: run the per-table destructor, then drop the table. */
	for (i = 0; i < SYM_NUM; i++) {
		cond_resched();
		hashtab_map(&p->symtab[i].table, destroy_f[i], NULL);
		hashtab_destroy(&p->symtab[i].table);
	}

	for (i = 0; i < SYM_NUM; i++)
		kvfree(p->sym_val_to_name[i]);

	kfree(p->class_val_to_struct);
	kfree(p->role_val_to_struct);
	kfree(p->user_val_to_struct);
	kvfree(p->type_val_to_struct);

	avtab_destroy(&p->te_avtab);

	/* Object context lists, one list per kind. */
	for (i = 0; i < OCON_NUM; i++) {
		cond_resched();
		for (oc = p->ocontexts[i]; oc; oc = oc_next) {
			oc_next = oc->next;
			ocontext_destroy(oc, i);
		}
		p->ocontexts[i] = NULL;
	}

	/* genfs entries: their contexts are destroyed as OCON_FSUSE ones. */
	for (fs = p->genfs; fs; fs = fs_next) {
		cond_resched();
		kfree(fs->fstype);
		for (oc = fs->head; oc; oc = oc_next) {
			oc_next = oc->next;
			ocontext_destroy(oc, OCON_FSUSE);
		}
		fs_next = fs->next;
		kfree(fs);
	}
	p->genfs = NULL;

	cond_policydb_destroy(p);

	hashtab_map(&p->role_tr, role_tr_destroy, NULL);
	hashtab_destroy(&p->role_tr);

	for (allow = p->role_allow; allow; allow = allow_next) {
		cond_resched();
		allow_next = allow->next;
		kfree(allow);
	}

	hashtab_map(&p->filename_trans, filenametr_destroy, NULL);
	hashtab_destroy(&p->filename_trans);

	hashtab_map(&p->range_tr, range_tr_destroy, NULL);
	hashtab_destroy(&p->range_tr);

	/* One bitmap per type, present only if the array was allocated. */
	if (p->type_attr_map_array) {
		for (i = 0; i < p->p_types.nprim; i++)
			ebitmap_destroy(&p->type_attr_map_array[i]);
		kvfree(p->type_attr_map_array);
	}

	ebitmap_destroy(&p->filename_trans_ttypes);
	ebitmap_destroy(&p->policycaps);
	ebitmap_destroy(&p->permissive_map);
}

/*
 * Load the initial SIDs specified in a policy database
 * structure into a SID table.
*/ int policydb_load_isids(struct policydb *p, struct sidtab *s) { struct ocontext *head, *c; bool isid_init; int rc; rc = sidtab_init(s); if (rc) { pr_err("SELinux: out of memory on SID table init\n"); return rc; } isid_init = ebitmap_get_bit(&p->policycaps, POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT); head = p->ocontexts[OCON_ISID]; for (c = head; c; c = c->next) { u32 sid = c->sid[0]; const char *name = security_get_initial_sid_context(sid); if (sid == SECSID_NULL) { pr_err("SELinux: SID 0 was assigned a context.\n"); sidtab_destroy(s); return -EINVAL; } /* Ignore initial SIDs unused by this kernel. */ if (!name) continue; /* * Also ignore SECINITSID_INIT if the policy doesn't declare * support for it */ if (sid == SECINITSID_INIT && !isid_init) continue; rc = sidtab_set_initial(s, sid, &c->context[0]); if (rc) { pr_err("SELinux: unable to load initial SID %s.\n", name); sidtab_destroy(s); return rc; } /* * If the policy doesn't support the "userspace_initial_context" * capability, set SECINITSID_INIT to the same context as * SECINITSID_KERNEL. This ensures the same behavior as before * the reintroduction of SECINITSID_INIT, where all tasks * started before policy load would initially get the context * corresponding to SECINITSID_KERNEL. */ if (sid == SECINITSID_KERNEL && !isid_init) { rc = sidtab_set_initial(s, SECINITSID_INIT, &c->context[0]); if (rc) { pr_err("SELinux: unable to load initial SID %s.\n", name); sidtab_destroy(s); return rc; } } } return 0; } int policydb_class_isvalid(struct policydb *p, unsigned int class) { if (!class || class > p->p_classes.nprim) return 0; return 1; } int policydb_role_isvalid(struct policydb *p, unsigned int role) { if (!role || role > p->p_roles.nprim) return 0; return 1; } int policydb_type_isvalid(struct policydb *p, unsigned int type) { if (!type || type > p->p_types.nprim) return 0; return 1; } /* * Return 1 if the fields in the security context * structure `c' are valid. Return 0 otherwise. 
*/ int policydb_context_isvalid(struct policydb *p, struct context *c) { struct role_datum *role; struct user_datum *usrdatum; if (!c->role || c->role > p->p_roles.nprim) return 0; if (!c->user || c->user > p->p_users.nprim) return 0; if (!c->type || c->type > p->p_types.nprim) return 0; if (c->role != OBJECT_R_VAL) { /* * Role must be authorized for the type. */ role = p->role_val_to_struct[c->role - 1]; if (!role || !ebitmap_get_bit(&role->types, c->type - 1)) /* role may not be associated with type */ return 0; /* * User must be authorized for the role. */ usrdatum = p->user_val_to_struct[c->user - 1]; if (!usrdatum) return 0; if (!ebitmap_get_bit(&usrdatum->roles, c->role - 1)) /* user may not be associated with role */ return 0; } if (!mls_context_isvalid(p, c)) return 0; return 1; } /* * Read a MLS range structure from a policydb binary * representation file. */ static int mls_read_range_helper(struct mls_range *r, struct policy_file *fp) { __le32 buf[2]; u32 items; int rc; rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; rc = -EINVAL; items = le32_to_cpu(buf[0]); if (items > ARRAY_SIZE(buf)) { pr_err("SELinux: mls: range overflow\n"); goto out; } rc = next_entry(buf, fp, sizeof(u32) * items); if (rc) { pr_err("SELinux: mls: truncated range\n"); goto out; } r->level[0].sens = le32_to_cpu(buf[0]); if (items > 1) r->level[1].sens = le32_to_cpu(buf[1]); else r->level[1].sens = r->level[0].sens; rc = ebitmap_read(&r->level[0].cat, fp); if (rc) { pr_err("SELinux: mls: error reading low categories\n"); goto out; } if (items > 1) { rc = ebitmap_read(&r->level[1].cat, fp); if (rc) { pr_err("SELinux: mls: error reading high categories\n"); goto bad_high; } } else { rc = ebitmap_cpy(&r->level[1].cat, &r->level[0].cat); if (rc) { pr_err("SELinux: mls: out of memory\n"); goto bad_high; } } return 0; bad_high: ebitmap_destroy(&r->level[0].cat); out: return rc; } /* * Read and validate a security context structure * from a policydb binary representation file. 
*/ static int context_read_and_validate(struct context *c, struct policydb *p, struct policy_file *fp) { __le32 buf[3]; int rc; rc = next_entry(buf, fp, sizeof buf); if (rc) { pr_err("SELinux: context truncated\n"); goto out; } c->user = le32_to_cpu(buf[0]); c->role = le32_to_cpu(buf[1]); c->type = le32_to_cpu(buf[2]); if (p->policyvers >= POLICYDB_VERSION_MLS) { rc = mls_read_range_helper(&c->range, fp); if (rc) { pr_err("SELinux: error reading MLS range of context\n"); goto out; } } rc = -EINVAL; if (!policydb_context_isvalid(p, c)) { pr_err("SELinux: invalid security context\n"); context_destroy(c); goto out; } rc = 0; out: return rc; } /* * The following *_read functions are used to * read the symbol data from a policy database * binary representation file. */ int str_read(char **strp, gfp_t flags, struct policy_file *fp, u32 len) { int rc; char *str; if ((len == 0) || (len == (u32)-1)) return -EINVAL; str = kmalloc(len + 1, flags | __GFP_NOWARN); if (!str) return -ENOMEM; rc = next_entry(str, fp, len); if (rc) { kfree(str); return rc; } str[len] = '\0'; *strp = str; return 0; } static int perm_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct perm_datum *perdatum; int rc; __le32 buf[2]; u32 len; perdatum = kzalloc(sizeof(*perdatum), GFP_KERNEL); if (!perdatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof buf); if (rc) goto bad; len = le32_to_cpu(buf[0]); perdatum->value = le32_to_cpu(buf[1]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = symtab_insert(s, key, perdatum); if (rc) goto bad; return 0; bad: perm_destroy(key, perdatum, NULL); return rc; } static int common_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct common_datum *comdatum; __le32 buf[4]; u32 i, len, nel; int rc; comdatum = kzalloc(sizeof(*comdatum), GFP_KERNEL); if (!comdatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof buf); if (rc) goto bad; len = le32_to_cpu(buf[0]); 
comdatum->value = le32_to_cpu(buf[1]); nel = le32_to_cpu(buf[3]); rc = symtab_init(&comdatum->permissions, nel); if (rc) goto bad; comdatum->permissions.nprim = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; for (i = 0; i < nel; i++) { rc = perm_read(p, &comdatum->permissions, fp); if (rc) goto bad; } hash_eval(&comdatum->permissions.table, "common_permissions", key); rc = symtab_insert(s, key, comdatum); if (rc) goto bad; return 0; bad: common_destroy(key, comdatum, NULL); return rc; } static void type_set_init(struct type_set *t) { ebitmap_init(&t->types); ebitmap_init(&t->negset); } static int type_set_read(struct type_set *t, struct policy_file *fp) { __le32 buf[1]; int rc; if (ebitmap_read(&t->types, fp)) return -EINVAL; if (ebitmap_read(&t->negset, fp)) return -EINVAL; rc = next_entry(buf, fp, sizeof(u32)); if (rc < 0) return -EINVAL; t->flags = le32_to_cpu(buf[0]); return 0; } static int read_cons_helper(struct policydb *p, struct constraint_node **nodep, u32 ncons, int allowxtarget, struct policy_file *fp) { struct constraint_node *c, *lc; struct constraint_expr *e, *le; __le32 buf[3]; u32 i, j, nexpr; int rc, depth; lc = NULL; for (i = 0; i < ncons; i++) { c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return -ENOMEM; if (lc) lc->next = c; else *nodep = c; rc = next_entry(buf, fp, (sizeof(u32) * 2)); if (rc) return rc; c->permissions = le32_to_cpu(buf[0]); nexpr = le32_to_cpu(buf[1]); le = NULL; depth = -1; for (j = 0; j < nexpr; j++) { e = kzalloc(sizeof(*e), GFP_KERNEL); if (!e) return -ENOMEM; if (le) le->next = e; else c->expr = e; rc = next_entry(buf, fp, (sizeof(u32) * 3)); if (rc) return rc; e->expr_type = le32_to_cpu(buf[0]); e->attr = le32_to_cpu(buf[1]); e->op = le32_to_cpu(buf[2]); switch (e->expr_type) { case CEXPR_NOT: if (depth < 0) return -EINVAL; break; case CEXPR_AND: case CEXPR_OR: if (depth < 1) return -EINVAL; depth--; break; case CEXPR_ATTR: if (depth == (CEXPR_MAXDEPTH - 1)) return -EINVAL; depth++; 
break; case CEXPR_NAMES: if (!allowxtarget && (e->attr & CEXPR_XTARGET)) return -EINVAL; if (depth == (CEXPR_MAXDEPTH - 1)) return -EINVAL; depth++; rc = ebitmap_read(&e->names, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_CONSTRAINT_NAMES) { e->type_names = kzalloc(sizeof(*e->type_names), GFP_KERNEL); if (!e->type_names) return -ENOMEM; type_set_init(e->type_names); rc = type_set_read(e->type_names, fp); if (rc) return rc; } break; default: return -EINVAL; } le = e; } if (depth != 0) return -EINVAL; lc = c; } return 0; } static int class_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct class_datum *cladatum; __le32 buf[6]; u32 i, len, len2, ncons, nel; int rc; cladatum = kzalloc(sizeof(*cladatum), GFP_KERNEL); if (!cladatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof(u32) * 6); if (rc) goto bad; len = le32_to_cpu(buf[0]); len2 = le32_to_cpu(buf[1]); cladatum->value = le32_to_cpu(buf[2]); nel = le32_to_cpu(buf[4]); rc = symtab_init(&cladatum->permissions, nel); if (rc) goto bad; cladatum->permissions.nprim = le32_to_cpu(buf[3]); ncons = le32_to_cpu(buf[5]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; if (len2) { rc = str_read(&cladatum->comkey, GFP_KERNEL, fp, len2); if (rc) goto bad; rc = -EINVAL; cladatum->comdatum = symtab_search(&p->p_commons, cladatum->comkey); if (!cladatum->comdatum) { pr_err("SELinux: unknown common %s\n", cladatum->comkey); goto bad; } } for (i = 0; i < nel; i++) { rc = perm_read(p, &cladatum->permissions, fp); if (rc) goto bad; } hash_eval(&cladatum->permissions.table, "class_permissions", key); rc = read_cons_helper(p, &cladatum->constraints, ncons, 0, fp); if (rc) goto bad; if (p->policyvers >= POLICYDB_VERSION_VALIDATETRANS) { /* grab the validatetrans rules */ rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto bad; ncons = le32_to_cpu(buf[0]); rc = read_cons_helper(p, &cladatum->validatetrans, ncons, 1, fp); if (rc) goto bad; } if (p->policyvers >= 
POLICYDB_VERSION_NEW_OBJECT_DEFAULTS) { rc = next_entry(buf, fp, sizeof(u32) * 3); if (rc) goto bad; cladatum->default_user = le32_to_cpu(buf[0]); cladatum->default_role = le32_to_cpu(buf[1]); cladatum->default_range = le32_to_cpu(buf[2]); } if (p->policyvers >= POLICYDB_VERSION_DEFAULT_TYPE) { rc = next_entry(buf, fp, sizeof(u32) * 1); if (rc) goto bad; cladatum->default_type = le32_to_cpu(buf[0]); } rc = symtab_insert(s, key, cladatum); if (rc) goto bad; return 0; bad: cls_destroy(key, cladatum, NULL); return rc; } static int role_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct role_datum *role; int rc; unsigned int to_read = 2; __le32 buf[3]; u32 len; role = kzalloc(sizeof(*role), GFP_KERNEL); if (!role) return -ENOMEM; if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) to_read = 3; rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); if (rc) goto bad; len = le32_to_cpu(buf[0]); role->value = le32_to_cpu(buf[1]); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) role->bounds = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = ebitmap_read(&role->dominates, fp); if (rc) goto bad; rc = ebitmap_read(&role->types, fp); if (rc) goto bad; if (strcmp(key, OBJECT_R) == 0) { rc = -EINVAL; if (role->value != OBJECT_R_VAL) { pr_err("SELinux: Role %s has wrong value %d\n", OBJECT_R, role->value); goto bad; } rc = 0; goto bad; } rc = symtab_insert(s, key, role); if (rc) goto bad; return 0; bad: role_destroy(key, role, NULL); return rc; } static int type_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct type_datum *typdatum; int rc; unsigned int to_read = 3; __le32 buf[4]; u32 len; typdatum = kzalloc(sizeof(*typdatum), GFP_KERNEL); if (!typdatum) return -ENOMEM; if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) to_read = 4; rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); if (rc) goto bad; len = le32_to_cpu(buf[0]); typdatum->value = le32_to_cpu(buf[1]); if 
(p->policyvers >= POLICYDB_VERSION_BOUNDARY) { u32 prop = le32_to_cpu(buf[2]); if (prop & TYPEDATUM_PROPERTY_PRIMARY) typdatum->primary = 1; if (prop & TYPEDATUM_PROPERTY_ATTRIBUTE) typdatum->attribute = 1; typdatum->bounds = le32_to_cpu(buf[3]); } else { typdatum->primary = le32_to_cpu(buf[2]); } rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = symtab_insert(s, key, typdatum); if (rc) goto bad; return 0; bad: type_destroy(key, typdatum, NULL); return rc; } /* * Read a MLS level structure from a policydb binary * representation file. */ static int mls_read_level(struct mls_level *lp, struct policy_file *fp) { __le32 buf[1]; int rc; memset(lp, 0, sizeof(*lp)); rc = next_entry(buf, fp, sizeof buf); if (rc) { pr_err("SELinux: mls: truncated level\n"); return rc; } lp->sens = le32_to_cpu(buf[0]); rc = ebitmap_read(&lp->cat, fp); if (rc) { pr_err("SELinux: mls: error reading level categories\n"); return rc; } return 0; } static int user_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct user_datum *usrdatum; int rc; unsigned int to_read = 2; __le32 buf[3]; u32 len; usrdatum = kzalloc(sizeof(*usrdatum), GFP_KERNEL); if (!usrdatum) return -ENOMEM; if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) to_read = 3; rc = next_entry(buf, fp, sizeof(buf[0]) * to_read); if (rc) goto bad; len = le32_to_cpu(buf[0]); usrdatum->value = le32_to_cpu(buf[1]); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) usrdatum->bounds = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = ebitmap_read(&usrdatum->roles, fp); if (rc) goto bad; if (p->policyvers >= POLICYDB_VERSION_MLS) { rc = mls_read_range_helper(&usrdatum->range, fp); if (rc) goto bad; rc = mls_read_level(&usrdatum->dfltlevel, fp); if (rc) goto bad; } rc = symtab_insert(s, key, usrdatum); if (rc) goto bad; return 0; bad: user_destroy(key, usrdatum, NULL); return rc; } static int sens_read(struct policydb *p, struct symtab *s, struct policy_file 
*fp) { char *key = NULL; struct level_datum *levdatum; int rc; __le32 buf[2]; u32 len; levdatum = kzalloc(sizeof(*levdatum), GFP_KERNEL); if (!levdatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof buf); if (rc) goto bad; len = le32_to_cpu(buf[0]); levdatum->isalias = le32_to_cpu(buf[1]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = mls_read_level(&levdatum->level, fp); if (rc) goto bad; rc = symtab_insert(s, key, levdatum); if (rc) goto bad; return 0; bad: sens_destroy(key, levdatum, NULL); return rc; } static int cat_read(struct policydb *p, struct symtab *s, struct policy_file *fp) { char *key = NULL; struct cat_datum *catdatum; int rc; __le32 buf[3]; u32 len; catdatum = kzalloc(sizeof(*catdatum), GFP_KERNEL); if (!catdatum) return -ENOMEM; rc = next_entry(buf, fp, sizeof buf); if (rc) goto bad; len = le32_to_cpu(buf[0]); catdatum->value = le32_to_cpu(buf[1]); catdatum->isalias = le32_to_cpu(buf[2]); rc = str_read(&key, GFP_KERNEL, fp, len); if (rc) goto bad; rc = symtab_insert(s, key, catdatum); if (rc) goto bad; return 0; bad: cat_destroy(key, catdatum, NULL); return rc; } /* clang-format off */ static int (*const read_f[SYM_NUM])(struct policydb *p, struct symtab *s, struct policy_file *fp) = { common_read, class_read, role_read, type_read, user_read, cond_read_bool, sens_read, cat_read, }; /* clang-format on */ static int user_bounds_sanity_check(void *key, void *datum, void *datap) { struct user_datum *upper, *user; struct policydb *p = datap; int depth = 0; upper = user = datum; while (upper->bounds) { struct ebitmap_node *node; u32 bit; if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { pr_err("SELinux: user %s: " "too deep or looped boundary\n", (char *)key); return -EINVAL; } upper = p->user_val_to_struct[upper->bounds - 1]; ebitmap_for_each_positive_bit(&user->roles, node, bit) { if (ebitmap_get_bit(&upper->roles, bit)) continue; pr_err("SELinux: boundary violated policy: " "user=%s role=%s bounds=%s\n", sym_name(p, SYM_USERS, 
user->value - 1), sym_name(p, SYM_ROLES, bit), sym_name(p, SYM_USERS, upper->value - 1)); return -EINVAL; } } return 0; } static int role_bounds_sanity_check(void *key, void *datum, void *datap) { struct role_datum *upper, *role; struct policydb *p = datap; int depth = 0; upper = role = datum; while (upper->bounds) { struct ebitmap_node *node; u32 bit; if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { pr_err("SELinux: role %s: " "too deep or looped bounds\n", (char *)key); return -EINVAL; } upper = p->role_val_to_struct[upper->bounds - 1]; ebitmap_for_each_positive_bit(&role->types, node, bit) { if (ebitmap_get_bit(&upper->types, bit)) continue; pr_err("SELinux: boundary violated policy: " "role=%s type=%s bounds=%s\n", sym_name(p, SYM_ROLES, role->value - 1), sym_name(p, SYM_TYPES, bit), sym_name(p, SYM_ROLES, upper->value - 1)); return -EINVAL; } } return 0; } static int type_bounds_sanity_check(void *key, void *datum, void *datap) { struct type_datum *upper; struct policydb *p = datap; int depth = 0; upper = datum; while (upper->bounds) { if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { pr_err("SELinux: type %s: " "too deep or looped boundary\n", (char *)key); return -EINVAL; } upper = p->type_val_to_struct[upper->bounds - 1]; BUG_ON(!upper); if (upper->attribute) { pr_err("SELinux: type %s: " "bounded by attribute %s\n", (char *)key, sym_name(p, SYM_TYPES, upper->value - 1)); return -EINVAL; } } return 0; } static int policydb_bounds_sanity_check(struct policydb *p) { int rc; if (p->policyvers < POLICYDB_VERSION_BOUNDARY) return 0; rc = hashtab_map(&p->p_users.table, user_bounds_sanity_check, p); if (rc) return rc; rc = hashtab_map(&p->p_roles.table, role_bounds_sanity_check, p); if (rc) return rc; rc = hashtab_map(&p->p_types.table, type_bounds_sanity_check, p); if (rc) return rc; return 0; } u16 string_to_security_class(struct policydb *p, const char *name) { struct class_datum *cladatum; cladatum = symtab_search(&p->p_classes, name); if (!cladatum) return 0; return 
cladatum->value; } u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name) { struct class_datum *cladatum; struct perm_datum *perdatum = NULL; struct common_datum *comdatum; if (!tclass || tclass > p->p_classes.nprim) return 0; cladatum = p->class_val_to_struct[tclass - 1]; comdatum = cladatum->comdatum; if (comdatum) perdatum = symtab_search(&comdatum->permissions, name); if (!perdatum) perdatum = symtab_search(&cladatum->permissions, name); if (!perdatum) return 0; return 1U << (perdatum->value - 1); } static int range_read(struct policydb *p, struct policy_file *fp) { struct range_trans *rt = NULL; struct mls_range *r = NULL; int rc; __le32 buf[2]; u32 i, nel; if (p->policyvers < POLICYDB_VERSION_MLS) return 0; rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; nel = le32_to_cpu(buf[0]); rc = hashtab_init(&p->range_tr, nel); if (rc) return rc; for (i = 0; i < nel; i++) { rc = -ENOMEM; rt = kzalloc(sizeof(*rt), GFP_KERNEL); if (!rt) goto out; rc = next_entry(buf, fp, (sizeof(u32) * 2)); if (rc) goto out; rt->source_type = le32_to_cpu(buf[0]); rt->target_type = le32_to_cpu(buf[1]); if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; rt->target_class = le32_to_cpu(buf[0]); } else rt->target_class = p->process_class; rc = -EINVAL; if (!policydb_type_isvalid(p, rt->source_type) || !policydb_type_isvalid(p, rt->target_type) || !policydb_class_isvalid(p, rt->target_class)) goto out; rc = -ENOMEM; r = kzalloc(sizeof(*r), GFP_KERNEL); if (!r) goto out; rc = mls_read_range_helper(r, fp); if (rc) goto out; rc = -EINVAL; if (!mls_range_isvalid(p, r)) { pr_warn("SELinux: rangetrans: invalid range\n"); goto out; } rc = hashtab_insert(&p->range_tr, rt, r, rangetr_key_params); if (rc) goto out; rt = NULL; r = NULL; } hash_eval(&p->range_tr, "rangetr", NULL); rc = 0; out: kfree(rt); kfree(r); return rc; } static int filename_trans_read_helper_compat(struct policydb *p, struct policy_file *fp) { 
struct filename_trans_key key, *ft = NULL; struct filename_trans_datum *last, *datum = NULL; char *name = NULL; u32 len, stype, otype; __le32 buf[4]; int rc; /* length of the path component string */ rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; len = le32_to_cpu(buf[0]); /* path component string */ rc = str_read(&name, GFP_KERNEL, fp, len); if (rc) return rc; rc = next_entry(buf, fp, sizeof(u32) * 4); if (rc) goto out; stype = le32_to_cpu(buf[0]); key.ttype = le32_to_cpu(buf[1]); key.tclass = le32_to_cpu(buf[2]); key.name = name; otype = le32_to_cpu(buf[3]); last = NULL; datum = policydb_filenametr_search(p, &key); while (datum) { if (unlikely(ebitmap_get_bit(&datum->stypes, stype - 1))) { /* conflicting/duplicate rules are ignored */ datum = NULL; rc = 0; goto out; } if (likely(datum->otype == otype)) break; last = datum; datum = datum->next; } if (!datum) { rc = -ENOMEM; datum = kmalloc(sizeof(*datum), GFP_KERNEL); if (!datum) goto out; ebitmap_init(&datum->stypes); datum->otype = otype; datum->next = NULL; if (unlikely(last)) { last->next = datum; } else { rc = -ENOMEM; ft = kmemdup(&key, sizeof(key), GFP_KERNEL); if (!ft) goto out; rc = hashtab_insert(&p->filename_trans, ft, datum, filenametr_key_params); if (rc) goto out; name = NULL; rc = ebitmap_set_bit(&p->filename_trans_ttypes, key.ttype, 1); if (rc) return rc; } } kfree(name); return ebitmap_set_bit(&datum->stypes, stype - 1, 1); out: kfree(ft); kfree(name); kfree(datum); return rc; } static int filename_trans_read_helper(struct policydb *p, struct policy_file *fp) { struct filename_trans_key *ft = NULL; struct filename_trans_datum **dst, *datum, *first = NULL; char *name = NULL; u32 len, ttype, tclass, ndatum, i; __le32 buf[3]; int rc; /* length of the path component string */ rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; len = le32_to_cpu(buf[0]); /* path component string */ rc = str_read(&name, GFP_KERNEL, fp, len); if (rc) return rc; rc = next_entry(buf, fp, sizeof(u32) * 3); 
if (rc) goto out; ttype = le32_to_cpu(buf[0]); tclass = le32_to_cpu(buf[1]); ndatum = le32_to_cpu(buf[2]); if (ndatum == 0) { pr_err("SELinux: Filename transition key with no datum\n"); rc = -ENOENT; goto out; } dst = &first; for (i = 0; i < ndatum; i++) { rc = -ENOMEM; datum = kmalloc(sizeof(*datum), GFP_KERNEL); if (!datum) goto out; datum->next = NULL; *dst = datum; /* ebitmap_read() will at least init the bitmap */ rc = ebitmap_read(&datum->stypes, fp); if (rc) goto out; rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; datum->otype = le32_to_cpu(buf[0]); dst = &datum->next; } rc = -ENOMEM; ft = kmalloc(sizeof(*ft), GFP_KERNEL); if (!ft) goto out; ft->ttype = ttype; ft->tclass = tclass; ft->name = name; rc = hashtab_insert(&p->filename_trans, ft, first, filenametr_key_params); if (rc == -EEXIST) pr_err("SELinux: Duplicate filename transition key\n"); if (rc) goto out; return ebitmap_set_bit(&p->filename_trans_ttypes, ttype, 1); out: kfree(ft); kfree(name); while (first) { datum = first; first = first->next; ebitmap_destroy(&datum->stypes); kfree(datum); } return rc; } static int filename_trans_read(struct policydb *p, struct policy_file *fp) { u32 nel, i; __le32 buf[1]; int rc; if (p->policyvers < POLICYDB_VERSION_FILENAME_TRANS) return 0; rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; nel = le32_to_cpu(buf[0]); if (p->policyvers < POLICYDB_VERSION_COMP_FTRANS) { p->compat_filename_trans_count = nel; rc = hashtab_init(&p->filename_trans, (1 << 11)); if (rc) return rc; for (i = 0; i < nel; i++) { rc = filename_trans_read_helper_compat(p, fp); if (rc) return rc; } } else { rc = hashtab_init(&p->filename_trans, nel); if (rc) return rc; for (i = 0; i < nel; i++) { rc = filename_trans_read_helper(p, fp); if (rc) return rc; } } hash_eval(&p->filename_trans, "filenametr", NULL); return 0; } static int genfs_read(struct policydb *p, struct policy_file *fp) { int rc; u32 i, j, nel, nel2, len, len2; __le32 buf[1]; struct ocontext *l, *c; struct ocontext 
*newc = NULL; struct genfs *genfs_p, *genfs; struct genfs *newgenfs = NULL; rc = next_entry(buf, fp, sizeof(u32)); if (rc) return rc; nel = le32_to_cpu(buf[0]); for (i = 0; i < nel; i++) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; len = le32_to_cpu(buf[0]); rc = -ENOMEM; newgenfs = kzalloc(sizeof(*newgenfs), GFP_KERNEL); if (!newgenfs) goto out; rc = str_read(&newgenfs->fstype, GFP_KERNEL, fp, len); if (rc) goto out; for (genfs_p = NULL, genfs = p->genfs; genfs; genfs_p = genfs, genfs = genfs->next) { rc = -EINVAL; if (strcmp(newgenfs->fstype, genfs->fstype) == 0) { pr_err("SELinux: dup genfs fstype %s\n", newgenfs->fstype); goto out; } if (strcmp(newgenfs->fstype, genfs->fstype) < 0) break; } newgenfs->next = genfs; if (genfs_p) genfs_p->next = newgenfs; else p->genfs = newgenfs; genfs = newgenfs; newgenfs = NULL; rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; nel2 = le32_to_cpu(buf[0]); for (j = 0; j < nel2; j++) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; len = le32_to_cpu(buf[0]); rc = -ENOMEM; newc = kzalloc(sizeof(*newc), GFP_KERNEL); if (!newc) goto out; rc = str_read(&newc->u.name, GFP_KERNEL, fp, len); if (rc) goto out; rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; newc->v.sclass = le32_to_cpu(buf[0]); rc = context_read_and_validate(&newc->context[0], p, fp); if (rc) goto out; for (l = NULL, c = genfs->head; c; l = c, c = c->next) { rc = -EINVAL; if (!strcmp(newc->u.name, c->u.name) && (!c->v.sclass || !newc->v.sclass || newc->v.sclass == c->v.sclass)) { pr_err("SELinux: dup genfs entry (%s,%s)\n", genfs->fstype, c->u.name); goto out; } len = strlen(newc->u.name); len2 = strlen(c->u.name); if (len > len2) break; } newc->next = c; if (l) l->next = newc; else genfs->head = newc; newc = NULL; } } rc = 0; out: if (newgenfs) { kfree(newgenfs->fstype); kfree(newgenfs); } ocontext_destroy(newc, OCON_FSUSE); return rc; } static int ocontext_read(struct policydb *p, const struct policydb_compat_info *info, struct 
policy_file *fp) { int rc; unsigned int i; u32 j, nel, len; __be64 prefixbuf[1]; __le32 buf[3]; struct ocontext *l, *c; u32 nodebuf[8]; for (i = 0; i < info->ocon_num; i++) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; nel = le32_to_cpu(buf[0]); l = NULL; for (j = 0; j < nel; j++) { rc = -ENOMEM; c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) goto out; if (l) l->next = c; else p->ocontexts[i] = c; l = c; switch (i) { case OCON_ISID: rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; c->sid[0] = le32_to_cpu(buf[0]); rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; case OCON_FS: case OCON_NETIF: rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto out; len = le32_to_cpu(buf[0]); rc = str_read(&c->u.name, GFP_KERNEL, fp, len); if (rc) goto out; if (i == OCON_FS) pr_warn("SELinux: void and deprecated fs ocon %s\n", c->u.name); rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; rc = context_read_and_validate(&c->context[1], p, fp); if (rc) goto out; break; case OCON_PORT: rc = next_entry(buf, fp, sizeof(u32) * 3); if (rc) goto out; c->u.port.protocol = le32_to_cpu(buf[0]); c->u.port.low_port = le32_to_cpu(buf[1]); c->u.port.high_port = le32_to_cpu(buf[2]); rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; case OCON_NODE: rc = next_entry(nodebuf, fp, sizeof(u32) * 2); if (rc) goto out; c->u.node.addr = nodebuf[0]; /* network order */ c->u.node.mask = nodebuf[1]; /* network order */ rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; case OCON_FSUSE: rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto out; rc = -EINVAL; c->v.behavior = le32_to_cpu(buf[0]); /* Determined at runtime, not in policy DB. 
*/ if (c->v.behavior == SECURITY_FS_USE_MNTPOINT) goto out; if (c->v.behavior > SECURITY_FS_USE_MAX) goto out; len = le32_to_cpu(buf[1]); rc = str_read(&c->u.name, GFP_KERNEL, fp, len); if (rc) goto out; rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; case OCON_NODE6: { int k; rc = next_entry(nodebuf, fp, sizeof(u32) * 8); if (rc) goto out; for (k = 0; k < 4; k++) c->u.node6.addr[k] = nodebuf[k]; for (k = 0; k < 4; k++) c->u.node6.mask[k] = nodebuf[k + 4]; rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; } case OCON_IBPKEY: { u32 pkey_lo, pkey_hi; rc = next_entry(prefixbuf, fp, sizeof(u64)); if (rc) goto out; /* we need to have subnet_prefix in CPU order */ c->u.ibpkey.subnet_prefix = be64_to_cpu(prefixbuf[0]); rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto out; pkey_lo = le32_to_cpu(buf[0]); pkey_hi = le32_to_cpu(buf[1]); if (pkey_lo > U16_MAX || pkey_hi > U16_MAX) { rc = -EINVAL; goto out; } c->u.ibpkey.low_pkey = pkey_lo; c->u.ibpkey.high_pkey = pkey_hi; rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; } case OCON_IBENDPORT: { u32 port; rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto out; len = le32_to_cpu(buf[0]); rc = str_read(&c->u.ibendport.dev_name, GFP_KERNEL, fp, len); if (rc) goto out; port = le32_to_cpu(buf[1]); if (port > U8_MAX || port == 0) { rc = -EINVAL; goto out; } c->u.ibendport.port = port; rc = context_read_and_validate(&c->context[0], p, fp); if (rc) goto out; break; } /* end case */ } /* end switch */ } } rc = 0; out: return rc; } /* * Read the configuration data from a policy database binary * representation file into a policy database structure. 
*/ int policydb_read(struct policydb *p, struct policy_file *fp) { struct role_allow *ra, *lra; struct role_trans_key *rtk = NULL; struct role_trans_datum *rtd = NULL; int rc; __le32 buf[4]; u32 i, j, len, nprim, nel, perm; char *policydb_str; const struct policydb_compat_info *info; policydb_init(p); /* Read the magic number and string length. */ rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto bad; rc = -EINVAL; if (le32_to_cpu(buf[0]) != POLICYDB_MAGIC) { pr_err("SELinux: policydb magic number 0x%x does " "not match expected magic number 0x%x\n", le32_to_cpu(buf[0]), POLICYDB_MAGIC); goto bad; } rc = -EINVAL; len = le32_to_cpu(buf[1]); if (len != strlen(POLICYDB_STRING)) { pr_err("SELinux: policydb string length %d does not " "match expected length %zu\n", len, strlen(POLICYDB_STRING)); goto bad; } rc = str_read(&policydb_str, GFP_KERNEL, fp, len); if (rc) { if (rc == -ENOMEM) { pr_err("SELinux: unable to allocate memory for policydb string of length %d\n", len); } else { pr_err("SELinux: truncated policydb string identifier\n"); } goto bad; } rc = -EINVAL; if (strcmp(policydb_str, POLICYDB_STRING)) { pr_err("SELinux: policydb string %s does not match " "my string %s\n", policydb_str, POLICYDB_STRING); kfree(policydb_str); goto bad; } /* Done with policydb_str. */ kfree(policydb_str); policydb_str = NULL; /* Read the version and table sizes. 
*/ rc = next_entry(buf, fp, sizeof(u32) * 4); if (rc) goto bad; rc = -EINVAL; p->policyvers = le32_to_cpu(buf[0]); if (p->policyvers < POLICYDB_VERSION_MIN || p->policyvers > POLICYDB_VERSION_MAX) { pr_err("SELinux: policydb version %d does not match " "my version range %d-%d\n", le32_to_cpu(buf[0]), POLICYDB_VERSION_MIN, POLICYDB_VERSION_MAX); goto bad; } if ((le32_to_cpu(buf[1]) & POLICYDB_CONFIG_MLS)) { p->mls_enabled = 1; rc = -EINVAL; if (p->policyvers < POLICYDB_VERSION_MLS) { pr_err("SELinux: security policydb version %d " "(MLS) not backwards compatible\n", p->policyvers); goto bad; } } p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN); p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN); if (p->policyvers >= POLICYDB_VERSION_POLCAP) { rc = ebitmap_read(&p->policycaps, fp); if (rc) goto bad; } if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) { rc = ebitmap_read(&p->permissive_map, fp); if (rc) goto bad; } rc = -EINVAL; info = policydb_lookup_compat(p->policyvers); if (!info) { pr_err("SELinux: unable to find policy compat info " "for version %d\n", p->policyvers); goto bad; } rc = -EINVAL; if (le32_to_cpu(buf[2]) != info->sym_num || le32_to_cpu(buf[3]) != info->ocon_num) { pr_err("SELinux: policydb table sizes (%d,%d) do " "not match mine (%d,%d)\n", le32_to_cpu(buf[2]), le32_to_cpu(buf[3]), info->sym_num, info->ocon_num); goto bad; } for (i = 0; i < info->sym_num; i++) { rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto bad; nprim = le32_to_cpu(buf[0]); nel = le32_to_cpu(buf[1]); rc = symtab_init(&p->symtab[i], nel); if (rc) goto out; if (i == SYM_ROLES) { rc = roles_init(p); if (rc) goto out; } for (j = 0; j < nel; j++) { rc = read_f[i](p, &p->symtab[i], fp); if (rc) goto bad; } p->symtab[i].nprim = nprim; } rc = -EINVAL; p->process_class = string_to_security_class(p, "process"); if (!p->process_class) { pr_err("SELinux: process class is required, not defined in policy\n"); goto bad; } rc = avtab_read(&p->te_avtab, fp, p); if (rc) 
goto bad; if (p->policyvers >= POLICYDB_VERSION_BOOL) { rc = cond_read_list(p, fp); if (rc) goto bad; } rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto bad; nel = le32_to_cpu(buf[0]); rc = hashtab_init(&p->role_tr, nel); if (rc) goto bad; for (i = 0; i < nel; i++) { rc = -ENOMEM; rtk = kmalloc(sizeof(*rtk), GFP_KERNEL); if (!rtk) goto bad; rc = -ENOMEM; rtd = kmalloc(sizeof(*rtd), GFP_KERNEL); if (!rtd) goto bad; rc = next_entry(buf, fp, sizeof(u32) * 3); if (rc) goto bad; rtk->role = le32_to_cpu(buf[0]); rtk->type = le32_to_cpu(buf[1]); rtd->new_role = le32_to_cpu(buf[2]); if (p->policyvers >= POLICYDB_VERSION_ROLETRANS) { rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto bad; rtk->tclass = le32_to_cpu(buf[0]); } else rtk->tclass = p->process_class; rc = -EINVAL; if (!policydb_role_isvalid(p, rtk->role) || !policydb_type_isvalid(p, rtk->type) || !policydb_class_isvalid(p, rtk->tclass) || !policydb_role_isvalid(p, rtd->new_role)) goto bad; rc = hashtab_insert(&p->role_tr, rtk, rtd, roletr_key_params); if (rc) goto bad; rtk = NULL; rtd = NULL; } hash_eval(&p->role_tr, "roletr", NULL); rc = next_entry(buf, fp, sizeof(u32)); if (rc) goto bad; nel = le32_to_cpu(buf[0]); lra = NULL; for (i = 0; i < nel; i++) { rc = -ENOMEM; ra = kzalloc(sizeof(*ra), GFP_KERNEL); if (!ra) goto bad; if (lra) lra->next = ra; else p->role_allow = ra; rc = next_entry(buf, fp, sizeof(u32) * 2); if (rc) goto bad; rc = -EINVAL; ra->role = le32_to_cpu(buf[0]); ra->new_role = le32_to_cpu(buf[1]); if (!policydb_role_isvalid(p, ra->role) || !policydb_role_isvalid(p, ra->new_role)) goto bad; lra = ra; } rc = filename_trans_read(p, fp); if (rc) goto bad; rc = policydb_index(p); if (rc) goto bad; rc = -EINVAL; perm = string_to_av_perm(p, p->process_class, "transition"); if (!perm) { pr_err("SELinux: process transition permission is required, not defined in policy\n"); goto bad; } p->process_trans_perms = perm; perm = string_to_av_perm(p, p->process_class, "dyntransition"); if (!perm) { 
pr_err("SELinux: process dyntransition permission is required, not defined in policy\n"); goto bad; } p->process_trans_perms |= perm; rc = ocontext_read(p, info, fp); if (rc) goto bad; rc = genfs_read(p, fp); if (rc) goto bad; rc = range_read(p, fp); if (rc) goto bad; rc = -ENOMEM; p->type_attr_map_array = kvcalloc( p->p_types.nprim, sizeof(*p->type_attr_map_array), GFP_KERNEL); if (!p->type_attr_map_array) goto bad; /* just in case ebitmap_init() becomes more than just a memset(0): */ for (i = 0; i < p->p_types.nprim; i++) ebitmap_init(&p->type_attr_map_array[i]); for (i = 0; i < p->p_types.nprim; i++) { struct ebitmap *e = &p->type_attr_map_array[i]; if (p->policyvers >= POLICYDB_VERSION_AVTAB) { rc = ebitmap_read(e, fp); if (rc) goto bad; } /* add the type itself as the degenerate case */ rc = ebitmap_set_bit(e, i, 1); if (rc) goto bad; } rc = policydb_bounds_sanity_check(p); if (rc) goto bad; rc = 0; out: return rc; bad: kfree(rtk); kfree(rtd); policydb_destroy(p); goto out; } /* * Write a MLS level structure to a policydb binary * representation file. */ static int mls_write_level(struct mls_level *l, struct policy_file *fp) { __le32 buf[1]; int rc; buf[0] = cpu_to_le32(l->sens); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = ebitmap_write(&l->cat, fp); if (rc) return rc; return 0; } /* * Write a MLS range structure to a policydb binary * representation file. 
*/ static int mls_write_range_helper(struct mls_range *r, struct policy_file *fp) { __le32 buf[3]; size_t items; int rc, eq; eq = mls_level_eq(&r->level[1], &r->level[0]); if (eq) items = 2; else items = 3; buf[0] = cpu_to_le32(items - 1); buf[1] = cpu_to_le32(r->level[0].sens); if (!eq) buf[2] = cpu_to_le32(r->level[1].sens); BUG_ON(items > ARRAY_SIZE(buf)); rc = put_entry(buf, sizeof(u32), items, fp); if (rc) return rc; rc = ebitmap_write(&r->level[0].cat, fp); if (rc) return rc; if (!eq) { rc = ebitmap_write(&r->level[1].cat, fp); if (rc) return rc; } return 0; } static int sens_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct level_datum *levdatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; __le32 buf[2]; size_t len; int rc; len = strlen(key); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(levdatum->isalias); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; rc = mls_write_level(&levdatum->level, fp); if (rc) return rc; return 0; } static int cat_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct cat_datum *catdatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; __le32 buf[3]; size_t len; int rc; len = strlen(key); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(catdatum->value); buf[2] = cpu_to_le32(catdatum->isalias); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; return 0; } static int role_trans_write_one(void *key, void *datum, void *ptr) { struct role_trans_key *rtk = key; struct role_trans_datum *rtd = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; struct policydb *p = pd->p; __le32 buf[3]; int rc; buf[0] = cpu_to_le32(rtk->role); buf[1] = cpu_to_le32(rtk->type); buf[2] = cpu_to_le32(rtd->new_role); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_ROLETRANS) { buf[0] = 
cpu_to_le32(rtk->tclass); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; } return 0; } static int role_trans_write(struct policydb *p, struct policy_file *fp) { struct policy_data pd = { .p = p, .fp = fp }; __le32 buf[1]; int rc; buf[0] = cpu_to_le32(p->role_tr.nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; return hashtab_map(&p->role_tr, role_trans_write_one, &pd); } static int role_allow_write(struct role_allow *r, struct policy_file *fp) { struct role_allow *ra; __le32 buf[2]; size_t nel; int rc; nel = 0; for (ra = r; ra; ra = ra->next) nel++; buf[0] = cpu_to_le32(nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (ra = r; ra; ra = ra->next) { buf[0] = cpu_to_le32(ra->role); buf[1] = cpu_to_le32(ra->new_role); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; } return 0; } /* * Write a security context structure * to a policydb binary representation file. */ static int context_write(struct policydb *p, struct context *c, struct policy_file *fp) { int rc; __le32 buf[3]; buf[0] = cpu_to_le32(c->user); buf[1] = cpu_to_le32(c->role); buf[2] = cpu_to_le32(c->type); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; rc = mls_write_range_helper(&c->range, fp); if (rc) return rc; return 0; } /* * The following *_write functions are used to * write the symbol data to a policy database * binary representation file. 
*/ static int perm_write(void *vkey, void *datum, void *fp) { char *key = vkey; struct perm_datum *perdatum = datum; __le32 buf[2]; size_t len; int rc; len = strlen(key); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(perdatum->value); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; return 0; } static int common_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct common_datum *comdatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; __le32 buf[4]; size_t len; int rc; len = strlen(key); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(comdatum->value); buf[2] = cpu_to_le32(comdatum->permissions.nprim); buf[3] = cpu_to_le32(comdatum->permissions.table.nel); rc = put_entry(buf, sizeof(u32), 4, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; rc = hashtab_map(&comdatum->permissions.table, perm_write, fp); if (rc) return rc; return 0; } static int type_set_write(struct type_set *t, struct policy_file *fp) { int rc; __le32 buf[1]; if (ebitmap_write(&t->types, fp)) return -EINVAL; if (ebitmap_write(&t->negset, fp)) return -EINVAL; buf[0] = cpu_to_le32(t->flags); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return -EINVAL; return 0; } static int write_cons_helper(struct policydb *p, struct constraint_node *node, struct policy_file *fp) { struct constraint_node *c; struct constraint_expr *e; __le32 buf[3]; u32 nel; int rc; for (c = node; c; c = c->next) { nel = 0; for (e = c->expr; e; e = e->next) nel++; buf[0] = cpu_to_le32(c->permissions); buf[1] = cpu_to_le32(nel); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; for (e = c->expr; e; e = e->next) { buf[0] = cpu_to_le32(e->expr_type); buf[1] = cpu_to_le32(e->attr); buf[2] = cpu_to_le32(e->op); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; switch (e->expr_type) { case CEXPR_NAMES: rc = ebitmap_write(&e->names, fp); if (rc) return rc; if (p->policyvers >= 
POLICYDB_VERSION_CONSTRAINT_NAMES) { rc = type_set_write(e->type_names, fp); if (rc) return rc; } break; default: break; } } } return 0; } static int class_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct class_datum *cladatum = datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; struct policydb *p = pd->p; struct constraint_node *c; __le32 buf[6]; u32 ncons; size_t len, len2; int rc; len = strlen(key); if (cladatum->comkey) len2 = strlen(cladatum->comkey); else len2 = 0; ncons = 0; for (c = cladatum->constraints; c; c = c->next) ncons++; buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(len2); buf[2] = cpu_to_le32(cladatum->value); buf[3] = cpu_to_le32(cladatum->permissions.nprim); buf[4] = cpu_to_le32(cladatum->permissions.table.nel); buf[5] = cpu_to_le32(ncons); rc = put_entry(buf, sizeof(u32), 6, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; if (cladatum->comkey) { rc = put_entry(cladatum->comkey, 1, len2, fp); if (rc) return rc; } rc = hashtab_map(&cladatum->permissions.table, perm_write, fp); if (rc) return rc; rc = write_cons_helper(p, cladatum->constraints, fp); if (rc) return rc; /* write out the validatetrans rule */ ncons = 0; for (c = cladatum->validatetrans; c; c = c->next) ncons++; buf[0] = cpu_to_le32(ncons); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = write_cons_helper(p, cladatum->validatetrans, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_NEW_OBJECT_DEFAULTS) { buf[0] = cpu_to_le32(cladatum->default_user); buf[1] = cpu_to_le32(cladatum->default_role); buf[2] = cpu_to_le32(cladatum->default_range); rc = put_entry(buf, sizeof(uint32_t), 3, fp); if (rc) return rc; } if (p->policyvers >= POLICYDB_VERSION_DEFAULT_TYPE) { buf[0] = cpu_to_le32(cladatum->default_type); rc = put_entry(buf, sizeof(uint32_t), 1, fp); if (rc) return rc; } return 0; } static int role_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct role_datum *role = 
datum; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; struct policydb *p = pd->p; __le32 buf[3]; size_t items, len; int rc; len = strlen(key); items = 0; buf[items++] = cpu_to_le32(len); buf[items++] = cpu_to_le32(role->value); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) buf[items++] = cpu_to_le32(role->bounds); BUG_ON(items > ARRAY_SIZE(buf)); rc = put_entry(buf, sizeof(u32), items, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; rc = ebitmap_write(&role->dominates, fp); if (rc) return rc; rc = ebitmap_write(&role->types, fp); if (rc) return rc; return 0; } static int type_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct type_datum *typdatum = datum; struct policy_data *pd = ptr; struct policydb *p = pd->p; struct policy_file *fp = pd->fp; __le32 buf[4]; int rc; size_t items, len; len = strlen(key); items = 0; buf[items++] = cpu_to_le32(len); buf[items++] = cpu_to_le32(typdatum->value); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) { u32 properties = 0; if (typdatum->primary) properties |= TYPEDATUM_PROPERTY_PRIMARY; if (typdatum->attribute) properties |= TYPEDATUM_PROPERTY_ATTRIBUTE; buf[items++] = cpu_to_le32(properties); buf[items++] = cpu_to_le32(typdatum->bounds); } else { buf[items++] = cpu_to_le32(typdatum->primary); } BUG_ON(items > ARRAY_SIZE(buf)); rc = put_entry(buf, sizeof(u32), items, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; return 0; } static int user_write(void *vkey, void *datum, void *ptr) { char *key = vkey; struct user_datum *usrdatum = datum; struct policy_data *pd = ptr; struct policydb *p = pd->p; struct policy_file *fp = pd->fp; __le32 buf[3]; size_t items, len; int rc; len = strlen(key); items = 0; buf[items++] = cpu_to_le32(len); buf[items++] = cpu_to_le32(usrdatum->value); if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) buf[items++] = cpu_to_le32(usrdatum->bounds); BUG_ON(items > ARRAY_SIZE(buf)); rc = put_entry(buf, sizeof(u32), 
items, fp); if (rc) return rc; rc = put_entry(key, 1, len, fp); if (rc) return rc; rc = ebitmap_write(&usrdatum->roles, fp); if (rc) return rc; rc = mls_write_range_helper(&usrdatum->range, fp); if (rc) return rc; rc = mls_write_level(&usrdatum->dfltlevel, fp); if (rc) return rc; return 0; } /* clang-format off */ static int (*const write_f[SYM_NUM])(void *key, void *datum, void *datap) = { common_write, class_write, role_write, type_write, user_write, cond_write_bool, sens_write, cat_write, }; /* clang-format on */ static int ocontext_write(struct policydb *p, const struct policydb_compat_info *info, struct policy_file *fp) { unsigned int i, j; int rc; size_t nel, len; __be64 prefixbuf[1]; __le32 buf[3]; u32 nodebuf[8]; struct ocontext *c; for (i = 0; i < info->ocon_num; i++) { nel = 0; for (c = p->ocontexts[i]; c; c = c->next) nel++; buf[0] = cpu_to_le32(nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (c = p->ocontexts[i]; c; c = c->next) { switch (i) { case OCON_ISID: buf[0] = cpu_to_le32(c->sid[0]); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_FS: case OCON_NETIF: len = strlen(c->u.name); buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(c->u.name, 1, len, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; rc = context_write(p, &c->context[1], fp); if (rc) return rc; break; case OCON_PORT: buf[0] = cpu_to_le32(c->u.port.protocol); buf[1] = cpu_to_le32(c->u.port.low_port); buf[2] = cpu_to_le32(c->u.port.high_port); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_NODE: nodebuf[0] = c->u.node.addr; /* network order */ nodebuf[1] = c->u.node.mask; /* network order */ rc = put_entry(nodebuf, sizeof(u32), 2, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if 
(rc) return rc; break; case OCON_FSUSE: buf[0] = cpu_to_le32(c->v.behavior); len = strlen(c->u.name); buf[1] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(c->u.name, 1, len, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_NODE6: for (j = 0; j < 4; j++) nodebuf[j] = c->u.node6.addr [j]; /* network order */ for (j = 0; j < 4; j++) nodebuf[j + 4] = c->u.node6.mask [j]; /* network order */ rc = put_entry(nodebuf, sizeof(u32), 8, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_IBPKEY: /* subnet_prefix is in CPU order */ prefixbuf[0] = cpu_to_be64(c->u.ibpkey.subnet_prefix); rc = put_entry(prefixbuf, sizeof(u64), 1, fp); if (rc) return rc; buf[0] = cpu_to_le32(c->u.ibpkey.low_pkey); buf[1] = cpu_to_le32(c->u.ibpkey.high_pkey); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; case OCON_IBENDPORT: len = strlen(c->u.ibendport.dev_name); buf[0] = cpu_to_le32(len); buf[1] = cpu_to_le32(c->u.ibendport.port); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(c->u.ibendport.dev_name, 1, len, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; break; } } } return 0; } static int genfs_write(struct policydb *p, struct policy_file *fp) { struct genfs *genfs; struct ocontext *c; size_t len; __le32 buf[1]; int rc; len = 0; for (genfs = p->genfs; genfs; genfs = genfs->next) len++; buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (genfs = p->genfs; genfs; genfs = genfs->next) { len = strlen(genfs->fstype); buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(genfs->fstype, 1, len, fp); if (rc) return rc; len = 0; for (c = genfs->head; c; c = c->next) len++; buf[0] = cpu_to_le32(len); rc = put_entry(buf, 
sizeof(u32), 1, fp); if (rc) return rc; for (c = genfs->head; c; c = c->next) { len = strlen(c->u.name); buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(c->u.name, 1, len, fp); if (rc) return rc; buf[0] = cpu_to_le32(c->v.sclass); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = context_write(p, &c->context[0], fp); if (rc) return rc; } } return 0; } static int range_write_helper(void *key, void *data, void *ptr) { __le32 buf[2]; struct range_trans *rt = key; struct mls_range *r = data; struct policy_data *pd = ptr; struct policy_file *fp = pd->fp; struct policydb *p = pd->p; int rc; buf[0] = cpu_to_le32(rt->source_type); buf[1] = cpu_to_le32(rt->target_type); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) { buf[0] = cpu_to_le32(rt->target_class); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; } rc = mls_write_range_helper(r, fp); if (rc) return rc; return 0; } static int range_write(struct policydb *p, struct policy_file *fp) { __le32 buf[1]; int rc; struct policy_data pd; pd.p = p; pd.fp = fp; buf[0] = cpu_to_le32(p->range_tr.nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; /* actually write all of the entries */ rc = hashtab_map(&p->range_tr, range_write_helper, &pd); if (rc) return rc; return 0; } static int filename_write_helper_compat(void *key, void *data, void *ptr) { struct filename_trans_key *ft = key; struct filename_trans_datum *datum = data; struct ebitmap_node *node; struct policy_file *fp = ptr; __le32 buf[4]; int rc; u32 bit, len = strlen(ft->name); do { ebitmap_for_each_positive_bit(&datum->stypes, node, bit) { buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(ft->name, sizeof(char), len, fp); if (rc) return rc; buf[0] = cpu_to_le32(bit + 1); buf[1] = cpu_to_le32(ft->ttype); buf[2] = cpu_to_le32(ft->tclass); buf[3] = 
cpu_to_le32(datum->otype); rc = put_entry(buf, sizeof(u32), 4, fp); if (rc) return rc; } datum = datum->next; } while (unlikely(datum)); return 0; } static int filename_write_helper(void *key, void *data, void *ptr) { struct filename_trans_key *ft = key; struct filename_trans_datum *datum; struct policy_file *fp = ptr; __le32 buf[3]; int rc; u32 ndatum, len = strlen(ft->name); buf[0] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = put_entry(ft->name, sizeof(char), len, fp); if (rc) return rc; ndatum = 0; datum = data; do { ndatum++; datum = datum->next; } while (unlikely(datum)); buf[0] = cpu_to_le32(ft->ttype); buf[1] = cpu_to_le32(ft->tclass); buf[2] = cpu_to_le32(ndatum); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; datum = data; do { rc = ebitmap_write(&datum->stypes, fp); if (rc) return rc; buf[0] = cpu_to_le32(datum->otype); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; datum = datum->next; } while (unlikely(datum)); return 0; } static int filename_trans_write(struct policydb *p, struct policy_file *fp) { __le32 buf[1]; int rc; if (p->policyvers < POLICYDB_VERSION_FILENAME_TRANS) return 0; if (p->policyvers < POLICYDB_VERSION_COMP_FTRANS) { buf[0] = cpu_to_le32(p->compat_filename_trans_count); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = hashtab_map(&p->filename_trans, filename_write_helper_compat, fp); } else { buf[0] = cpu_to_le32(p->filename_trans.nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; rc = hashtab_map(&p->filename_trans, filename_write_helper, fp); } return rc; } /* * Write the configuration data in a policy database * structure to a policy database binary representation * file. */ int policydb_write(struct policydb *p, struct policy_file *fp) { unsigned int num_syms; int rc; __le32 buf[4]; u32 config, i; size_t len; const struct policydb_compat_info *info; /* * refuse to write policy older than compressed avtab * to simplify the writer. 
There are other tests dropped * since we assume this throughout the writer code. Be * careful if you ever try to remove this restriction */ if (p->policyvers < POLICYDB_VERSION_AVTAB) { pr_err("SELinux: refusing to write policy version %d." " Because it is less than version %d\n", p->policyvers, POLICYDB_VERSION_AVTAB); return -EINVAL; } config = 0; if (p->mls_enabled) config |= POLICYDB_CONFIG_MLS; if (p->reject_unknown) config |= REJECT_UNKNOWN; if (p->allow_unknown) config |= ALLOW_UNKNOWN; /* Write the magic number and string identifiers. */ buf[0] = cpu_to_le32(POLICYDB_MAGIC); len = strlen(POLICYDB_STRING); buf[1] = cpu_to_le32(len); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = put_entry(POLICYDB_STRING, 1, len, fp); if (rc) return rc; /* Write the version, config, and table sizes. */ info = policydb_lookup_compat(p->policyvers); if (!info) { pr_err("SELinux: compatibility lookup failed for policy " "version %d\n", p->policyvers); return -EINVAL; } buf[0] = cpu_to_le32(p->policyvers); buf[1] = cpu_to_le32(config); buf[2] = cpu_to_le32(info->sym_num); buf[3] = cpu_to_le32(info->ocon_num); rc = put_entry(buf, sizeof(u32), 4, fp); if (rc) return rc; if (p->policyvers >= POLICYDB_VERSION_POLCAP) { rc = ebitmap_write(&p->policycaps, fp); if (rc) return rc; } if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) { rc = ebitmap_write(&p->permissive_map, fp); if (rc) return rc; } num_syms = info->sym_num; for (i = 0; i < num_syms; i++) { struct policy_data pd; pd.fp = fp; pd.p = p; buf[0] = cpu_to_le32(p->symtab[i].nprim); buf[1] = cpu_to_le32(p->symtab[i].table.nel); rc = put_entry(buf, sizeof(u32), 2, fp); if (rc) return rc; rc = hashtab_map(&p->symtab[i].table, write_f[i], &pd); if (rc) return rc; } rc = avtab_write(p, &p->te_avtab, fp); if (rc) return rc; rc = cond_write_list(p, fp); if (rc) return rc; rc = role_trans_write(p, fp); if (rc) return rc; rc = role_allow_write(p->role_allow, fp); if (rc) return rc; rc = filename_trans_write(p, fp); if 
(rc) return rc; rc = ocontext_write(p, info, fp); if (rc) return rc; rc = genfs_write(p, fp); if (rc) return rc; rc = range_write(p, fp); if (rc) return rc; for (i = 0; i < p->p_types.nprim; i++) { struct ebitmap *e = &p->type_attr_map_array[i]; rc = ebitmap_write(e, fp); if (rc) return rc; } return 0; }
89 142 127 2 14 1 2 124 13 1 9 127 1 57 77 2 86 47 1 63 69 1 126 49 77 79 46 42 57 68 131 131 6 2 25 122 4 118 4 1 121 297 347 4 2 345 306 52 52 342 19 19 7 19 19 348 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 // SPDX-License-Identifier: GPL-2.0-only /* * VLAN netlink control interface * * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include "vlan.h" static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = { [IFLA_VLAN_ID] = { .type = NLA_U16 }, [IFLA_VLAN_FLAGS] = { .len = sizeof(struct ifla_vlan_flags) }, [IFLA_VLAN_EGRESS_QOS] = { .type = NLA_NESTED }, [IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED }, [IFLA_VLAN_PROTOCOL] = { .type = NLA_U16 }, }; static const struct 
nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = { [IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) }, }; static inline int vlan_validate_qos_map(struct nlattr *attr) { if (!attr) return 0; return nla_validate_nested_deprecated(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy, NULL); } static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ifla_vlan_flags *flags; u16 id; int err; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EADDRNOTAVAIL; } } if (!data) { NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified"); return -EINVAL; } if (data[IFLA_VLAN_PROTOCOL]) { switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): break; default: NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol"); return -EPROTONOSUPPORT; } } if (data[IFLA_VLAN_ID]) { id = nla_get_u16(data[IFLA_VLAN_ID]); if (id >= VLAN_VID_MASK) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id"); return -ERANGE; } } if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); if ((flags->flags & flags->mask) & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | VLAN_FLAG_BRIDGE_BINDING)) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags"); return -EINVAL; } } err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]); if (err < 0) { NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map"); return err; } err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]); if (err < 0) { NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map"); return err; } return 0; } static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ifla_vlan_flags *flags; struct ifla_vlan_qos_mapping *m; struct 
nlattr *attr; int rem, err; if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); err = vlan_dev_change_flags(dev, flags->flags, flags->mask); if (err) return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, data[IFLA_VLAN_INGRESS_QOS], rem) { m = nla_data(attr); vlan_dev_set_ingress_priority(dev, m->to, m->from); } } if (data[IFLA_VLAN_EGRESS_QOS]) { nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, data[IFLA_VLAN_EGRESS_QOS], rem) { m = nla_data(attr); err = vlan_dev_set_egress_priority(dev, m->from, m->to); if (err) return err; } } return 0; } static int vlan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct net_device *real_dev; unsigned int max_mtu; __be16 proto; int err; if (!data[IFLA_VLAN_ID]) { NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified"); return -EINVAL; } if (!tb[IFLA_LINK]) { NL_SET_ERR_MSG_MOD(extack, "link not specified"); return -EINVAL; } real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) { NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; } proto = nla_get_be16_default(data[IFLA_VLAN_PROTOCOL], htons(ETH_P_8021Q)); vlan->vlan_proto = proto; vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]); vlan->real_dev = real_dev; dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlan->flags = VLAN_FLAG_REORDER_HDR; err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id, extack); if (err < 0) return err; max_mtu = netif_reduces_vlan_mtu(real_dev) ? real_dev->mtu - VLAN_HLEN : real_dev->mtu; if (!tb[IFLA_MTU]) dev->mtu = max_mtu; else if (dev->mtu > max_mtu) return -EINVAL; /* Note: If this initial vlan_changelink() fails, we need * to call vlan_dev_free_egress_priority() to free memory. 
*/ err = vlan_changelink(dev, tb, data, extack); if (!err) err = register_vlan_dev(dev, extack); if (err) vlan_dev_free_egress_priority(dev); return err; } static inline size_t vlan_qos_map_size(unsigned int n) { if (n == 0) return 0; /* IFLA_VLAN_{EGRESS,INGRESS}_QOS + n * IFLA_VLAN_QOS_MAPPING */ return nla_total_size(sizeof(struct nlattr)) + nla_total_size(sizeof(struct ifla_vlan_qos_mapping)) * n; } static size_t vlan_get_size(const struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return nla_total_size(2) + /* IFLA_VLAN_PROTOCOL */ nla_total_size(2) + /* IFLA_VLAN_ID */ nla_total_size(sizeof(struct ifla_vlan_flags)) + /* IFLA_VLAN_FLAGS */ vlan_qos_map_size(vlan->nr_ingress_mappings) + vlan_qos_map_size(vlan->nr_egress_mappings); } static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_priority_tci_mapping *pm; struct ifla_vlan_flags f; struct ifla_vlan_qos_mapping m; struct nlattr *nest; unsigned int i; if (nla_put_be16(skb, IFLA_VLAN_PROTOCOL, vlan->vlan_proto) || nla_put_u16(skb, IFLA_VLAN_ID, vlan->vlan_id)) goto nla_put_failure; if (vlan->flags) { f.flags = vlan->flags; f.mask = ~0; if (nla_put(skb, IFLA_VLAN_FLAGS, sizeof(f), &f)) goto nla_put_failure; } if (vlan->nr_ingress_mappings) { nest = nla_nest_start_noflag(skb, IFLA_VLAN_INGRESS_QOS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < ARRAY_SIZE(vlan->ingress_priority_map); i++) { if (!vlan->ingress_priority_map[i]) continue; m.from = i; m.to = vlan->ingress_priority_map[i]; if (nla_put(skb, IFLA_VLAN_QOS_MAPPING, sizeof(m), &m)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (vlan->nr_egress_mappings) { nest = nla_nest_start_noflag(skb, IFLA_VLAN_EGRESS_QOS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { for (pm = vlan->egress_priority_map[i]; pm; pm = pm->next) { if (!pm->vlan_qos) continue; m.from = pm->priority; m.to 
= (pm->vlan_qos >> 13) & 0x7; if (nla_put(skb, IFLA_VLAN_QOS_MAPPING, sizeof(m), &m)) goto nla_put_failure; } } nla_nest_end(skb, nest); } return 0; nla_put_failure: return -EMSGSIZE; } static struct net *vlan_get_link_net(const struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return dev_net(real_dev); } struct rtnl_link_ops vlan_link_ops __read_mostly = { .kind = "vlan", .maxtype = IFLA_VLAN_MAX, .policy = vlan_policy, .priv_size = sizeof(struct vlan_dev_priv), .setup = vlan_setup, .validate = vlan_validate, .newlink = vlan_newlink, .changelink = vlan_changelink, .dellink = unregister_vlan_dev, .get_size = vlan_get_size, .fill_info = vlan_fill_info, .get_link_net = vlan_get_link_net, }; int __init vlan_netlink_init(void) { return rtnl_link_register(&vlan_link_ops); } void __exit vlan_netlink_fini(void) { rtnl_link_unregister(&vlan_link_ops); } MODULE_ALIAS_RTNL_LINK("vlan");
413 453 245 12 2239 26 21918 109 107 8 20070 913 385 3743 2293 285 5 1 22218 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Berkeley style UIO structures - Alan Cox 1994. 
*/
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H

#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <linux/ucopysize.h>
#include <uapi/linux/uio.h>

struct page;
struct folio_queue;

/* Flag type for page extraction, marked __bitwise for static checking. */
typedef unsigned int __bitwise iov_iter_extraction_t;

/* One element of a kernel vector: base pointer plus length. */
struct kvec {
	void *iov_base; /* and that should *never* hold a userland pointer */
	size_t iov_len;
};

/* Kind of backing store an iov_iter walks; stored in iov_iter.iter_type. */
enum iter_type {
	/* iter types */
	ITER_UBUF,
	ITER_IOVEC,
	ITER_BVEC,
	ITER_KVEC,
	ITER_FOLIOQ,
	ITER_XARRAY,
	ITER_DISCARD,
};

/* Direction values; they coincide with WRITE and READ respectively. */
#define ITER_SOURCE	1	// == WRITE
#define ITER_DEST	0	// == READ

/* The iterator fields that iov_iter_save_state() copies out. */
struct iov_iter_state {
	size_t iov_offset;
	size_t count;
	unsigned long nr_segs;
};

/*
 * Iterator over one of the backing stores listed in enum iter_type.
 * iter_type holds the enum iter_type value and data_source holds the
 * direction (non-zero is reported as WRITE by iov_iter_rw()).
 */
struct iov_iter {
	u8 iter_type;
	bool nofault;
	bool data_source;
	size_t iov_offset;
	/*
	 * Hack alert: overlay ubuf_iovec with iovec + count, so
	 * that the members resolve correctly regardless of the type
	 * of iterator used. This means that you can use:
	 *
	 * &iter->__ubuf_iovec or iter->__iov
	 *
	 * interchangeably for the user_backed cases, hence simplifying
	 * some of the cases that need to deal with both.
	 */
	union {
		/*
		 * This really should be a const, but we cannot do that without
		 * also modifying any of the zero-filling iter init functions.
		 * Leave it non-const for now, but it should be treated as such.
*/
		struct iovec __ubuf_iovec;
		struct {
			union {
				/* use iter_iov() to get the current vec */
				const struct iovec *__iov;
				const struct kvec *kvec;
				const struct bio_vec *bvec;
				const struct folio_queue *folioq;
				struct xarray *xarray;
				void __user *ubuf;
			};
			size_t count;
		};
	};
	/* Per-type bookkeeping; which member is live depends on iter_type. */
	union {
		unsigned long nr_segs;
		u8 folioq_slot;
		loff_t xarray_start;
	};
};

typedef __u16 uio_meta_flags_t;

/* Flags, application tag and seed bundled with an embedded iov_iter. */
struct uio_meta {
	uio_meta_flags_t	flags;
	u16			app_tag;
	u64			seed;
	struct iov_iter		iter;
};

/*
 * Return the current iovec of a user-backed iterator. For ITER_UBUF the
 * overlaid __ubuf_iovec member is returned, otherwise the __iov pointer.
 */
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
{
	if (iter->iter_type == ITER_UBUF)
		return (const struct iovec *) &iter->__ubuf_iovec;
	return iter->__iov;
}

/* Address of the current position: base of the current vec plus iov_offset. */
#define iter_iov_addr(iter)	(iter_iov(iter)->iov_base + (iter)->iov_offset)

/*
 * Length left in the current vec: the whole count for ITER_UBUF, otherwise
 * the current iovec's iov_len minus the offset already consumed.
 */
static inline size_t iter_iov_len(const struct iov_iter *i)
{
	if (i->iter_type == ITER_UBUF)
		return i->count;
	return iter_iov(i)->iov_len - i->iov_offset;
}

/* Return the iterator's type as an enum iter_type. */
static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{
	return i->iter_type;
}

/* Copy iov_offset, count and nr_segs from @iter into @state. */
static inline void iov_iter_save_state(struct iov_iter *iter,
				       struct iov_iter_state *state)
{
	state->iov_offset = iter->iov_offset;
	state->count = iter->count;
	state->nr_segs = iter->nr_segs;
}

/* Type predicates: each compares iov_iter_type() with one enum value. */
static inline bool iter_is_ubuf(const struct iov_iter *i)
{
	return iov_iter_type(i) == ITER_UBUF;
}

static inline bool iter_is_iovec(const struct iov_iter *i)
{
	return iov_iter_type(i) == ITER_IOVEC;
}

static inline bool iov_iter_is_kvec(const struct iov_iter *i)
{
	return iov_iter_type(i) == ITER_KVEC;
}

static inline bool iov_iter_is_bvec(const struct iov_iter *i)
{
	return iov_iter_type(i) == ITER_BVEC;
}

static inline bool iov_iter_is_discard(const struct iov_iter *i)
{
	return iov_iter_type(i) == ITER_DISCARD;
}

static inline bool iov_iter_is_folioq(const struct iov_iter *i)
{
	return iov_iter_type(i) == ITER_FOLIOQ;
}

static inline bool iov_iter_is_xarray(const struct iov_iter *i)
{
	return iov_iter_type(i) == ITER_XARRAY;
}

/* Map the data_source flag to WRITE (set) or READ (clear). */
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
	return i->data_source ?
WRITE : READ; } static inline bool user_backed_iter(const struct iov_iter *i) { return iter_is_ubuf(i) || iter_is_iovec(i); } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. */ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_to_iter(&folio->page, offset, bytes, i); } static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_from_iter(&folio->page, offset, bytes, i); } size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, true)) 
return _copy_to_iter(addr, bytes, i); return 0; } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_to_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter_nocache(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter_nocache(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. 
*/
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#else
/* No flushcache support configured: alias the nocache variant. */
#define _copy_from_iter_flushcache _copy_from_iter_nocache
#endif

#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
#else
/* CONFIG_ARCH_HAS_COPY_MC not set: alias the plain copy. */
#define _copy_mc_to_iter _copy_to_iter
#endif

size_t iov_iter_zero(size_t bytes, struct iov_iter *);
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
			 unsigned len_mask);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);

/*
 * NOTE(review): the functions below are presumably the out-of-line
 * initialisers for the individual iterator kinds; their definitions are
 * not part of this header.
 */
void iov_iter_init(struct iov_iter *i, unsigned int direction,
		   const struct iovec *iov, unsigned long nr_segs,
		   size_t count);
void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
		   const struct kvec *kvec, unsigned long nr_segs,
		   size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
		   const struct bio_vec *bvec, unsigned long nr_segs,
		   size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction,
		      size_t count);
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
			  const struct folio_queue *folioq,
			  unsigned int first_slot, unsigned int offset,
			  size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count);

ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
			    size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
				  size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);

/* Return the iterator's count field. */
static inline size_t iov_iter_count(const struct iov_iter *i)
{
	return i->count;
}

/*
 * Cap the iov_iter by given limit; note that the second argument is
 * *not* the new size - it's upper limit for such.
Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } static inline int iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) { size_t shorted = 0; int npages; if (iov_iter_count(i) > max_bytes) { shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); return npages; } struct iovec *iovec_from_user(const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat); ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i); ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, void __user *buf, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_UBUF, .data_source = direction, .ubuf = buf, .count = count, .nr_segs = 1 }; } /* Flags for iov_iter_get/extract_pages*() */ /* Allow P2PDMA on the extracted pages */ #define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01) ssize_t 
iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
		       size_t maxsize, unsigned int maxpages,
		       iov_iter_extraction_t extraction_flags,
		       size_t *offset0);

/**
 * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
 * @iter: The iterator
 *
 * Examine the iterator and indicate by returning true or false as to how, if
 * at all, pages extracted from the iterator will be retained by the extraction
 * function.
 *
 * %true indicates that the pages will have a pin placed in them that the
 * caller must unpin.  This must be done for DMA/async DIO to force fork()
 * to forcibly copy a page for the child (the parent must retain the original
 * page).
 *
 * %false indicates that no measures are taken and that it's up to the caller
 * to retain the pages.
 *
 * Return: the result of user_backed_iter(@iter), i.e. %true for ITER_UBUF
 * and ITER_IOVEC iterators.
 */
static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter)
{
	return user_backed_iter(iter);
}

struct sg_table;
ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len,
			   struct sg_table *sgtable, unsigned int sg_max,
			   iov_iter_extraction_t extraction_flags);

#endif
27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  include/linux/eventfd.h
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#ifndef _LINUX_EVENTFD_H
#define _LINUX_EVENTFD_H

#include <linux/wait.h>
#include <linux/err.h>
#include <linux/percpu-defs.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <uapi/linux/eventfd.h>

/*
 * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
 * new flags, since they might collide with O_* ones. We want
 * to re-use O_* flags that couldn't possibly have a meaning
 * from eventfd, in order to leave a free define-space for
 * shared O_* flags.
 */
#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)

struct eventfd_ctx;
struct file;

#ifdef CONFIG_EVENTFD

void eventfd_ctx_put(struct eventfd_ctx *ctx);
struct file *eventfd_fget(int fd);
struct eventfd_ctx *eventfd_ctx_fdget(int fd);
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt);
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);

/* True unless the current task has its in_eventfd flag set. */
static inline bool eventfd_signal_allowed(void)
{
	return !current->in_eventfd;
}

#else /* CONFIG_EVENTFD */

/*
 * Ugly ugly ugly error layer to support modules that use eventfd but
 * pretend to work in !CONFIG_EVENTFD configurations. Namely, AIO.
*/

/* !CONFIG_EVENTFD stub: always fails with ERR_PTR(-ENOSYS). */
static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	return ERR_PTR(-ENOSYS);
}

/* !CONFIG_EVENTFD stub: does nothing. */
static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
{
}

/* !CONFIG_EVENTFD stub: does nothing. */
static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
{

}

/* !CONFIG_EVENTFD stub: always returns -ENOSYS. */
static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
						wait_queue_entry_t *wait, __u64 *cnt)
{
	return -ENOSYS;
}

/* !CONFIG_EVENTFD stub: signalling is always reported as allowed. */
static inline bool eventfd_signal_allowed(void)
{
	return true;
}

/* !CONFIG_EVENTFD stub: does nothing; *cnt is left untouched. */
static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{

}

#endif

/* Convenience wrapper: eventfd_signal_mask() with a zero poll mask. */
static inline void eventfd_signal(struct eventfd_ctx *ctx)
{
	eventfd_signal_mask(ctx, 0);
}

#endif /* _LINUX_EVENTFD_H */
56 1395 645 645 598 269 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLBFLUSH_H #define _ASM_X86_TLBFLUSH_H 
#include <linux/mm_types.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>

#include <asm/barrier.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>
#include <asm/pgtable.h>

DECLARE_PER_CPU(u64, tlbstate_untag_mask);

void __flush_tlb_all(void);

#define TLB_FLUSH_ALL	-1UL
#define TLB_GENERATION_INVALID	0

void cr4_update_irqsoff(unsigned long set, unsigned long clear);
unsigned long cr4_read_shadow(void);

/*
 * Set in this cpu's CR4. Does not touch the interrupt state itself;
 * cr4_set_bits() is the variant that saves and restores it.
 */
static inline void cr4_set_bits_irqsoff(unsigned long mask)
{
	cr4_update_irqsoff(mask, 0);
}

/*
 * Clear in this cpu's CR4. Does not touch the interrupt state itself;
 * cr4_clear_bits() is the variant that saves and restores it.
 */
static inline void cr4_clear_bits_irqsoff(unsigned long mask)
{
	cr4_update_irqsoff(0, mask);
}

/* Set in this cpu's CR4, with local interrupts disabled around the update. */
static inline void cr4_set_bits(unsigned long mask)
{
	unsigned long flags;

	local_irq_save(flags);
	cr4_set_bits_irqsoff(mask);
	local_irq_restore(flags);
}

/* Clear in this cpu's CR4, with local interrupts disabled around the update. */
static inline void cr4_clear_bits(unsigned long mask)
{
	unsigned long flags;

	local_irq_save(flags);
	cr4_clear_bits_irqsoff(mask);
	local_irq_restore(flags);
}

#ifndef MODULE
/*
 * 6 because 6 should be plenty and struct tlb_state will fit in two cache
 * lines.
 */
#define TLB_NR_DYN_ASIDS	6

/* One entry of tlb_state.ctxs[]: a context id paired with a TLB generation. */
struct tlb_context {
	u64 ctx_id;
	u64 tlb_gen;
};

struct tlb_state {
	/*
	 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
	 * are on.  This means that it may not match current->active_mm,
	 * which will contain the previous user mm when we're in lazy TLB
	 * mode even if we've already switched back to swapper_pg_dir.
	 *
	 * During switch_mm_irqs_off(), loaded_mm will be set to
	 * LOADED_MM_SWITCHING during the brief interrupts-off window
	 * when CR3 and loaded_mm would otherwise be inconsistent.  This
	 * is for nmi_uaccess_okay()'s benefit.
*/
	struct mm_struct *loaded_mm;

	/* Sentinel (never a valid pointer) stored in loaded_mm mid-switch. */
#define LOADED_MM_SWITCHING ((struct mm_struct *)1UL)

	/* Last user mm for optimizing IBPB */
	union {
		struct mm_struct	*last_user_mm;
		unsigned long		last_user_mm_spec;
	};

	u16 loaded_mm_asid;
	u16 next_asid;

	/*
	 * If set we changed the page tables in such a way that we
	 * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
	 * This tells us to go invalidate all the non-loaded ctxs[]
	 * on the next context switch.
	 *
	 * The current ctx was kept up-to-date as it ran and does not
	 * need to be invalidated.
	 */
	bool invalidate_other;

#ifdef CONFIG_ADDRESS_MASKING
	/*
	 * Active LAM mode.
	 *
	 * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM
	 * disabled.
	 */
	u8 lam;
#endif

	/*
	 * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
	 * the corresponding user PCID needs a flush next time we
	 * switch to it; see SWITCH_TO_USER_CR3.
	 */
	unsigned short user_pcid_flush_mask;

	/*
	 * Access to this CR4 shadow and to H/W CR4 is protected by
	 * disabling interrupts when modifying either one.
	 */
	unsigned long cr4;

	/*
	 * This is a list of all contexts that might exist in the TLB.
	 * There is one per ASID that we use, and the ASID (what the
	 * CPU calls PCID) is the index into ctxs.
	 *
	 * For each context, ctx_id indicates which mm the TLB's user
	 * entries came from.  As an invariant, the TLB will never
	 * contain entries that are out-of-date as when that mm reached
	 * the tlb_gen in the list.
	 *
	 * To be clear, this means that it's legal for the TLB code to
	 * flush the TLB without updating tlb_gen.  This can happen
	 * (for now, at least) due to paravirt remote flushes.
	 *
	 * NB: context 0 is a bit special, since it's also used by
	 * various bits of init code.  This is fine -- code that
	 * isn't aware of PCID will end up harmlessly flushing
	 * context 0.
	 */
	struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
};
DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);

struct tlb_state_shared {
	/*
	 * We can be in one of several states:
	 *
	 *  - Actively using an mm.
Our CPU's bit will be set in
	 *    mm_cpumask(loaded_mm) and is_lazy == false;
	 *
	 *  - Not using a real mm.  loaded_mm == &init_mm.  Our CPU's bit
	 *    will not be set in mm_cpumask(&init_mm) and is_lazy == false.
	 *
	 *  - Lazily using a real mm.  loaded_mm != &init_mm, our bit
	 *    is set in mm_cpumask(loaded_mm), but is_lazy == true.
	 *    We're heuristically guessing that the CR3 load we
	 *    skipped more than makes up for the overhead added by
	 *    lazy mode.
	 */
	bool is_lazy;
};
DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);

bool nmi_uaccess_okay(void);
#define nmi_uaccess_okay nmi_uaccess_okay

/* Initialize cr4 shadow for this CPU from the value __read_cr4() returns. */
static inline void cr4_init_shadow(void)
{
	this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
}

extern unsigned long mmu_cr4_features;
extern u32 *trampoline_cr4_features;

/* How many pages can be invalidated with one INVLPGB. */
extern u16 invlpgb_count_max;

extern void initialize_tlbstate_and_flush(void);

/*
 * TLB flushing:
 *
 *  - flush_tlb_all() flushes all processes TLBs
 *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
 *  - flush_tlb_page(vma, vmaddr) flushes one page
 *  - flush_tlb_range(vma, start, end) flushes a range of pages
 *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
 *  - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus
 *
 * ..but the i386 has somewhat limited tlb flushing capabilities,
 * and page-granular flushes are available only on i486 and up.
 */
struct flush_tlb_info {
	/*
	 * We support several kinds of flushes.
	 *
	 * - Fully flush a single mm.  .mm will be set, .end will be
	 *   TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
	 *   which the IPI sender is trying to catch us up.
	 *
	 * - Partially flush a single mm.  .mm will be set, .start and
	 *   .end will indicate the range, and .new_tlb_gen will be set
	 *   such that the changes between generation .new_tlb_gen-1 and
	 *   .new_tlb_gen are entirely contained in the indicated range.
*
	 * - Fully flush all mms whose tlb_gens have been updated.  .mm
	 *   will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
	 *   will be zero.
	 */
	struct mm_struct	*mm;
	unsigned long		start;
	unsigned long		end;
	u64			new_tlb_gen;
	unsigned int		initiating_cpu;
	u8			stride_shift;
	u8			freed_tables;
	u8			trim_cpumask;
};

void flush_tlb_local(void);
void flush_tlb_one_user(unsigned long addr);
void flush_tlb_one_kernel(unsigned long addr);
void flush_tlb_multi(const struct cpumask *cpumask,
		      const struct flush_tlb_info *info);

/* ASIDs below TLB_NR_DYN_ASIDS are the dynamic ones. */
static inline bool is_dyn_asid(u16 asid)
{
	return asid < TLB_NR_DYN_ASIDS;
}

/* Every ASID that is not dynamic counts as global. */
static inline bool is_global_asid(u16 asid)
{
	return !is_dyn_asid(asid);
}

#ifdef CONFIG_BROADCAST_TLB_FLUSH
/*
 * Return the global ASID of @mm, or 0 when X86_FEATURE_INVLPGB is not
 * enabled. The load is an acquire, pairing with the release store in
 * mm_assign_global_asid().
 */
static inline u16 mm_global_asid(struct mm_struct *mm)
{
	u16 asid;

	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
		return 0;

	asid = smp_load_acquire(&mm->context.global_asid);

	/* mm->context.global_asid is either 0, or a global ASID */
	VM_WARN_ON_ONCE(asid && is_dyn_asid(asid));

	return asid;
}

/* Reset @mm's global ASID state; a no-op without X86_FEATURE_INVLPGB. */
static inline void mm_init_global_asid(struct mm_struct *mm)
{
	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
		mm->context.global_asid = 0;
		mm->context.asid_transition = false;
	}
}

/*
 * Publish @asid as @mm's global ASID. asid_transition is set before the
 * release store so that it is visible to anyone who observes the ASID.
 */
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid)
{
	/*
	 * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() ->
	 * finish_asid_transition() needs to observe asid_transition = true
	 * once it observes global_asid.
*/
	mm->context.asid_transition = true;
	smp_store_release(&mm->context.global_asid, asid);
}

/* Mark the ASID transition of @mm as finished. */
static inline void mm_clear_asid_transition(struct mm_struct *mm)
{
	WRITE_ONCE(mm->context.asid_transition, false);
}

/*
 * True when @mm is non-NULL and has asid_transition set; always false
 * without X86_FEATURE_INVLPGB.
 */
static inline bool mm_in_asid_transition(struct mm_struct *mm)
{
	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
		return false;

	return mm && READ_ONCE(mm->context.asid_transition);
}
#else
/* !CONFIG_BROADCAST_TLB_FLUSH stubs: no global ASIDs exist. */
static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; }
static inline void mm_init_global_asid(struct mm_struct *mm) { }
static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { }
static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
#endif /* CONFIG_BROADCAST_TLB_FLUSH */

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

/* Flush everything for @mm: full range, stride shift 0, freed_tables true. */
#define flush_tlb_mm(mm)						\
		flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)

/*
 * Flush [start, end) of the vma's mm; the stride is the huge page shift
 * for VM_HUGETLB mappings and PAGE_SHIFT otherwise.
 */
#define flush_tlb_range(vma, start, end)				\
	flush_tlb_mm_range((vma)->vm_mm, start, end,			\
			   ((vma)->vm_flags & VM_HUGETLB)		\
				? huge_page_shift(hstate_vma(vma))	\
				: PAGE_SHIFT, true)

extern void flush_tlb_all(void);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned int stride_shift,
				bool freed_tables);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);

/* Flush the single page at address @a in @vma's mm (freed_tables false). */
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
{
	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}

/*
 * Decide whether a flush for @mm should be deferred and batched: true
 * when mm_cpumask(@mm) contains any CPU other than the current one.
 * get_cpu() is paired with put_cpu() on the way out.
 */
static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
{
	bool should_defer = false;

	/* If remote CPUs need to be flushed then defer and batch the flush */
	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
		should_defer = true;
	put_cpu();

	return should_defer;
}

static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
	/*
	 * Bump the generation count.
This also serves as a full barrier
	 * that synchronizes with switch_mm(): callers are required to order
	 * their read of mm_cpumask after their writes to the paging
	 * structures.
	 */
	return atomic64_inc_return(&mm->context.tlb_gen);
}

/*
 * Record a pending batched flush for @mm: bump its TLB generation, merge
 * its cpumask into the batch and mark the batch as having unmapped pages.
 * @start and @end are not used here; secondary TLBs are invalidated for
 * the whole 0..-1UL range.
 */
static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
					     struct mm_struct *mm, unsigned long start, unsigned long end)
{
	inc_mm_tlb_gen(mm);
	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
	batch->unmapped_pages = true;
	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}

/* Resolve any batched-pending state for @mm with a full flush_tlb_mm(). */
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
	flush_tlb_mm(mm);
}

extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);

/*
 * Decide from two sets of page-table flags whether the change from
 * @oldflags to @newflags needs a TLB flush. With @ignore_access set,
 * a change of _PAGE_ACCESSED alone is disregarded.
 */
static inline bool pte_flags_need_flush(unsigned long oldflags,
					unsigned long newflags,
					bool ignore_access)
{
	/*
	 * Flags that require a flush when cleared but not when they are set.
	 * Only include flags that would not trigger spurious page-faults.
	 * Non-present entries are not cached. Hardware would set the
	 * dirty/access bit if needed without a fault.
	 */
	const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
					_PAGE_ACCESSED;
	const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
					_PAGE_SOFTW3 | _PAGE_SOFTW4 |
					_PAGE_SAVED_DIRTY;
	const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
			  _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
			  _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
			  _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX;
	unsigned long diff = oldflags ^ newflags;

	/* The three flag classes must not overlap. */
	BUILD_BUG_ON(flush_on_clear & software_flags);
	BUILD_BUG_ON(flush_on_clear & flush_on_change);
	BUILD_BUG_ON(flush_on_change & software_flags);

	/* Ignore software flags */
	diff &= ~software_flags;

	if (ignore_access)
		diff &= ~_PAGE_ACCESSED;

	/*
	 * Were any of the 'flush_on_clear' flags cleared between
	 * 'oldflags' and 'newflags'?
	 */
	if (diff & oldflags & flush_on_clear)
		return true;

	/* Flush on modified flags.
*/
	if (diff & flush_on_change)
		return true;

	/* Ensure there are no flags that were left behind */
	if (IS_ENABLED(CONFIG_DEBUG_VM) &&
	    (diff & ~(flush_on_clear | software_flags | flush_on_change))) {
		VM_WARN_ON_ONCE(1);
		return true;
	}

	return false;
}

/*
 * pte_needs_flush() checks whether permissions were demoted and require a
 * flush. It should only be used for userspace PTEs.
 *
 * Returns false when the old PTE was not present, true when the PFN
 * changed, and otherwise the verdict of pte_flags_need_flush() with the
 * accessed bit ignored.
 */
static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
{
	/* !PRESENT -> * ; no need for flush */
	if (!(pte_flags(oldpte) & _PAGE_PRESENT))
		return false;

	/* PFN changed ; needs flush */
	if (pte_pfn(oldpte) != pte_pfn(newpte))
		return true;

	/*
	 * check PTE flags; ignore access-bit; see comment in
	 * ptep_clear_flush_young().
	 */
	return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte),
				    true);
}
#define pte_needs_flush pte_needs_flush

/*
 * huge_pmd_needs_flush() checks whether permissions were demoted and require a
 * flush. It should only be used for userspace huge PMDs.
 *
 * Same structure as pte_needs_flush(), except that a change of the
 * accessed bit is not ignored.
 */
static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
{
	/* !PRESENT -> * ; no need for flush */
	if (!(pmd_flags(oldpmd) & _PAGE_PRESENT))
		return false;

	/* PFN changed ; needs flush */
	if (pmd_pfn(oldpmd) != pmd_pfn(newpmd))
		return true;

	/*
	 * check PMD flags; do not ignore access-bit; see
	 * pmdp_clear_flush_young().
*/ return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd), false); } #define huge_pmd_needs_flush huge_pmd_needs_flush #ifdef CONFIG_ADDRESS_MASKING static inline u64 tlbstate_lam_cr3_mask(void) { u64 lam = this_cpu_read(cpu_tlbstate.lam); return lam << X86_CR3_LAM_U57_BIT; } static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT); this_cpu_write(tlbstate_untag_mask, untag_mask); } #else static inline u64 tlbstate_lam_cr3_mask(void) { return 0; } static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { } #endif #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) { native_write_cr4(cr4 ^ X86_CR4_PGE); native_write_cr4(cr4); } #endif /* _ASM_X86_TLBFLUSH_H */
498 498 145 145 156 4 4 4 4 3 4 4 4 4 4 3 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 // SPDX-License-Identifier: GPL-2.0-or-later /****************************************************************************** * vlanproc.c VLAN Module. /proc filesystem interface. * * This module is completely hardware-independent and provides * access to the router using Linux /proc filesystem. 
*
 * Author:	Ben Greear, <greearb@candelatech.com> copied from wanproc.c
 *               by: Gene Kozin	<genek@compuserve.com>
 *
 * Copyright:	(c) 1998 Ben Greear
 *
 * ============================================================================
 * Jan 20, 1998        Ben Greear     Initial Version
 *****************************************************************************/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "vlanproc.h"
#include "vlan.h"

/****** Function Prototypes *************************************************/

/* Methods for preparing data for reading proc entries */
static int vlan_seq_show(struct seq_file *seq, void *v);
static void *vlan_seq_start(struct seq_file *seq, loff_t *pos);
static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos);
static void vlan_seq_stop(struct seq_file *seq, void *);
static int vlandev_seq_show(struct seq_file *seq, void *v);

/*
 *	Global Data
 */


/*
 *	Names of the proc directory entries
 */

static const char name_root[]	 = "vlan";
static const char name_conf[]	 = "config";

/*
 *	Structures for interfacing with the /proc filesystem.
 *	VLAN creates its own directory /proc/net/vlan with the following
 *	entries:
 *	config		device status/configuration
 *	<device>	entry for each  device
 */

/*
 *	Generic /proc/net/vlan/<file> file and inode operations
 */

/* seq_file iterator callbacks behind the "config" proc entry. */
static const struct seq_operations vlan_seq_ops = {
	.start = vlan_seq_start,
	.next = vlan_seq_next,
	.stop = vlan_seq_stop,
	.show = vlan_seq_show,
};

/*
 * Proc filesystem directory entries.
*/ /* Strings */ static const char *const vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = { [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID", [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD", [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD", [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID", }; /* * Interface functions */ /* * Clean up /proc/net/vlan entries */ void vlan_proc_cleanup(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); if (vn->proc_vlan_conf) remove_proc_entry(name_conf, vn->proc_vlan_dir); if (vn->proc_vlan_dir) remove_proc_entry(name_root, net->proc_net); /* Dynamically added entries should be cleaned up as their vlan_device * is removed, so we should not have to take care of it here... */ } /* * Create /proc/net/vlan entries */ int __net_init vlan_proc_init(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net); if (!vn->proc_vlan_dir) goto err; vn->proc_vlan_conf = proc_create_net(name_conf, S_IFREG | 0600, vn->proc_vlan_dir, &vlan_seq_ops, sizeof(struct seq_net_private)); if (!vn->proc_vlan_conf) goto err; return 0; err: pr_err("can't create entry in proc filesystem!\n"); vlan_proc_cleanup(net); return -ENOBUFS; } /* * Add directory entry for VLAN device. */ int vlan_proc_add_dev(struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id); if (!strcmp(vlandev->name, name_conf)) return -EINVAL; vlan->dent = proc_create_single_data(vlandev->name, S_IFREG | 0600, vn->proc_vlan_dir, vlandev_seq_show, vlandev); if (!vlan->dent) return -ENOBUFS; return 0; } /* * Delete directory entry for VLAN device. */ void vlan_proc_rem_dev(struct net_device *vlandev) { /** NOTE: This will consume the memory pointed to by dent, it seems. 
*/ proc_remove(vlan_dev_priv(vlandev)->dent); vlan_dev_priv(vlandev)->dent = NULL; } /****** Proc filesystem entry points ****************************************/ /* * The following few functions build the content of /proc/net/vlan/config */ static void *vlan_seq_from_index(struct seq_file *seq, loff_t *pos) { unsigned long ifindex = *pos; struct net_device *dev; for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { if (!is_vlan_dev(dev)) continue; *pos = dev->ifindex; return dev; } return NULL; } static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; return vlan_seq_from_index(seq, pos); } static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return vlan_seq_from_index(seq, pos); } static void vlan_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { rcu_read_unlock(); } static int vlan_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); struct vlan_net *vn = net_generic(net, vlan_net_id); if (v == SEQ_START_TOKEN) { const char *nmtype = NULL; seq_puts(seq, "VLAN Dev name | VLAN ID\n"); if (vn->name_type < ARRAY_SIZE(vlan_name_type_str)) nmtype = vlan_name_type_str[vn->name_type]; seq_printf(seq, "Name-Type: %s\n", nmtype ? 
nmtype : "UNKNOWN"); } else { const struct net_device *vlandev = v; const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); seq_printf(seq, "%-15s| %d | %s\n", vlandev->name, vlan->vlan_id, vlan->real_dev->name); } return 0; } static int vlandev_seq_show(struct seq_file *seq, void *offset) { struct net_device *vlandev = (struct net_device *) seq->private; const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats; static const char fmt64[] = "%30s %12llu\n"; int i; if (!is_vlan_dev(vlandev)) return 0; stats = dev_get_stats(vlandev, &temp); seq_printf(seq, "%s VID: %d REORDER_HDR: %i dev->priv_flags: %x\n", vlandev->name, vlan->vlan_id, (int)(vlan->flags & 1), (u32)vlandev->priv_flags); seq_printf(seq, fmt64, "total frames received", stats->rx_packets); seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes); seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast); seq_puts(seq, "\n"); seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets); seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes); seq_printf(seq, "Device: %s", vlan->real_dev->name); /* now show all PRIORITY mappings relating to this VLAN */ seq_printf(seq, "\nINGRESS priority mappings: " "0:%u 1:%u 2:%u 3:%u 4:%u 5:%u 6:%u 7:%u\n", vlan->ingress_priority_map[0], vlan->ingress_priority_map[1], vlan->ingress_priority_map[2], vlan->ingress_priority_map[3], vlan->ingress_priority_map[4], vlan->ingress_priority_map[5], vlan->ingress_priority_map[6], vlan->ingress_priority_map[7]); seq_printf(seq, " EGRESS priority mappings: "); for (i = 0; i < 16; i++) { const struct vlan_priority_tci_mapping *mp = vlan->egress_priority_map[i]; while (mp) { seq_printf(seq, "%u:%d ", mp->priority, ((mp->vlan_qos >> 13) & 0x7)); mp = mp->next; } } seq_puts(seq, "\n"); return 0; }
5152 8566 813 17149 7986 8092 455 789 91 83 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 // SPDX-License-Identifier: GPL-2.0-only /* * Netlink message type permission tables, for user generated messages. * * Author: James Morris <jmorris@redhat.com> * * Copyright (C) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> #include <linux/if.h> #include <linux/inet_diag.h> #include <linux/xfrm.h> #include <linux/audit.h> #include <linux/sock_diag.h> #include "flask.h" #include "av_permissions.h" #include "security.h" struct nlmsg_perm { u16 nlmsg_type; u32 perm; }; static const struct nlmsg_perm nlmsg_route_perms[] = { { RTM_NEWLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETLINK, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_SETLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_NEWADDR, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELADDR, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETADDR, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWROUTE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELROUTE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETROUTE, 
NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWRULE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELRULE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETRULE, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWQDISC, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELQDISC, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETQDISC, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWACTION, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELACTION, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETACTION, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWPREFIX, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETMULTICAST, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_GETANYCAST, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_GETNEIGHTBL, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_SETNEIGHTBL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_NEWADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_GETDCB, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_SETDCB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_NEWNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETMDB, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWNSID, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELNSID, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_GETNSID, 
NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_GETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_SETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_NEWCACHEREPORT, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_NEWVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETVLAN, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = { { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, { SOCK_DIAG_BY_FAMILY, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, { SOCK_DESTROY, NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE }, }; static const struct nlmsg_perm nlmsg_xfrm_perms[] = { { XFRM_MSG_NEWSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_DELSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_GETSA, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_NEWPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_DELPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_GETPOLICY, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_ALLOCSPI, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_ACQUIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_EXPIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_UPDPOLICY, 
NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_UPDSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_POLEXPIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_FLUSHSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_FLUSHPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_NEWAE, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_GETAE, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_REPORT, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_MIGRATE, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_NEWSADINFO, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_GETSADINFO, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_NEWSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_GETSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_MAPPING, NETLINK_XFRM_SOCKET__NLMSG_READ }, { XFRM_MSG_SETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_WRITE }, { XFRM_MSG_GETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_audit_perms[] = { { AUDIT_GET, NETLINK_AUDIT_SOCKET__NLMSG_READ }, { AUDIT_SET, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, { AUDIT_LIST, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV }, { AUDIT_ADD, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, { AUDIT_DEL, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, { AUDIT_LIST_RULES, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV }, { AUDIT_ADD_RULE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, { AUDIT_DEL_RULE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, { AUDIT_USER, NETLINK_AUDIT_SOCKET__NLMSG_RELAY }, { AUDIT_SIGNAL_INFO, NETLINK_AUDIT_SOCKET__NLMSG_READ }, { AUDIT_TRIM, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, { AUDIT_MAKE_EQUIV, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, { AUDIT_TTY_GET, NETLINK_AUDIT_SOCKET__NLMSG_READ }, { AUDIT_TTY_SET, NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT }, { AUDIT_GET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_READ }, { AUDIT_SET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, }; static int nlmsg_perm(u16 nlmsg_type, u32 *perm, const struct nlmsg_perm *tab, size_t tabsize) { unsigned int i; int err = -EINVAL; for (i = 0; i < tabsize / sizeof(struct nlmsg_perm); 
i++) if (nlmsg_type == tab[i].nlmsg_type) { *perm = tab[i].perm; err = 0; break; } return err; } int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) { /* While it is possible to add a similar permission to other netlink * classes, note that the extended permission value is matched against * the nlmsg_type field. Notably, SECCLASS_NETLINK_GENERIC_SOCKET uses * dynamic values for this field, which means that it cannot be added * as-is. */ switch (sclass) { case SECCLASS_NETLINK_ROUTE_SOCKET: /* RTM_MAX always points to RTM_SETxxxx, ie RTM_NEWxxx + 3. * If the BUILD_BUG_ON() below fails you must update the * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ BUILD_BUG_ON(RTM_MAX != (RTM_NEWTUNNEL + 3)); if (selinux_policycap_netlink_xperm()) { *perm = NETLINK_ROUTE_SOCKET__NLMSG; return 0; } return nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; case SECCLASS_NETLINK_TCPDIAG_SOCKET: if (selinux_policycap_netlink_xperm()) { *perm = NETLINK_TCPDIAG_SOCKET__NLMSG; return 0; } return nlmsg_perm(nlmsg_type, perm, nlmsg_tcpdiag_perms, sizeof(nlmsg_tcpdiag_perms)); break; case SECCLASS_NETLINK_XFRM_SOCKET: /* If the BUILD_BUG_ON() below fails you must update the * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! 
*/ BUILD_BUG_ON(XFRM_MSG_MAX != XFRM_MSG_GETDEFAULT); if (selinux_policycap_netlink_xperm()) { *perm = NETLINK_XFRM_SOCKET__NLMSG; return 0; } return nlmsg_perm(nlmsg_type, perm, nlmsg_xfrm_perms, sizeof(nlmsg_xfrm_perms)); break; case SECCLASS_NETLINK_AUDIT_SOCKET: if (selinux_policycap_netlink_xperm()) { *perm = NETLINK_AUDIT_SOCKET__NLMSG; return 0; } else if ((nlmsg_type >= AUDIT_FIRST_USER_MSG && nlmsg_type <= AUDIT_LAST_USER_MSG) || (nlmsg_type >= AUDIT_FIRST_USER_MSG2 && nlmsg_type <= AUDIT_LAST_USER_MSG2)) { *perm = NETLINK_AUDIT_SOCKET__NLMSG_RELAY; return 0; } return nlmsg_perm(nlmsg_type, perm, nlmsg_audit_perms, sizeof(nlmsg_audit_perms)); break; } /* No messaging from userspace, or class unknown/unhandled */ return -ENOENT; }
41 36 36 36 39 40 40 40 4 4 41 40 31 26 5 441 429 24 308 272 209 188 1 1 4 116 9 7 2 10 9 120 10 111 4 107 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 
501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/switchdev.h> #include "br_private.h" static struct static_key_false br_switchdev_tx_fwd_offload; static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p, const struct sk_buff *skb) { if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) return false; return (p->flags & 
BR_TX_FWD_OFFLOAD) && (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom); } bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) return false; return BR_INPUT_SKB_CB(skb)->tx_fwd_offload; } void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) { skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb); } /* Mark the frame for TX forwarding offload if this egress port supports it */ void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, struct sk_buff *skb) { if (nbp_switchdev_can_offload_tx_fwd(p, skb)) BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true; } /* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms * that the skb has been already forwarded to, to avoid further cloning to * other ports in the same hwdom by making nbp_switchdev_allowed_egress() * return false. */ void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, struct sk_buff *skb) { if (nbp_switchdev_can_offload_tx_fwd(p, skb)) set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms); } void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { if (p->hwdom) BR_INPUT_SKB_CB(skb)->src_hwdom = p->hwdom; } bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb) { struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb); return !test_bit(p->hwdom, &cb->fwd_hwdoms) && (!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom); } /* Flags that can be offloaded to hardware */ #define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | BR_PORT_MAB | \ BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_PORT_LOCKED | \ BR_HAIRPIN_MODE | BR_ISOLATED | BR_MULTICAST_TO_UNICAST) int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, unsigned long mask, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->dev, }; struct switchdev_notifier_port_attr_info info = { .attr = &attr, }; 
int err; mask &= BR_PORT_FLAGS_HW_OFFLOAD; if (!mask) return 0; attr.id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS; attr.u.brport_flags.val = flags; attr.u.brport_flags.mask = mask; /* We run from atomic context here */ err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev, &info.info, extack); err = notifier_to_errno(err); if (err == -EOPNOTSUPP) return 0; if (err) { NL_SET_ERR_MSG_WEAK_MOD(extack, "bridge flag offload is not supported"); return -EOPNOTSUPP; } attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS; attr.flags = SWITCHDEV_F_DEFER; err = switchdev_port_attr_set(p->dev, &attr, extack); if (err) { NL_SET_ERR_MSG_WEAK_MOD(extack, "error setting offload flag on port"); return err; } return 0; } static void br_switchdev_fdb_populate(struct net_bridge *br, struct switchdev_notifier_fdb_info *item, const struct net_bridge_fdb_entry *fdb, const void *ctx) { const struct net_bridge_port *p = READ_ONCE(fdb->dst); item->addr = fdb->key.addr.addr; item->vid = fdb->key.vlan_id; item->added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); item->offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); item->is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); item->locked = false; item->info.dev = (!p || item->is_local) ? br->dev : p->dev; item->info.ctx = ctx; } void br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type) { struct switchdev_notifier_fdb_info item; if (test_bit(BR_FDB_LOCKED, &fdb->flags)) return; /* Entries with these flags were created using ndm_state == NUD_REACHABLE, * ndm_flags == NTF_MASTER( | NTF_STICKY), ext_flags == 0 by something * equivalent to 'bridge fdb add ... master dynamic (sticky)'. * Drivers don't know how to deal with these, so don't notify them to * avoid confusing them. 
*/ if (test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags) && !test_bit(BR_FDB_STATIC, &fdb->flags) && !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) return; br_switchdev_fdb_populate(br, &item, fdb, NULL); switch (type) { case RTM_DELNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE, item.info.dev, &item.info, NULL); break; case RTM_NEWNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE, item.info.dev, &item.info, NULL); break; } } int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, bool changed, struct netlink_ext_ack *extack) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = flags, .vid = vid, .changed = changed, }; return switchdev_port_obj_add(dev, &v.obj, extack); } int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid = vid, }; return switchdev_port_obj_del(dev, &v.obj); } static int nbp_switchdev_hwdom_set(struct net_bridge_port *joining) { struct net_bridge *br = joining->br; struct net_bridge_port *p; int hwdom; /* joining is yet to be added to the port list. */ list_for_each_entry(p, &br->port_list, list) { if (netdev_phys_item_id_same(&joining->ppid, &p->ppid)) { joining->hwdom = p->hwdom; return 0; } } hwdom = find_next_zero_bit(&br->busy_hwdoms, BR_HWDOM_MAX, 1); if (hwdom >= BR_HWDOM_MAX) return -EBUSY; set_bit(hwdom, &br->busy_hwdoms); joining->hwdom = hwdom; return 0; } static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving) { struct net_bridge *br = leaving->br; struct net_bridge_port *p; /* leaving is no longer in the port list. 
*/ list_for_each_entry(p, &br->port_list, list) { if (p->hwdom == leaving->hwdom) return; } clear_bit(leaving->hwdom, &br->busy_hwdoms); } static int nbp_switchdev_add(struct net_bridge_port *p, struct netdev_phys_item_id ppid, bool tx_fwd_offload, struct netlink_ext_ack *extack) { int err; if (p->offload_count) { /* Prevent unsupported configurations such as a bridge port * which is a bonding interface, and the member ports are from * different hardware switches. */ if (!netdev_phys_item_id_same(&p->ppid, &ppid)) { NL_SET_ERR_MSG_MOD(extack, "Same bridge port cannot be offloaded by two physical switches"); return -EBUSY; } /* Tolerate drivers that call switchdev_bridge_port_offload() * more than once for the same bridge port, such as when the * bridge port is an offloaded bonding/team interface. */ p->offload_count++; return 0; } p->ppid = ppid; p->offload_count = 1; err = nbp_switchdev_hwdom_set(p); if (err) return err; if (tx_fwd_offload) { p->flags |= BR_TX_FWD_OFFLOAD; static_branch_inc(&br_switchdev_tx_fwd_offload); } return 0; } static void nbp_switchdev_del(struct net_bridge_port *p) { if (WARN_ON(!p->offload_count)) return; p->offload_count--; if (p->offload_count) return; if (p->hwdom) nbp_switchdev_hwdom_put(p); if (p->flags & BR_TX_FWD_OFFLOAD) { p->flags &= ~BR_TX_FWD_OFFLOAD; static_branch_dec(&br_switchdev_tx_fwd_offload); } } static int br_switchdev_fdb_replay_one(struct net_bridge *br, struct notifier_block *nb, const struct net_bridge_fdb_entry *fdb, unsigned long action, const void *ctx) { struct switchdev_notifier_fdb_info item; int err; br_switchdev_fdb_populate(br, &item, fdb, ctx); err = nb->notifier_call(nb, action, &item); return notifier_to_errno(err); } static int br_switchdev_fdb_replay(const struct net_device *br_dev, const void *ctx, bool adding, struct notifier_block *nb) { struct net_bridge_fdb_entry *fdb; struct net_bridge *br; unsigned long action; int err = 0; if (!nb) return 0; if (!netif_is_bridge_master(br_dev)) return -EINVAL; 
br = netdev_priv(br_dev); if (adding) action = SWITCHDEV_FDB_ADD_TO_DEVICE; else action = SWITCHDEV_FDB_DEL_TO_DEVICE; rcu_read_lock(); hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { err = br_switchdev_fdb_replay_one(br, nb, fdb, action, ctx); if (err) break; } rcu_read_unlock(); return err; } static int br_switchdev_vlan_attr_replay(struct net_device *br_dev, const void *ctx, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_attr_info attr_info = { .info = { .dev = br_dev, .extack = extack, .ctx = ctx, }, }; struct net_bridge *br = netdev_priv(br_dev); struct net_bridge_vlan_group *vg; struct switchdev_attr attr; struct net_bridge_vlan *v; int err; attr_info.attr = &attr; attr.orig_dev = br_dev; vg = br_vlan_group(br); if (!vg) return 0; list_for_each_entry(v, &vg->vlan_list, vlist) { if (v->msti) { attr.id = SWITCHDEV_ATTR_ID_VLAN_MSTI; attr.u.vlan_msti.vid = v->vid; attr.u.vlan_msti.msti = v->msti; err = nb->notifier_call(nb, SWITCHDEV_PORT_ATTR_SET, &attr_info); err = notifier_to_errno(err); if (err) return err; } } return 0; } static int br_switchdev_vlan_replay_one(struct notifier_block *nb, struct net_device *dev, struct switchdev_obj_port_vlan *vlan, const void *ctx, unsigned long action, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, .ctx = ctx, }, .obj = &vlan->obj, }; int err; err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } static int br_switchdev_vlan_replay_group(struct notifier_block *nb, struct net_device *dev, struct net_bridge_vlan_group *vg, const void *ctx, unsigned long action, struct netlink_ext_ack *extack) { struct net_bridge_vlan *v; int err = 0; u16 pvid; if (!vg) return 0; pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct switchdev_obj_port_vlan vlan = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = br_vlan_flags(v, pvid), .vid 
= v->vid, }; if (!br_vlan_should_use(v)) continue; err = br_switchdev_vlan_replay_one(nb, dev, &vlan, ctx, action, extack); if (err) return err; } return 0; } static int br_switchdev_vlan_replay(struct net_device *br_dev, const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(br_dev); struct net_bridge_port *p; unsigned long action; int err; ASSERT_RTNL(); if (!nb) return 0; if (!netif_is_bridge_master(br_dev)) return -EINVAL; if (adding) action = SWITCHDEV_PORT_OBJ_ADD; else action = SWITCHDEV_PORT_OBJ_DEL; err = br_switchdev_vlan_replay_group(nb, br_dev, br_vlan_group(br), ctx, action, extack); if (err) return err; list_for_each_entry(p, &br->port_list, list) { struct net_device *dev = p->dev; err = br_switchdev_vlan_replay_group(nb, dev, nbp_vlan_group(p), ctx, action, extack); if (err) return err; } if (adding) { err = br_switchdev_vlan_attr_replay(br_dev, ctx, nb, extack); if (err) return err; } return 0; } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct br_switchdev_mdb_complete_info { struct net_bridge_port *port; struct br_ip ip; }; static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *priv) { struct br_switchdev_mdb_complete_info *data = priv; struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; u8 old_flags; if (err == -EOPNOTSUPP) goto out_free; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &data->ip); if (!mp) goto out; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p->key.port != port) continue; old_flags = p->flags; br_multicast_set_pg_offload_flags(p, !err); if (br_mdb_should_notify(br, old_flags ^ p->flags)) br_mdb_flag_change_notify(br->dev, mp, p); } out: spin_unlock_bh(&br->multicast_lock); out_free: kfree(priv); } static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb 
*mdb, const struct net_bridge_mdb_entry *mp) { if (mp->addr.proto == htons(ETH_P_IP)) ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr); #if IS_ENABLED(CONFIG_IPV6) else if (mp->addr.proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr); #endif else ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr); mdb->vid = mp->addr.vid; } static void br_switchdev_host_mdb_one(struct net_device *dev, struct net_device *lower_dev, struct net_bridge_mdb_entry *mp, int type) { struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_HOST_MDB, .flags = SWITCHDEV_F_DEFER, .orig_dev = dev, }, }; br_switchdev_mdb_populate(&mdb, mp); switch (type) { case RTM_NEWMDB: switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); break; case RTM_DELMDB: switchdev_port_obj_del(lower_dev, &mdb.obj); break; } } static void br_switchdev_host_mdb(struct net_device *dev, struct net_bridge_mdb_entry *mp, int type) { struct net_device *lower_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, lower_dev, iter) br_switchdev_host_mdb_one(dev, lower_dev, mp, type); } static int br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, const struct switchdev_obj_port_mdb *mdb, unsigned long action, const void *ctx, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, .ctx = ctx, }, .obj = &mdb->obj, }; int err; err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } static int br_switchdev_mdb_queue_one(struct list_head *mdb_list, struct net_device *dev, unsigned long action, enum switchdev_obj_id id, const struct net_bridge_mdb_entry *mp, struct net_device *orig_dev) { struct switchdev_obj_port_mdb mdb = { .obj = { .id = id, .orig_dev = orig_dev, }, }; struct switchdev_obj_port_mdb *pmdb; br_switchdev_mdb_populate(&mdb, mp); if (action == SWITCHDEV_PORT_OBJ_ADD && switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) { /* This event is already in the 
deferred queue of * events, so this replay must be elided, lest the * driver receives duplicate events for it. This can * only happen when replaying additions, since * modifications are always immediately visible in * br->mdb_list, whereas actual event delivery may be * delayed. */ return 0; } pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC); if (!pmdb) return -ENOMEM; list_add_tail(&pmdb->obj.list, mdb_list); return 0; } void br_switchdev_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct br_switchdev_mdb_complete_info *complete_info; struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_PORT_MDB, .flags = SWITCHDEV_F_DEFER, }, }; if (!pg) return br_switchdev_host_mdb(dev, mp, type); br_switchdev_mdb_populate(&mdb, mp); mdb.obj.orig_dev = pg->key.port->dev; switch (type) { case RTM_NEWMDB: complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); if (!complete_info) break; complete_info->port = pg->key.port; complete_info->ip = mp->addr; mdb.obj.complete_priv = complete_info; mdb.obj.complete = br_switchdev_mdb_complete; if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL)) kfree(complete_info); break; case RTM_DELMDB: switchdev_port_obj_del(pg->key.port->dev, &mdb.obj); break; } } #endif static int br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev, const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING const struct net_bridge_mdb_entry *mp; struct switchdev_obj *obj, *tmp; struct net_bridge *br; unsigned long action; LIST_HEAD(mdb_list); int err = 0; ASSERT_RTNL(); if (!nb) return 0; if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) return -EINVAL; br = netdev_priv(br_dev); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; if (adding) action = SWITCHDEV_PORT_OBJ_ADD; else action = SWITCHDEV_PORT_OBJ_DEL; /* br_switchdev_mdb_queue_one() will take care to 
not queue a * replay of an event that is already pending in the switchdev * deferred queue. In order to safely determine that, there * must be no new deferred MDB notifications enqueued for the * duration of the MDB scan. Therefore, grab the write-side * lock to avoid racing with any concurrent IGMP/MLD snooping. */ spin_lock_bh(&br->multicast_lock); hlist_for_each_entry(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group __rcu * const *pp; const struct net_bridge_port_group *p; if (mp->host_joined) { err = br_switchdev_mdb_queue_one(&mdb_list, dev, action, SWITCHDEV_OBJ_ID_HOST_MDB, mp, br_dev); if (err) { spin_unlock_bh(&br->multicast_lock); goto out_free_mdb; } } for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p->key.port->dev != dev) continue; err = br_switchdev_mdb_queue_one(&mdb_list, dev, action, SWITCHDEV_OBJ_ID_PORT_MDB, mp, dev); if (err) { spin_unlock_bh(&br->multicast_lock); goto out_free_mdb; } } } spin_unlock_bh(&br->multicast_lock); list_for_each_entry(obj, &mdb_list, list) { err = br_switchdev_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), action, ctx, extack); if (err == -EOPNOTSUPP) err = 0; if (err) goto out_free_mdb; } out_free_mdb: list_for_each_entry_safe(obj, tmp, &mdb_list, list) { list_del(&obj->list); kfree(SWITCHDEV_OBJ_PORT_MDB(obj)); } if (err) return err; #endif return 0; } static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack) { struct net_device *br_dev = p->br->dev; struct net_device *dev = p->dev; int err; err = br_switchdev_vlan_replay(br_dev, ctx, true, blocking_nb, extack); if (err && err != -EOPNOTSUPP) return err; err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack); if (err) { /* -EOPNOTSUPP not propagated from MDB replay. 
*/ return err; } err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb); if (err && err != -EOPNOTSUPP) return err; return 0; } static void nbp_switchdev_unsync_objs(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb) { struct net_device *br_dev = p->br->dev; struct net_device *dev = p->dev; br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb); br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL); br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL); /* Make sure that the device leaving this bridge has seen all * relevant events before it is disassociated. In the normal * case, when the device is directly attached to the bridge, * this is covered by del_nbp(). If the association was indirect * however, e.g. via a team or bond, and the device is leaving * that intermediate device, then the bridge port remains in * place. */ switchdev_deferred_process(); } /* Let the bridge know that this port is offloaded, so that it can assign a * switchdev hardware domain to it. 
*/
/*
 * Steps, in order: look up the parent ID of @dev, register the port with
 * nbp_switchdev_add(), then replay the bridge objects to the notifier blocks
 * through nbp_switchdev_sync_objs(). A failure of the first two steps is
 * returned as-is; a failure of the replay additionally undoes the
 * registration with nbp_switchdev_del() before the error is returned.
 *
 * Returns 0 on success or the error of the step that failed.
 */
int br_switchdev_port_offload(struct net_bridge_port *p,
			      struct net_device *dev, const void *ctx,
			      struct notifier_block *atomic_nb,
			      struct notifier_block *blocking_nb,
			      bool tx_fwd_offload,
			      struct netlink_ext_ack *extack)
{
	struct netdev_phys_item_id parent_id;
	int rc;

	rc = dev_get_port_parent_id(dev, &parent_id, false);
	if (rc)
		return rc;

	rc = nbp_switchdev_add(p, parent_id, tx_fwd_offload, extack);
	if (rc)
		return rc;

	rc = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack);
	if (rc)
		/* Replay failed: roll back the registration done above. */
		nbp_switchdev_del(p);

	return rc;
}

/*
 * Counterpart of br_switchdev_port_offload(): first replay the objects as
 * deletions (nbp_switchdev_unsync_objs()), then drop the switchdev
 * registration of the port.
 */
void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
				 struct notifier_block *atomic_nb,
				 struct notifier_block *blocking_nb)
{
	nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb);
	nbp_switchdev_del(p);
}

/*
 * Replay the bridge objects for an already registered port. This only
 * forwards to nbp_switchdev_sync_objs(); @dev is not used by this function.
 *
 * Returns whatever nbp_switchdev_sync_objs() returns.
 */
int br_switchdev_port_replay(struct net_bridge_port *p,
			     struct net_device *dev, const void *ctx,
			     struct notifier_block *atomic_nb,
			     struct notifier_block *blocking_nb,
			     struct netlink_ext_ack *extack)
{
	int rc = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack);

	return rc;
}
69 69 69 69 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 /* * Copyright (c) 2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

#include <linux/completion.h>
#include <linux/dma-mapping.h>
#include <linux/err.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/random.h>

#include <rdma/ib_cache.h>
#include "sa.h"

static int mcast_add_one(struct ib_device *device);
static void mcast_remove_one(struct ib_device *device, void *client_data);

/*
 * IB client descriptor. It is registered in mcast_init(), unregistered in
 * mcast_cleanup(), and is the key under which the per-device
 * struct mcast_device is stored with ib_set_client_data().
 */
static struct ib_client mcast_client = {
	.name   = "ib_multicast",
	.add    = mcast_add_one,
	.remove = mcast_remove_one
};

/* SA client handle passed to every ib_sa_mcmember_rec_query() call. */
static struct ib_sa_client	sa_client;
/* Workqueue (allocated in mcast_init()) on which group work items run. */
static struct workqueue_struct	*mcast_wq;
/* Never written in this file, so it stays all-zero; MGIDs are compared to it. */
static union ib_gid mgid0;

struct mcast_device;

/*
 * Per-port state.
 *
 * @dev:      owning per-device structure
 * @lock:     held around every lookup, insert and erase on @table
 * @table:    rb-tree of struct mcast_group, ordered by memcmp() of the MGID
 * @refcount: set to 1 in mcast_add_one(), incremented for each group that
 *            acquire_group() inserts, dropped through deref_port()
 * @comp:     completed by deref_port() when @refcount reaches zero
 * @port_num: port number passed to the SA and P_Key lookups
 */
struct mcast_port {
	struct mcast_device	*dev;
	spinlock_t		lock;
	struct rb_root		table;
	refcount_t		refcount;
	struct completion	comp;
	u32			port_num;
};

/* Per-device state; port[] is indexed by (port number - start_port). */
struct mcast_device {
	struct ib_device	*device;
	struct ib_event_handler	event_handler;
	int			start_port;
	int			end_port;
	struct mcast_port	port[];
};

/*
 * State of one member: MCAST_JOINING is set when the member is created,
 * MCAST_MEMBER by join_group(), MCAST_ERROR by process_group_error().
 */
enum mcast_state {
	MCAST_JOINING,
	MCAST_MEMBER,
	MCAST_ERROR,
};

/*
 * State of one group. MCAST_IDLE means no work is queued for the group;
 * MCAST_BUSY is set whenever its work item is queued or running.
 * MCAST_GROUP_ERROR and MCAST_PKEY_EVENT are set by mcast_groups_event() and
 * consumed by process_group_error().
 */
enum mcast_group_state {
	MCAST_IDLE,
	MCAST_BUSY,
	MCAST_GROUP_ERROR,
	MCAST_PKEY_EVENT
};

/* Initial pkey_index of a group, and the value used when ib_find_pkey() fails. */
enum {
	MCAST_INVALID_PKEY_INDEX = 0xFFFF
};

struct mcast_member;

/*
 * One multicast group on one port.
 *
 * @rec:          membership record; replaced by the SA reply in join_handler()
 * @node:         linkage in mcast_port.table
 * @lock:         protects the two member lists, @state and @members
 * @work:         work item whose function is mcast_work_handler()
 * @pending_list: members queued by queue_join(), processed head first
 * @active_list:  members moved here by join_group()
 * @last_join:    member for which send_join() issued the latest join
 * @members:      per join-state-bit member counters (adjust_membership())
 * @refcount:     the group is erased and freed when this drops to zero
 * @query:        handle filled in by ib_sa_mcmember_rec_query()
 * @leave_state:  join-state bits of the latest leave request
 * @retries:      remaining leave retries, initialised to 3 in acquire_group()
 */
struct mcast_group {
	struct ib_sa_mcmember_rec rec;
	struct rb_node		node;
	struct mcast_port	*port;
	spinlock_t		lock;
	struct work_struct	work;
	struct list_head	pending_list;
	struct list_head	active_list;
	struct mcast_member	*last_join;
	int			members[NUM_JOIN_MEMBERSHIP_TYPES];
	atomic_t		refcount;
	enum mcast_group_state	state;
	struct ib_sa_query	*query;
	u16			pkey_index;
	u8			leave_state;
	int			retries;
};

/*
 * One user of a group. @multicast is the handle returned to the caller of
 * ib_sa_join_multicast(); the member is recovered from it with container_of().
 * @comp is completed when @refcount drops to zero, which is what
 * ib_sa_free_multicast() waits for before freeing the member.
 */
struct mcast_member {
	struct ib_sa_multicast	multicast;
	struct ib_sa_client	*client;
	struct mcast_group	*group;
	struct list_head	list;
	enum mcast_state	state;
	refcount_t		refcount;
	struct completion	comp;
};

static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
			 void *context);
static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
			  void *context);

/*
 * Look up the group with the given MGID in the port's rb-tree.
 * Returns the group, or NULL when no node compares equal. The callers in
 * this file (acquire_group(), ib_sa_get_mcmember_rec()) hold port->lock.
 */
static struct mcast_group *mcast_find(struct mcast_port *port,
				      union ib_gid *mgid)
{
	struct rb_node *node = port->table.rb_node;
	struct mcast_group *group;
	int ret;

	while (node) {
		group = rb_entry(node, struct mcast_group, node);
		ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
		if (!ret)
			return group;

		if (ret < 0)
			node = node->rb_left;
		else
			node = node->rb_right;
	}
	return NULL;
}

/*
 * Insert @group into the port's rb-tree, keyed by its MGID.
 * When a node with an equal MGID already exists and @allow_duplicates is
 * zero, nothing is inserted and the existing group is returned. Otherwise
 * the group is linked (an equal key descends to the left) and NULL is
 * returned. The callers in this file hold port->lock.
 */
static struct mcast_group *mcast_insert(struct mcast_port *port,
					struct mcast_group *group,
					int allow_duplicates)
{
	struct rb_node **link = &port->table.rb_node;
	struct rb_node *parent = NULL;
	struct mcast_group *cur_group;
	int ret;

	while (*link) {
		parent = *link;
		cur_group = rb_entry(parent, struct mcast_group, node);

		ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
			     sizeof group->rec.mgid);
		if (ret < 0)
			link = &(*link)->rb_left;
		else if (ret > 0)
			link = &(*link)->rb_right;
		else if (allow_duplicates)
			link = &(*link)->rb_left;
		else
			return cur_group;
	}
	rb_link_node(&group->node, parent, link);
	rb_insert_color(&group->node, &port->table);
	return NULL;
}

/* Drop a port reference; the last one completes port->comp. */
static void deref_port(struct mcast_port *port)
{
	if (refcount_dec_and_test(&port->refcount))
		complete(&port->comp);
}

/*
 * Drop a group reference. The decrement is done under port->lock so that the
 * group is erased from the tree in the same critical section in which its
 * count reaches zero; the group is then freed and the port reference taken
 * when it was inserted is dropped.
 */
static void release_group(struct mcast_group *group)
{
	struct mcast_port *port = group->port;
	unsigned long flags;

	spin_lock_irqsave(&port->lock, flags);
	if (atomic_dec_and_test(&group->refcount)) {
		rb_erase(&group->node, &port->table);
		spin_unlock_irqrestore(&port->lock, flags);
		kfree(group);
		deref_port(port);
	} else
		spin_unlock_irqrestore(&port->lock, flags);
}

/* Drop a member reference; the last one completes member->comp. */
static void deref_member(struct mcast_member *member)
{
	if (refcount_dec_and_test(&member->refcount))
		complete(&member->comp);
}

/*
 * Append @member to its group's pending list. If the group is idle it is
 * marked busy, an extra group reference is taken for the work item, and the
 * work item is queued on mcast_wq.
 */
static void queue_join(struct mcast_member *member)
{
	struct mcast_group *group = member->group;
	unsigned long flags;

	spin_lock_irqsave(&group->lock, flags);
	list_add_tail(&member->list, &group->pending_list);
	if (group->state == MCAST_IDLE) {
		group->state = MCAST_BUSY;
		atomic_inc(&group->refcount);
		queue_work(mcast_wq, &group->work);
	}
	spin_unlock_irqrestore(&group->lock, flags);
}

/*
 * A
multicast group has four types of members: full member, non member,
 * sendonly non member and sendonly full member.
 * We need to keep track of the number of members of each
 * type based on their join state. Adjust the number of members the belong to
 * the specified join states.
 */
static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
{
	int i;

	/* Bit i of join_state selects counter members[i]. */
	for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1)
		if (join_state & 0x1)
			group->members[i] += inc;
}

/*
 * If a multicast group has zero members left for a particular join state, but
 * the group is still a member with the SA, we need to leave that join state.
 * Determine which join states we still belong to, but that do not have any
 * active members.
 */
static u8 get_leave_state(struct mcast_group *group)
{
	u8 leave_state = 0;
	int i;

	for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++)
		if (!group->members[i])
			leave_state |= (0x1 << i);

	return leave_state & group->rec.join_state;
}

/*
 * Check one selector/value pair.
 *
 * Returns 0 when the pair is not fully selected in @comp_mask (either the
 * selector bit or the value bit is missing), when @selector is not one of
 * the three handled values, or when the relation holds. Returns non-zero when
 * it is violated: IB_SA_GT requires src_value > dst_value, IB_SA_LT requires
 * src_value < dst_value, IB_SA_EQ requires equality.
 */
static int check_selector(ib_sa_comp_mask comp_mask,
			  ib_sa_comp_mask selector_mask,
			  ib_sa_comp_mask value_mask,
			  u8 selector, u8 src_value, u8 dst_value)
{
	int err;

	if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
		return 0;

	switch (selector) {
	case IB_SA_GT:
		err = (src_value <= dst_value);
		break;
	case IB_SA_LT:
		err = (src_value >= dst_value);
		break;
	case IB_SA_EQ:
		err = (src_value != dst_value);
		break;
	default:
		err = 0;
		break;
	}

	return err;
}

/*
 * Compare the fields selected by @comp_mask between @src and @dst. The only
 * caller, mcast_work_handler(), passes the group's record as @src and the
 * requesting member's record as @dst; the selectors are taken from @dst.
 *
 * Returns 0 when every selected field is compatible, -EINVAL at the first
 * one that is not.
 */
static int cmp_rec(struct ib_sa_mcmember_rec *src,
		   struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
{
	/* MGID must already match */

	if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
	    memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
		return -EINVAL;
	if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
		return -EINVAL;
	if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
		return -EINVAL;
	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
			   IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
			   src->mtu, dst->mtu))
		return -EINVAL;
	if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
	    src->traffic_class != dst->traffic_class)
		return -EINVAL;
	if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
		return -EINVAL;
	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
			   IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
			   src->rate, dst->rate))
		return -EINVAL;
	if (check_selector(comp_mask,
			   IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
			   IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
			   dst->packet_life_time_selector,
			   src->packet_life_time, dst->packet_life_time))
		return -EINVAL;
	if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
		return -EINVAL;
	if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
	    src->flow_label != dst->flow_label)
		return -EINVAL;
	if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
	    src->hop_limit != dst->hop_limit)
		return -EINVAL;
	if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
		return -EINVAL;

	/* join_state checked separately, proxy_join ignored */

	return 0;
}

/*
 * Submit an IB_MGMT_METHOD_SET query built from @member's record and
 * component mask; join_handler() is the completion callback and receives
 * @group as its context. @member is remembered in group->last_join.
 * The literal 3000 is the timeout argument (presumably milliseconds;
 * confirm against ib_sa_mcmember_rec_query()).
 *
 * A positive return of the query call is mapped to 0; any other value is
 * returned unchanged.
 */
static int send_join(struct mcast_group *group, struct mcast_member *member)
{
	struct mcast_port *port = group->port;
	int ret;

	group->last_join = member;
	ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
				       port->port_num, IB_MGMT_METHOD_SET,
				       &member->multicast.rec,
				       member->multicast.comp_mask,
				       3000, GFP_KERNEL, join_handler, group,
				       &group->query);
	return (ret > 0) ? 0 : ret;
}

/*
 * Submit an IB_SA_METHOD_DELETE query for the join-state bits in
 * @leave_state, using a copy of the group's record in which only MGID,
 * port GID and join state are selected. leave_handler() is the completion
 * callback. @leave_state is stored in the group so that leave_handler() can
 * resend the same request.
 *
 * A positive return of the query call is mapped to 0; any other value is
 * returned unchanged.
 */
static int send_leave(struct mcast_group *group, u8 leave_state)
{
	struct mcast_port *port = group->port;
	struct ib_sa_mcmember_rec rec;
	int ret;

	rec = group->rec;
	rec.join_state = leave_state;
	group->leave_state = leave_state;

	ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
				       port->port_num, IB_SA_METHOD_DELETE, &rec,
				       IB_SA_MCMEMBER_REC_MGID     |
				       IB_SA_MCMEMBER_REC_PORT_GID |
				       IB_SA_MCMEMBER_REC_JOIN_STATE,
				       3000, GFP_KERNEL, leave_handler,
				       group, &group->query);
	return (ret > 0) ? 0 : ret;
}

/*
 * Make @member an active member of @group: bump the counters for
 * @join_state, add those bits to the group's join state, hand the member a
 * copy of the group's record carrying only its own join state, and move it
 * to the active list. Called from mcast_work_handler() with group->lock held.
 */
static void join_group(struct mcast_group *group, struct mcast_member *member,
		       u8 join_state)
{
	member->state = MCAST_MEMBER;
	adjust_membership(group, join_state, 1);
	group->rec.join_state |= join_state;
	member->multicast.rec = group->rec;
	member->multicast.rec.join_state = join_state;
	list_move(&member->list, &group->active_list);
}

/*
 * Unlink @member from whatever list it is on and report @status through the
 * user callback, which is invoked without group->lock held.
 * Returns the callback's return value.
 */
static int fail_join(struct mcast_group *group, struct mcast_member *member,
		     int status)
{
	spin_lock_irq(&group->lock);
	list_del_init(&member->list);
	spin_unlock_irq(&group->lock);
	return member->multicast.callback(status, &member->multicast);
}

/*
 * Handle a group whose state is not MCAST_BUSY (called from
 * mcast_work_handler()).
 *
 * For MCAST_PKEY_EVENT the P_Key of the group's record is looked up first;
 * if the lookup succeeds and yields the index already stored in the group,
 * no member is touched. In every other case each active member is removed,
 * marked MCAST_ERROR and notified with -ENETRESET, and the group's join
 * state is cleared. The group always ends up in MCAST_BUSY.
 */
static void process_group_error(struct mcast_group *group)
{
	struct mcast_member *member;
	int ret = 0;
	u16 pkey_index;

	if (group->state == MCAST_PKEY_EVENT)
		ret = ib_find_pkey(group->port->dev->device,
				   group->port->port_num,
				   be16_to_cpu(group->rec.pkey), &pkey_index);

	spin_lock_irq(&group->lock);
	if (group->state == MCAST_PKEY_EVENT && !ret &&
	    group->pkey_index == pkey_index)
		goto out;

	while (!list_empty(&group->active_list)) {
		member = list_entry(group->active_list.next,
				    struct mcast_member, list);
		/* Keep the member alive while the lock is dropped for the callback. */
		refcount_inc(&member->refcount);
		list_del_init(&member->list);
		adjust_membership(group, member->multicast.rec.join_state, -1);
		member->state = MCAST_ERROR;
		spin_unlock_irq(&group->lock);

		ret = member->multicast.callback(-ENETRESET,
						 &member->multicast);
		deref_member(member);
		/* A non-zero callback result makes this code free the member. */
		if (ret)
			ib_sa_free_multicast(&member->multicast);
		spin_lock_irq(&group->lock);
	}

	group->rec.join_state = 0;
out:
	group->state = MCAST_BUSY;
	spin_unlock_irq(&group->lock);
}

/*
 * Work function of a group; also called directly by join_handler() and
 * leave_handler() to continue after an SA reply.
 *
 * While members are pending or the group is not MCAST_BUSY:
 *  - a non-BUSY state is handled by process_group_error() and the loop
 *    restarts;
 *  - if every join-state bit requested by the head member is already set in
 *    the group's record, the request is checked with cmp_rec() and completed
 *    through the user callback without contacting the SA;
 *  - otherwise a join is sent. If it was submitted the function returns and
 *    processing resumes in join_handler(); if not, the member is failed.
 *
 * Once nothing is pending, join states without members are left via
 * send_leave() (processing resumes in leave_handler(), or restarts here if
 * the submission failed). If there is nothing to leave, the group becomes
 * idle and one group reference is dropped.
 */
static void mcast_work_handler(struct work_struct *work)
{
	struct mcast_group *group;
	struct mcast_member *member;
	struct ib_sa_multicast *multicast;
	int status, ret;
	u8 join_state;

	group = container_of(work, typeof(*group), work);
retest:
	spin_lock_irq(&group->lock);
	while (!list_empty(&group->pending_list) ||
	       (group->state != MCAST_BUSY)) {

		if (group->state != MCAST_BUSY) {
			spin_unlock_irq(&group->lock);
			process_group_error(group);
			goto retest;
		}

		member = list_entry(group->pending_list.next,
				    struct mcast_member, list);
		multicast = &member->multicast;
		join_state = multicast->rec.join_state;
		/* Hold the member across the unlocked callback / SA call. */
		refcount_inc(&member->refcount);

		if (join_state == (group->rec.join_state & join_state)) {
			status = cmp_rec(&group->rec, &multicast->rec,
					 multicast->comp_mask);
			if (!status)
				join_group(group, member, join_state);
			else
				list_del_init(&member->list);
			spin_unlock_irq(&group->lock);
			ret = multicast->callback(status, multicast);
		} else {
			spin_unlock_irq(&group->lock);
			status = send_join(group, member);
			if (!status) {
				deref_member(member);
				return;
			}
			ret = fail_join(group, member, status);
		}

		deref_member(member);
		if (ret)
			ib_sa_free_multicast(&member->multicast);
		spin_lock_irq(&group->lock);
	}

	join_state = get_leave_state(group);
	if (join_state) {
		group->rec.join_state &= ~join_state;
		spin_unlock_irq(&group->lock);
		if (send_leave(group, join_state))
			goto retest;
	} else {
		group->state = MCAST_IDLE;
		spin_unlock_irq(&group->lock);
		release_group(group);
	}
}

/*
 * Fail a join request if it is still active - at the head of the pending queue.
*/
static void process_join_error(struct mcast_group *group, int status)
{
	struct mcast_member *member;
	int ret;

	spin_lock_irq(&group->lock);
	member = list_entry(group->pending_list.next,
			    struct mcast_member, list);
	/* Only act if the head entry is the member the last join was sent for. */
	if (group->last_join == member) {
		refcount_inc(&member->refcount);
		list_del_init(&member->list);
		spin_unlock_irq(&group->lock);
		ret = member->multicast.callback(status, &member->multicast);
		deref_member(member);
		if (ret)
			ib_sa_free_multicast(&member->multicast);
	} else
		spin_unlock_irq(&group->lock);
}

/*
 * Completion callback of the query issued by send_join(); @context is the
 * group.
 *
 * On error the head join request is failed. On success the P_Key index of
 * the returned record is looked up (falling back to
 * MCAST_INVALID_PKEY_INDEX), then under port->lock: the index is stored if
 * the group is busy and has none yet, the group's record is replaced by the
 * reply, and if the MGID in the reply differs from the stored one the group
 * is re-keyed in the port's tree (presumably because the SA assigned an MGID
 * to a request made with the zero MGID; confirm). In both cases the state
 * machine is then continued.
 */
static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
			 void *context)
{
	struct mcast_group *group = context;
	u16 pkey_index = MCAST_INVALID_PKEY_INDEX;

	if (status)
		process_join_error(group, status);
	else {
		int mgids_changed, is_mgid0;

		if (ib_find_pkey(group->port->dev->device,
				 group->port->port_num, be16_to_cpu(rec->pkey),
				 &pkey_index))
			pkey_index = MCAST_INVALID_PKEY_INDEX;

		spin_lock_irq(&group->port->lock);
		if (group->state == MCAST_BUSY &&
		    group->pkey_index == MCAST_INVALID_PKEY_INDEX)
			group->pkey_index = pkey_index;
		mgids_changed = memcmp(&rec->mgid, &group->rec.mgid,
				       sizeof(group->rec.mgid));
		group->rec = *rec;
		if (mgids_changed) {
			rb_erase(&group->node, &group->port->table);
			is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
					   sizeof(mgid0));
			mcast_insert(group->port, group, is_mgid0);
		}
		spin_unlock_irq(&group->port->lock);
	}
	mcast_work_handler(&group->work);
}

/*
 * Completion callback of the query issued by send_leave(); @context is the
 * group.
 *
 * If the leave failed and retries remain, the same leave is sent again; the
 * retry counter is decremented only when that resubmission was accepted. In
 * every other case (success, no retries left, or resubmission failed) the
 * state machine is continued.
 */
static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
			  void *context)
{
	struct mcast_group *group = context;

	if (status && group->retries > 0 &&
	    !send_leave(group, group->leave_state))
		group->retries--;
	else
		mcast_work_handler(&group->work);
}

/*
 * Find or create the group for @mgid on @port and take a reference on it.
 *
 * For the zero MGID the lookup is skipped: a new group is always allocated
 * and inserted with duplicates allowed. For any other MGID an existing group
 * is reused. The allocation is done without port->lock held, so the insert
 * may find that an equal group appeared in the meantime; the new allocation
 * is then freed and the existing group used. A newly inserted group takes a
 * reference on the port.
 *
 * Returns the group, or NULL if the allocation failed.
 */
static struct mcast_group *acquire_group(struct mcast_port *port,
					 union ib_gid *mgid, gfp_t gfp_mask)
{
	struct mcast_group *group, *cur_group;
	unsigned long flags;
	int is_mgid0;

	is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
	if (!is_mgid0) {
		spin_lock_irqsave(&port->lock, flags);
		group = mcast_find(port, mgid);
		if (group)
			goto found;
		spin_unlock_irqrestore(&port->lock, flags);
	}

	group = kzalloc(sizeof *group, gfp_mask);
	if (!group)
		return NULL;

	group->retries = 3;
	group->port = port;
	group->rec.mgid = *mgid;
	group->pkey_index = MCAST_INVALID_PKEY_INDEX;
	INIT_LIST_HEAD(&group->pending_list);
	INIT_LIST_HEAD(&group->active_list);
	INIT_WORK(&group->work, mcast_work_handler);
	spin_lock_init(&group->lock);

	spin_lock_irqsave(&port->lock, flags);
	cur_group = mcast_insert(port, group, is_mgid0);
	if (cur_group) {
		kfree(group);
		group = cur_group;
	} else
		refcount_inc(&port->refcount);
found:
	/* Reached with port->lock held on both the lookup and the insert path. */
	atomic_inc(&group->refcount);
	spin_unlock_irqrestore(&port->lock, flags);
	return group;
}

/*
 * We serialize all join requests to a single group to make our lives much
 * easier.  Otherwise, two users could try to join the same group
 * simultaneously, with different configurations, one could leave while the
 * join is in progress, etc., which makes locking around error recovery
 * difficult.
 *
 * Returns the multicast handle on success. Returns ERR_PTR(-ENODEV) when the
 * device has no client data for this client and ERR_PTR(-ENOMEM) when the
 * member or its group cannot be allocated. The outcome of the join itself is
 * reported through @callback.
 */
struct ib_sa_multicast *
ib_sa_join_multicast(struct ib_sa_client *client,
		     struct ib_device *device, u32 port_num,
		     struct ib_sa_mcmember_rec *rec,
		     ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
		     int (*callback)(int status,
				     struct ib_sa_multicast *multicast),
		     void *context)
{
	struct mcast_device *dev;
	struct mcast_member *member;
	struct ib_sa_multicast *multicast;
	int ret;

	dev = ib_get_client_data(device, &mcast_client);
	if (!dev)
		return ERR_PTR(-ENODEV);

	member = kmalloc(sizeof *member, gfp_mask);
	if (!member)
		return ERR_PTR(-ENOMEM);

	ib_sa_client_get(client);
	member->client = client;
	member->multicast.rec = *rec;
	member->multicast.comp_mask = comp_mask;
	member->multicast.callback = callback;
	member->multicast.context = context;
	init_completion(&member->comp);
	refcount_set(&member->refcount, 1);
	member->state = MCAST_JOINING;

	member->group = acquire_group(&dev->port[port_num - dev->start_port],
				      &rec->mgid, gfp_mask);
	if (!member->group) {
		ret = -ENOMEM;
		goto err;
	}

	/*
	 * The user will get the multicast structure in their callback.  They
	 * could then free the multicast structure before we can return from
	 * this routine.  So we save the pointer to return before queuing
	 * any callback.
	 */
	multicast = &member->multicast;
	queue_join(member);
	return multicast;

err:
	ib_sa_client_put(client);
	kfree(member);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_sa_join_multicast);

/*
 * Release a handle obtained from ib_sa_join_multicast().
 *
 * The member is unlinked from its list and, if it had joined, its join
 * states are subtracted from the group's counters. An idle group is marked
 * busy and its work item queued, with the member's group reference carried
 * over to that work; otherwise the group reference is dropped here. Finally
 * the member's initial reference is dropped and the function waits until
 * all remaining references are gone before freeing the member.
 */
void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
{
	struct mcast_member *member;
	struct mcast_group *group;

	member = container_of(multicast, struct mcast_member, multicast);
	group = member->group;

	spin_lock_irq(&group->lock);
	if (member->state == MCAST_MEMBER)
		adjust_membership(group, multicast->rec.join_state, -1);

	list_del_init(&member->list);

	if (group->state == MCAST_IDLE) {
		group->state = MCAST_BUSY;
		spin_unlock_irq(&group->lock);
		/* Continue to hold reference on group until callback */
		queue_work(mcast_wq, &group->work);
	} else {
		spin_unlock_irq(&group->lock);
		release_group(group);
	}

	deref_member(member);
	wait_for_completion(&member->comp);
	ib_sa_client_put(member->client);
	kfree(member);
}
EXPORT_SYMBOL(ib_sa_free_multicast);

/*
 * Copy the record currently stored for the group with MGID @mgid on
 * @port_num into @rec.
 *
 * Returns 0 on success, -ENODEV when the device has no client data for this
 * client, and -EADDRNOTAVAIL when no such group is in the port's tree.
 */
int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num,
			   union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
{
	struct mcast_device *dev;
	struct mcast_port *port;
	struct mcast_group *group;
	unsigned long flags;
	int ret = 0;

	dev = ib_get_client_data(device, &mcast_client);
	if (!dev)
		return -ENODEV;

	port = &dev->port[port_num - dev->start_port];
	spin_lock_irqsave(&port->lock, flags);
	group = mcast_find(port, mgid);
	if (group)
		*rec = group->rec;
	else
		ret = -EADDRNOTAVAIL;
	spin_unlock_irqrestore(&port->lock, flags);

	return ret;
}
EXPORT_SYMBOL(ib_sa_get_mcmember_rec);

/**
 * ib_init_ah_from_mcmember - Initialize AH attribute from multicast
 * member record and gid of the device.
* @device:	RDMA device
 * @port_num:	Port of the rdma device to consider
 * @rec:	Multicast member record to use
 * @ndev:	Optional netdevice, applicable only for RoCE
 * @gid_type:	GID type to consider
 * @ah_attr:	AH attribute to fillup on successful completion
 *
 * ib_init_ah_from_mcmember() initializes AH attribute based on multicast
 * member record and other device properties. On success the caller is
 * responsible to call rdma_destroy_ah_attr on the ah_attr. Returns 0 on
 * success or appropriate error code.
 *
 */
int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num,
			     struct ib_sa_mcmember_rec *rec,
			     struct net_device *ndev,
			     enum ib_gid_type gid_type,
			     struct rdma_ah_attr *ah_attr)
{
	const struct ib_gid_attr *sgid_attr;

	/* GID table is not based on the netdevice for IB link layer,
	 * so ignore ndev during search.
	 */
	if (rdma_protocol_ib(device, port_num))
		ndev = NULL;
	else if (!rdma_protocol_roce(device, port_num))
		return -EINVAL;	/* neither IB nor RoCE on this port */

	sgid_attr = rdma_find_gid_by_port(device, &rec->port_gid,
					  gid_type, port_num, ndev);
	if (IS_ERR(sgid_attr))
		return PTR_ERR(sgid_attr);

	/* ah_attr is only written once the source GID lookup has succeeded. */
	memset(ah_attr, 0, sizeof(*ah_attr));
	ah_attr->type = rdma_ah_find_type(device, port_num);

	rdma_ah_set_dlid(ah_attr, be16_to_cpu(rec->mlid));
	rdma_ah_set_sl(ah_attr, rec->sl);
	rdma_ah_set_port_num(ah_attr, port_num);
	rdma_ah_set_static_rate(ah_attr, rec->rate);
	rdma_move_grh_sgid_attr(ah_attr, &rec->mgid,
				be32_to_cpu(rec->flow_label),
				rec->hop_limit, rec->traffic_class,
				sgid_attr);
	return 0;
}
EXPORT_SYMBOL(ib_init_ah_from_mcmember);

/*
 * Apply @state to every group on @port. A group that is idle gets an extra
 * reference and has its work item queued so that the new state is processed.
 * A group already in MCAST_GROUP_ERROR keeps that state; every other group
 * is switched to @state. The idle check is made before the state is
 * overwritten.
 */
static void mcast_groups_event(struct mcast_port *port,
			       enum mcast_group_state state)
{
	struct mcast_group *group;
	struct rb_node *node;
	unsigned long flags;

	spin_lock_irqsave(&port->lock, flags);
	for (node = rb_first(&port->table); node; node = rb_next(node)) {
		group = rb_entry(node, struct mcast_group, node);
		spin_lock(&group->lock);
		if (group->state == MCAST_IDLE) {
			atomic_inc(&group->refcount);
			queue_work(mcast_wq, &group->work);
		}
		if (group->state != MCAST_GROUP_ERROR)
			group->state = state;
		spin_unlock(&group->lock);
	}
	spin_unlock_irqrestore(&port->lock, flags);
}

/*
 * IB event callback registered in mcast_add_one().
 *
 * Events on ports for which rdma_cap_ib_mcast() is false are ignored.
 * IB_EVENT_PORT_ERR, IB_EVENT_LID_CHANGE and IB_EVENT_CLIENT_REREGISTER put
 * the port's groups into MCAST_GROUP_ERROR; IB_EVENT_PKEY_CHANGE puts them
 * into MCAST_PKEY_EVENT. All other events are ignored.
 */
static void mcast_event_handler(struct ib_event_handler *handler,
				struct ib_event *event)
{
	struct mcast_device *dev;
	int index;

	dev = container_of(handler, struct mcast_device, event_handler);
	if (!rdma_cap_ib_mcast(dev->device, event->element.port_num))
		return;

	index = event->element.port_num - dev->start_port;

	switch (event->event) {
	case IB_EVENT_PORT_ERR:
	case IB_EVENT_LID_CHANGE:
	case IB_EVENT_CLIENT_REREGISTER:
		mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
		break;
	case IB_EVENT_PKEY_CHANGE:
		mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT);
		break;
	default:
		break;
	}
}

/*
 * Client "add" callback: allocate the per-device structure, initialise the
 * state of every port for which rdma_cap_ib_mcast() is true, publish the
 * structure as client data and register the event handler.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, and -EOPNOTSUPP if
 * no port of the device qualifies. Ports that do not qualify are left
 * uninitialised in port[].
 */
static int mcast_add_one(struct ib_device *device)
{
	struct mcast_device *dev;
	struct mcast_port *port;
	int i;
	int count = 0;

	dev = kmalloc(struct_size(dev, port, device->phys_port_cnt),
		      GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	dev->start_port = rdma_start_port(device);
	dev->end_port = rdma_end_port(device);

	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
		if (!rdma_cap_ib_mcast(device, dev->start_port + i))
			continue;
		port = &dev->port[i];
		port->dev = dev;
		port->port_num = dev->start_port + i;
		spin_lock_init(&port->lock);
		port->table = RB_ROOT;
		init_completion(&port->comp);
		/* Initial reference; dropped in mcast_remove_one(). */
		refcount_set(&port->refcount, 1);
		++count;
	}

	if (!count) {
		kfree(dev);
		return -EOPNOTSUPP;
	}

	dev->device = device;
	ib_set_client_data(device, &mcast_client, dev);

	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
	ib_register_event_handler(&dev->event_handler);
	return 0;
}

/*
 * Client "remove" callback: unregister the event handler, flush mcast_wq,
 * then for every multicast-capable port drop the initial port reference and
 * wait on port->comp, which deref_port() completes once the remaining
 * references (one per group still in the tree) are gone. The per-device
 * structure is freed last.
 */
static void mcast_remove_one(struct ib_device *device, void *client_data)
{
	struct mcast_device *dev = client_data;
	struct mcast_port *port;
	int i;

	ib_unregister_event_handler(&dev->event_handler);
	flush_workqueue(mcast_wq);

	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
		if (rdma_cap_ib_mcast(device, dev->start_port + i)) {
			port = &dev->port[i];
			deref_port(port);
			wait_for_completion(&port->comp);
		}
	}

	kfree(dev);
}

/*
 * Set up the multicast module: allocate the ordered workqueue, register the
 * SA client and then the IB client.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be allocated, or the
 * error from ib_register_client(), in which case the SA client is
 * unregistered and the workqueue destroyed again.
 */
int mcast_init(void)
{
	int ret;

	mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM);
	if (!mcast_wq)
		return -ENOMEM;

	ib_sa_register_client(&sa_client);

	ret = ib_register_client(&mcast_client);
	if (ret)
		goto err;
	return 0;

err:
	ib_sa_unregister_client(&sa_client);
	destroy_workqueue(mcast_wq);
	return ret;
}

/* Tear down what mcast_init() set up, in reverse order. */
void mcast_cleanup(void)
{
	ib_unregister_client(&mcast_client);
	ib_sa_unregister_client(&sa_client);
	destroy_workqueue(mcast_wq);
}
170 7 4 3 25 5 1 1 3 2319 1 1 1 1 8 8 2 58 59 1 58 2 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corporation, 2021 * * Author: Mike Rapoport <rppt@linux.ibm.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/swap.h> #include <linux/mount.h> #include <linux/memfd.h> #include <linux/bitops.h> #include <linux/printk.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/pseudo_fs.h> #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> #include <uapi/linux/magic.h> #include <asm/tlbflush.h> #include "internal.h" #undef pr_fmt #define pr_fmt(fmt) "secretmem: " fmt /* * Define mode and flag masks to allow validation of the system call * parameters. 
*/ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); static atomic_t secretmem_users; bool secretmem_active(void) { return !!atomic_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; struct folio *folio; vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); filemap_invalidate_lock_shared(mapping); retry: page = find_lock_page(mapping, offset); if (!page) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } page = &folio->page; err = set_direct_map_invalid_noflush(page); if (err) { folio_put(folio); ret = vmf_error(err); goto out; } __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(page); if (err == -EEXIST) goto retry; ret = vmf_error(err); goto out; } addr = (unsigned long)page_address(page); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } vmf->page = page; ret = VM_FAULT_LOCKED; out: filemap_invalidate_unlock_shared(mapping); return ret; } static const struct vm_operations_struct secretmem_vm_ops = { .fault = secretmem_fault, }; static int secretmem_release(struct inode *inode, struct file *file) { atomic_dec(&secretmem_users); return 0; } static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = desc->end - desc->start; if 
((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len)) return -EAGAIN; desc->vm_flags |= VM_LOCKED | VM_DONTDUMP; desc->vm_ops = &secretmem_vm_ops; return 0; } bool vma_is_secretmem(struct vm_area_struct *vma) { return vma->vm_ops == &secretmem_vm_ops; } static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap_prepare = secretmem_mmap_prepare, }; static int secretmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } static void secretmem_free_folio(struct folio *folio) { set_direct_map_default_noflush(&folio->page); folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; int ret; filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); return ret; } static const struct inode_operations secretmem_iops = { .setattr = secretmem_setattr, }; static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; int err; inode = alloc_anon_inode(secretmem_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL); if (err) { file = ERR_PTR(err); goto err_free_inode; } file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; 
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ inode->i_mode |= S_IFREG; inode->i_size = 0; return file; err_free_inode: iput(inode); return file; } SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { struct file *file; int fd, err; /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable || !can_set_direct_map()) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; if (atomic_read(&secretmem_users) < 0) return -ENFILE; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; file = secretmem_file_create(flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_put_fd; } file->f_flags |= O_LARGEFILE; atomic_inc(&secretmem_users); fd_install(fd, file); return fd; err_put_fd: put_unused_fd(fd); return err; } static int secretmem_init_fs_context(struct fs_context *fc) { return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; } static struct file_system_type secretmem_fs = { .name = "secretmem", .init_fs_context = secretmem_init_fs_context, .kill_sb = kill_anon_super, }; static int __init secretmem_init(void) { if (!secretmem_enable || !can_set_direct_map()) return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; return 0; } fs_initcall(secretmem_init);
512 1484 1482 1013 512 512 293 293 210 210 210 210 45 1 43 44 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2004 IBM Corporation * * Author: Serge Hallyn <serue@us.ibm.com> */ #include <linux/export.h> #include <linux/uts.h> #include <linux/utsname.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> static struct kmem_cache *uts_ns_cache __ro_after_init; static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); } static void dec_uts_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); } static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) refcount_set(&uts_ns->ns.count, 1); return uts_ns; } /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise */ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; ucounts = inc_uts_namespaces(user_ns); if (!ucounts) goto fail; err = -ENOMEM; ns = create_uts_ns(); if (!ns) goto 
fail_dec; err = ns_alloc_inum(&ns->ns); if (err) goto fail_free; ns->ucounts = ucounts; ns->ns.ops = &utsns_operations; down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; fail_free: kmem_cache_free(uts_ns_cache, ns); fail_dec: dec_uts_namespaces(ucounts); fail: return ERR_PTR(err); } /* * Copy task tsk's utsname namespace, or clone it if flags * specifies CLONE_NEWUTS. In latter case, changes to the * utsname of this process won't be seen by parent, and vice * versa. */ struct uts_namespace *copy_utsname(unsigned long flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; BUG_ON(!old_ns); get_uts_ns(old_ns); if (!(flags & CLONE_NEWUTS)) return old_ns; new_ns = clone_uts_ns(user_ns, old_ns); put_uts_ns(old_ns); return new_ns; } void free_uts_ns(struct uts_namespace *ns) { dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kmem_cache_free(uts_ns_cache, ns); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) { return container_of(ns, struct uts_namespace, ns); } static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->uts_ns; get_uts_ns(ns); } task_unlock(task); return ns ? 
&ns->ns : NULL; } static void utsns_put(struct ns_common *ns) { put_uts_ns(to_uts_ns(ns)); } static int utsns_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); put_uts_ns(nsproxy->uts_ns); nsproxy->uts_ns = ns; return 0; } static struct user_namespace *utsns_owner(struct ns_common *ns) { return to_uts_ns(ns)->user_ns; } const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, .owner = utsns_owner, }; void __init uts_ns_init(void) { uts_ns_cache = kmem_cache_create_usercopy( "uts_namespace", sizeof(struct uts_namespace), 0, SLAB_PANIC|SLAB_ACCOUNT, offsetof(struct uts_namespace, name), sizeof_field(struct uts_namespace, name), NULL); }
1 1 1 1786 1791 2 1783 9 1560 1773 1575 1292 14 1164 9 9 24 5 1 18 6 18 12 23 24 8 15 5 8 5 5 3 18 3 2 1 1 1 5 5 1 1 1 1 4 5 4 1 5 5 4 1 1 1561 1 1559 58 58 58 175 176 176 4 4 4 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 
482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 // SPDX-License-Identifier: 
GPL-2.0-or-later /* * NetLabel Domain Hash Table * * This file manages the domain hash table that NetLabel uses to determine * which network labeling protocol to use for a given domain. The NetLabel * system manages static and dynamic label mappings for network protocols such * as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 */ #include <linux/types.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <net/calipso.h> #include <asm/bug.h> #include "netlabel_mgmt.h" #include "netlabel_addrlist.h" #include "netlabel_calipso.h" #include "netlabel_domainhash.h" #include "netlabel_user.h" struct netlbl_domhsh_tbl { struct list_head *tbl; u32 size; }; /* Domain hash table */ /* updates should be so rare that having one spinlock for the entire hash table * should be okay */ static DEFINE_SPINLOCK(netlbl_domhsh_lock); #define netlbl_domhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock)) static struct netlbl_domhsh_tbl __rcu *netlbl_domhsh; static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv4; static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv6; /* * Domain Hash Table Helper Functions */ /** * netlbl_domhsh_free_entry - Frees a domain hash table entry * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to a hash table entry can be released * safely. 
* */ static void netlbl_domhsh_free_entry(struct rcu_head *entry) { struct netlbl_dom_map *ptr; struct netlbl_af4list *iter4; struct netlbl_af4list *tmp4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_af6list *tmp6; #endif /* IPv6 */ ptr = container_of(entry, struct netlbl_dom_map, rcu); if (ptr->def.type == NETLBL_NLTYPE_ADDRSELECT) { netlbl_af4list_foreach_safe(iter4, tmp4, &ptr->def.addrsel->list4) { netlbl_af4list_remove_entry(iter4); kfree(netlbl_domhsh_addr4_entry(iter4)); } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_safe(iter6, tmp6, &ptr->def.addrsel->list6) { netlbl_af6list_remove_entry(iter6); kfree(netlbl_domhsh_addr6_entry(iter6)); } #endif /* IPv6 */ kfree(ptr->def.addrsel); } kfree(ptr->domain); kfree(ptr); } /** * netlbl_domhsh_hash - Hashing function for the domain hash table * @key: the domain name to hash * * Description: * This is the hashing function for the domain hash table, it returns the * correct bucket number for the domain. The caller is responsible for * ensuring that the hash table is protected with either a RCU read lock or the * hash table lock. * */ static u32 netlbl_domhsh_hash(const char *key) { u32 iter; u32 val; u32 len; /* This is taken (with slight modification) from * security/selinux/ss/symtab.c:symhash() */ for (iter = 0, val = 0, len = strlen(key); iter < len; iter++) val = (val << 4 | (val >> (8 * sizeof(u32) - 4))) ^ key[iter]; return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1); } static bool netlbl_family_match(u16 f1, u16 f2) { return (f1 == f2) || (f1 == AF_UNSPEC) || (f2 == AF_UNSPEC); } /** * netlbl_domhsh_search - Search for a domain entry * @domain: the domain * @family: the address family * * Description: * Searches the domain hash table and returns a pointer to the hash table * entry if found, otherwise NULL is returned. @family may be %AF_UNSPEC * which matches any address family entries. 
The caller is responsible for
 * ensuring that the hash table is protected with either a RCU read lock or the
 * hash table lock.
 *
 */
static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain,
						   u16 family)
{
	struct list_head *bucket;
	struct netlbl_dom_map *cand;
	u32 idx;

	/* A NULL domain never matches a hashed entry. */
	if (domain == NULL)
		return NULL;

	idx = netlbl_domhsh_hash(domain);
	bucket = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[idx];
	list_for_each_entry_rcu(cand, bucket, list,
				lockdep_is_held(&netlbl_domhsh_lock)) {
		if (!cand->valid)
			continue;
		if (!netlbl_family_match(cand->family, family))
			continue;
		if (strcmp(cand->domain, domain) == 0)
			return cand;
	}

	return NULL;
}

/**
 * netlbl_domhsh_search_def - Search for a domain entry
 * @domain: the domain
 * @family: the address family
 *
 * Description:
 * Searches the domain hash table and returns a pointer to the hash table
 * entry if an exact match is found, if an exact match is not present in the
 * hash table then the default entry is returned if valid otherwise NULL is
 * returned.  @family may be %AF_UNSPEC which matches any address family
 * entries.  The caller is responsible ensuring that the hash table is
 * protected with either a RCU read lock or the hash table lock.
* */ static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain, u16 family) { struct netlbl_dom_map *entry; entry = netlbl_domhsh_search(domain, family); if (entry != NULL) return entry; if (family == AF_INET || family == AF_UNSPEC) { entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv4); if (entry != NULL && entry->valid) return entry; } if (family == AF_INET6 || family == AF_UNSPEC) { entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv6); if (entry != NULL && entry->valid) return entry; } return NULL; } /** * netlbl_domhsh_audit_add - Generate an audit entry for an add event * @entry: the entry being added * @addr4: the IPv4 address information * @addr6: the IPv6 address information * @result: the result code * @audit_info: NetLabel audit information * * Description: * Generate an audit record for adding a new NetLabel/LSM mapping entry with * the given information. Caller is responsible for holding the necessary * locks. * */ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, struct netlbl_af4list *addr4, struct netlbl_af6list *addr6, int result, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; struct cipso_v4_doi *cipsov4 = NULL; struct calipso_doi *calipso = NULL; u32 type; audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); if (audit_buf != NULL) { audit_log_format(audit_buf, " nlbl_domain=%s", entry->domain ? 
entry->domain : "(default)"); if (addr4 != NULL) { struct netlbl_domaddr4_map *map4; map4 = netlbl_domhsh_addr4_entry(addr4); type = map4->def.type; cipsov4 = map4->def.cipso; netlbl_af4list_audit_addr(audit_buf, 0, NULL, addr4->addr, addr4->mask); #if IS_ENABLED(CONFIG_IPV6) } else if (addr6 != NULL) { struct netlbl_domaddr6_map *map6; map6 = netlbl_domhsh_addr6_entry(addr6); type = map6->def.type; calipso = map6->def.calipso; netlbl_af6list_audit_addr(audit_buf, 0, NULL, &addr6->addr, &addr6->mask); #endif /* IPv6 */ } else { type = entry->def.type; cipsov4 = entry->def.cipso; calipso = entry->def.calipso; } switch (type) { case NETLBL_NLTYPE_UNLABELED: audit_log_format(audit_buf, " nlbl_protocol=unlbl"); break; case NETLBL_NLTYPE_CIPSOV4: BUG_ON(cipsov4 == NULL); audit_log_format(audit_buf, " nlbl_protocol=cipsov4 cipso_doi=%u", cipsov4->doi); break; case NETLBL_NLTYPE_CALIPSO: BUG_ON(calipso == NULL); audit_log_format(audit_buf, " nlbl_protocol=calipso calipso_doi=%u", calipso->doi); break; } audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); audit_log_end(audit_buf); } } /** * netlbl_domhsh_validate - Validate a new domain mapping entry * @entry: the entry to validate * * This function validates the new domain mapping entry to ensure that it is * a valid entry. Returns zero on success, negative values on failure. 
* */ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry) { struct netlbl_af4list *iter4; struct netlbl_domaddr4_map *map4; #if IS_ENABLED(CONFIG_IPV6) struct netlbl_af6list *iter6; struct netlbl_domaddr6_map *map6; #endif /* IPv6 */ if (entry == NULL) return -EINVAL; if (entry->family != AF_INET && entry->family != AF_INET6 && (entry->family != AF_UNSPEC || entry->def.type != NETLBL_NLTYPE_UNLABELED)) return -EINVAL; switch (entry->def.type) { case NETLBL_NLTYPE_UNLABELED: if (entry->def.cipso != NULL || entry->def.calipso != NULL || entry->def.addrsel != NULL) return -EINVAL; break; case NETLBL_NLTYPE_CIPSOV4: if (entry->family != AF_INET || entry->def.cipso == NULL) return -EINVAL; break; case NETLBL_NLTYPE_CALIPSO: if (entry->family != AF_INET6 || entry->def.calipso == NULL) return -EINVAL; break; case NETLBL_NLTYPE_ADDRSELECT: netlbl_af4list_foreach(iter4, &entry->def.addrsel->list4) { map4 = netlbl_domhsh_addr4_entry(iter4); switch (map4->def.type) { case NETLBL_NLTYPE_UNLABELED: if (map4->def.cipso != NULL) return -EINVAL; break; case NETLBL_NLTYPE_CIPSOV4: if (map4->def.cipso == NULL) return -EINVAL; break; default: return -EINVAL; } } #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach(iter6, &entry->def.addrsel->list6) { map6 = netlbl_domhsh_addr6_entry(iter6); switch (map6->def.type) { case NETLBL_NLTYPE_UNLABELED: if (map6->def.calipso != NULL) return -EINVAL; break; case NETLBL_NLTYPE_CALIPSO: if (map6->def.calipso == NULL) return -EINVAL; break; default: return -EINVAL; } } #endif /* IPv6 */ break; default: return -EINVAL; } return 0; } /* * Domain Hash Table Functions */ /** * netlbl_domhsh_init - Init for the domain hash * @size: the number of bits to use for the hash buckets * * Description: * Initializes the domain hash table, should be called only by * netlbl_user_init() during initialization. Returns zero on success, non-zero * values on error. 
*
 */
int __init netlbl_domhsh_init(u32 size)
{
	u32 iter;
	struct netlbl_domhsh_tbl *hsh_tbl;

	if (size == 0)
		return -EINVAL;

	hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
	if (hsh_tbl == NULL)
		return -ENOMEM;
	/* @size is a bit count: the table holds 2^size buckets */
	hsh_tbl->size = 1 << size;
	hsh_tbl->tbl = kcalloc(hsh_tbl->size,
			       sizeof(struct list_head),
			       GFP_KERNEL);
	if (hsh_tbl->tbl == NULL) {
		kfree(hsh_tbl);
		return -ENOMEM;
	}
	for (iter = 0; iter < hsh_tbl->size; iter++)
		INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);

	/* publish the fully initialized table */
	spin_lock(&netlbl_domhsh_lock);
	rcu_assign_pointer(netlbl_domhsh, hsh_tbl);
	spin_unlock(&netlbl_domhsh_lock);

	return 0;
}

/**
 * netlbl_domhsh_add - Adds an entry to the domain hash table
 * @entry: the entry to add
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Adds a new entry to the domain hash table and handles any updates to the
 * lower level protocol handler (i.e. CIPSO).  @entry->family may be set to
 * %AF_UNSPEC which will add an entry that matches all address families.  This
 * is only useful for the unlabelled type and will only succeed if there is no
 * existing entry for any address family with the same domain.  Returns zero
 * on success, negative on failure.
 *
 */
int netlbl_domhsh_add(struct netlbl_dom_map *entry,
		      struct netlbl_audit *audit_info)
{
	int ret_val = 0;
	struct netlbl_dom_map *entry_old, *entry_b;
	struct netlbl_af4list *iter4;
	struct netlbl_af4list *tmp4;
#if IS_ENABLED(CONFIG_IPV6)
	struct netlbl_af6list *iter6;
	struct netlbl_af6list *tmp6;
#endif /* IPv6 */

	ret_val = netlbl_domhsh_validate(entry);
	if (ret_val != 0)
		return ret_val;

	/* XXX - we can remove this RCU read lock as the spinlock protects the
	 *       entire function, but before we do we need to fixup the
	 *       netlbl_af[4,6]list RCU functions to do "the right thing" with
	 *       respect to rcu_dereference() when only a spinlock is held. */
	rcu_read_lock();
	spin_lock(&netlbl_domhsh_lock);
	/* a NULL domain means the caller is adding a default mapping */
	if (entry->domain != NULL)
		entry_old = netlbl_domhsh_search(entry->domain, entry->family);
	else
		entry_old = netlbl_domhsh_search_def(entry->domain,
						     entry->family);
	if (entry_old == NULL) {
		/* no existing mapping: insert @entry as-is */
		entry->valid = 1;

		if (entry->domain != NULL) {
			u32 bkt = netlbl_domhsh_hash(entry->domain);
			list_add_tail_rcu(&entry->list,
				    &rcu_dereference(netlbl_domhsh)->tbl[bkt]);
		} else {
			INIT_LIST_HEAD(&entry->list);
			switch (entry->family) {
			case AF_INET:
				rcu_assign_pointer(netlbl_domhsh_def_ipv4,
						   entry);
				break;
			case AF_INET6:
				rcu_assign_pointer(netlbl_domhsh_def_ipv6,
						   entry);
				break;
			case AF_UNSPEC:
				if (entry->def.type !=
				    NETLBL_NLTYPE_UNLABELED) {
					ret_val = -EINVAL;
					goto add_return;
				}
				/* @entry becomes the IPv4 default; a second,
				 * freshly allocated unlabeled entry becomes
				 * the IPv6 default (GFP_ATOMIC because the
				 * spinlock is held) */
				entry_b = kzalloc(sizeof(*entry_b), GFP_ATOMIC);
				if (entry_b == NULL) {
					ret_val = -ENOMEM;
					goto add_return;
				}
				entry_b->family = AF_INET6;
				entry_b->def.type = NETLBL_NLTYPE_UNLABELED;
				entry_b->valid = 1;
				entry->family = AF_INET;
				rcu_assign_pointer(netlbl_domhsh_def_ipv4,
						   entry);
				rcu_assign_pointer(netlbl_domhsh_def_ipv6,
						   entry_b);
				break;
			default:
				/* Already checked in
				 * netlbl_domhsh_validate(). */
				ret_val = -EINVAL;
				goto add_return;
			}
		}

		/* one audit record per address selector, or a single record
		 * for a plain mapping */
		if (entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
			netlbl_af4list_foreach_rcu(iter4,
						   &entry->def.addrsel->list4)
				netlbl_domhsh_audit_add(entry, iter4, NULL,
							ret_val, audit_info);
#if IS_ENABLED(CONFIG_IPV6)
			netlbl_af6list_foreach_rcu(iter6,
						   &entry->def.addrsel->list6)
				netlbl_domhsh_audit_add(entry, NULL, iter6,
							ret_val, audit_info);
#endif /* IPv6 */
		} else
			netlbl_domhsh_audit_add(entry, NULL, NULL,
						ret_val, audit_info);
	} else if (entry_old->def.type == NETLBL_NLTYPE_ADDRSELECT &&
		   entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
		/* both old and new are address-selector mappings: merge the
		 * new selectors into the existing entry */
		struct list_head *old_list4;
		struct list_head *old_list6;

		old_list4 = &entry_old->def.addrsel->list4;
		old_list6 = &entry_old->def.addrsel->list6;

		/* we only allow the addition of address selectors if all of
		 * the selectors do not exist in the existing domain map */
		netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4)
			if (netlbl_af4list_search_exact(iter4->addr,
							iter4->mask,
							old_list4)) {
				ret_val = -EEXIST;
				goto add_return;
			}
#if IS_ENABLED(CONFIG_IPV6)
		netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6)
			if (netlbl_af6list_search_exact(&iter6->addr,
							&iter6->mask,
							old_list6)) {
				ret_val = -EEXIST;
				goto add_return;
			}
#endif /* IPv6 */

		/* move each selector from the new entry to the old one,
		 * auditing every move; stop at the first failure */
		netlbl_af4list_foreach_safe(iter4, tmp4,
					    &entry->def.addrsel->list4) {
			netlbl_af4list_remove_entry(iter4);
			iter4->valid = 1;
			ret_val = netlbl_af4list_add(iter4, old_list4);
			netlbl_domhsh_audit_add(entry_old, iter4, NULL,
						ret_val, audit_info);
			if (ret_val != 0)
				goto add_return;
		}
#if IS_ENABLED(CONFIG_IPV6)
		netlbl_af6list_foreach_safe(iter6, tmp6,
					    &entry->def.addrsel->list6) {
			netlbl_af6list_remove_entry(iter6);
			iter6->valid = 1;
			ret_val = netlbl_af6list_add(iter6, old_list6);
			netlbl_domhsh_audit_add(entry_old, NULL, iter6,
						ret_val, audit_info);
			if (ret_val != 0)
				goto add_return;
		}
#endif /* IPv6 */
		/* cleanup the new entry since we've moved everything over */
		netlbl_domhsh_free_entry(&entry->rcu);
	} else
		ret_val = -EINVAL;

add_return:
	spin_unlock(&netlbl_domhsh_lock);
	rcu_read_unlock();
	return ret_val;
}

/**
 * netlbl_domhsh_add_default - Adds the default entry to the domain hash table
 * @entry: the entry to add
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Adds a new default entry to the domain hash table and handles any updates
 * to the lower level protocol handler (i.e. CIPSO).  This is a thin wrapper
 * that forwards to netlbl_domhsh_add().  Returns zero on success, negative on
 * failure.
 *
 */
int netlbl_domhsh_add_default(struct netlbl_dom_map *entry,
			      struct netlbl_audit *audit_info)
{
	return netlbl_domhsh_add(entry, audit_info);
}

/**
 * netlbl_domhsh_remove_entry - Removes a given entry from the domain table
 * @entry: the entry to remove
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes an entry from the domain hash table and handles any updates to the
 * lower level protocol handler (i.e. CIPSO).  Caller is responsible for
 * ensuring that the RCU read lock is held.  Returns zero on success, negative
 * on failure.
 *
 */
int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
			       struct netlbl_audit *audit_info)
{
	int ret_val = 0;
	struct audit_buffer *audit_buf;
	struct netlbl_af4list *iter4;
	struct netlbl_domaddr4_map *map4;
#if IS_ENABLED(CONFIG_IPV6)
	struct netlbl_af6list *iter6;
	struct netlbl_domaddr6_map *map6;
#endif /* IPv6 */

	if (entry == NULL)
		return -ENOENT;

	/* unlink under the table lock; an entry that is already invalid is
	 * reported as -ENOENT */
	spin_lock(&netlbl_domhsh_lock);
	if (entry->valid) {
		entry->valid = 0;
		if (entry == rcu_dereference(netlbl_domhsh_def_ipv4))
			RCU_INIT_POINTER(netlbl_domhsh_def_ipv4, NULL);
		else if (entry == rcu_dereference(netlbl_domhsh_def_ipv6))
			RCU_INIT_POINTER(netlbl_domhsh_def_ipv6, NULL);
		else
			list_del_rcu(&entry->list);
	} else
		ret_val = -ENOENT;
	spin_unlock(&netlbl_domhsh_lock);

	if (ret_val)
		return ret_val;

	audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_DEL, audit_info);
	if (audit_buf != NULL) {
		audit_log_format(audit_buf,
				 " nlbl_domain=%s res=1",
				 entry->domain ? entry->domain : "(default)");
		audit_log_end(audit_buf);
	}

	/* drop the DOI definition references held by the mapping */
	switch (entry->def.type) {
	case NETLBL_NLTYPE_ADDRSELECT:
		netlbl_af4list_foreach_rcu(iter4, &entry->def.addrsel->list4) {
			map4 = netlbl_domhsh_addr4_entry(iter4);
			cipso_v4_doi_putdef(map4->def.cipso);
		}
#if IS_ENABLED(CONFIG_IPV6)
		netlbl_af6list_foreach_rcu(iter6, &entry->def.addrsel->list6) {
			map6 = netlbl_domhsh_addr6_entry(iter6);
			calipso_doi_putdef(map6->def.calipso);
		}
#endif /* IPv6 */
		break;
	case NETLBL_NLTYPE_CIPSOV4:
		cipso_v4_doi_putdef(entry->def.cipso);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case NETLBL_NLTYPE_CALIPSO:
		calipso_doi_putdef(entry->def.calipso);
		break;
#endif /* IPv6 */
	}
	/* the memory itself is released via RCU callback */
	call_rcu(&entry->rcu, netlbl_domhsh_free_entry);

	return ret_val;
}

/**
 * netlbl_domhsh_remove_af4 - Removes an address selector entry
 * @domain: the domain
 * @addr: IPv4 address
 * @mask: IPv4 address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes an individual address selector from a domain mapping and potentially
 * the entire mapping if it is empty.  Returns zero on success, negative values
 * on failure.
 *
 */
int netlbl_domhsh_remove_af4(const char *domain,
			     const struct in_addr *addr,
			     const struct in_addr *mask,
			     struct netlbl_audit *audit_info)
{
	struct netlbl_dom_map *entry_map;
	struct netlbl_af4list *entry_addr;
	struct netlbl_af4list *iter4;
#if IS_ENABLED(CONFIG_IPV6)
	struct netlbl_af6list *iter6;
#endif /* IPv6 */
	struct netlbl_domaddr4_map *entry;

	rcu_read_lock();

	/* a NULL @domain selects the default mapping lookup */
	if (domain)
		entry_map = netlbl_domhsh_search(domain, AF_INET);
	else
		entry_map = netlbl_domhsh_search_def(domain, AF_INET);
	/* only address selector mappings carry per-address entries */
	if (entry_map == NULL ||
	    entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
		goto remove_af4_failure;

	/* the list is modified under the table lock */
	spin_lock(&netlbl_domhsh_lock);
	entry_addr = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
					   &entry_map->def.addrsel->list4);
	spin_unlock(&netlbl_domhsh_lock);

	if (entry_addr == NULL)
		goto remove_af4_failure;
	/* emptiness test: the first element found in either address list
	 * jumps past the removal of the whole mapping below */
	netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
		goto remove_af4_single_addr;
#if IS_ENABLED(CONFIG_IPV6)
	netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
		goto remove_af4_single_addr;
#endif /* IPv6 */
	/* the domain mapping is empty so remove it from the mapping table */
	netlbl_domhsh_remove_entry(entry_map, audit_info);

remove_af4_single_addr:
	rcu_read_unlock();
	/* yick, we can't use call_rcu here because we don't have a rcu head
	 * pointer but hopefully this should be a rare case so the pause
	 * shouldn't be a problem */
	synchronize_rcu();
	/* the address entry removed above is released only after the
	 * synchronize_rcu() call has returned */
	entry = netlbl_domhsh_addr4_entry(entry_addr);
	cipso_v4_doi_putdef(entry->def.cipso);
	kfree(entry);
	return 0;

remove_af4_failure:
	rcu_read_unlock();
	return -ENOENT;
}

#if IS_ENABLED(CONFIG_IPV6)
/**
 * netlbl_domhsh_remove_af6 - Removes an address selector entry
 * @domain: the domain
 * @addr: IPv6 address
 * @mask: IPv6 address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes an individual address selector from a domain mapping and potentially
 * the entire mapping if it is empty.  Returns zero on success, negative values
 * on failure.
 *
 */
int netlbl_domhsh_remove_af6(const char *domain,
			     const struct in6_addr *addr,
			     const struct in6_addr *mask,
			     struct netlbl_audit *audit_info)
{
	struct netlbl_dom_map *entry_map;
	struct netlbl_af6list *entry_addr;
	struct netlbl_af4list *iter4;
	struct netlbl_af6list *iter6;
	struct netlbl_domaddr6_map *entry;

	rcu_read_lock();

	/* a NULL @domain selects the default mapping lookup */
	if (domain)
		entry_map = netlbl_domhsh_search(domain, AF_INET6);
	else
		entry_map = netlbl_domhsh_search_def(domain, AF_INET6);
	/* only address selector mappings carry per-address entries */
	if (entry_map == NULL ||
	    entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
		goto remove_af6_failure;

	/* the list is modified under the table lock */
	spin_lock(&netlbl_domhsh_lock);
	entry_addr = netlbl_af6list_remove(addr, mask,
					   &entry_map->def.addrsel->list6);
	spin_unlock(&netlbl_domhsh_lock);

	if (entry_addr == NULL)
		goto remove_af6_failure;
	/* emptiness test: the first element found in either address list
	 * jumps past the removal of the whole mapping below */
	netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
		goto remove_af6_single_addr;
	netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
		goto remove_af6_single_addr;
	/* the domain mapping is empty so remove it from the mapping table */
	netlbl_domhsh_remove_entry(entry_map, audit_info);

remove_af6_single_addr:
	rcu_read_unlock();
	/* yick, we can't use call_rcu here because we don't have a rcu head
	 * pointer but hopefully this should be a rare case so the pause
	 * shouldn't be a problem */
	synchronize_rcu();
	/* the address entry removed above is released only after the
	 * synchronize_rcu() call has returned */
	entry = netlbl_domhsh_addr6_entry(entry_addr);
	calipso_doi_putdef(entry->def.calipso);
	kfree(entry);
	return 0;

remove_af6_failure:
	rcu_read_unlock();
	return -ENOENT;
}
#endif /* IPv6 */

/**
 * netlbl_domhsh_remove - Removes an entry from the domain hash table
 * @domain: the domain to remove
 * @family: address family
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes an entry from the domain hash table and handles any updates to the
 * lower level protocol handler (i.e. CIPSO).  @family may be %AF_UNSPEC which
 * removes all address family entries.  Returns zero on success, negative on
 * failure.
* */ int netlbl_domhsh_remove(const char *domain, u16 family, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; struct netlbl_dom_map *entry; rcu_read_lock(); if (family == AF_INET || family == AF_UNSPEC) { if (domain) entry = netlbl_domhsh_search(domain, AF_INET); else entry = netlbl_domhsh_search_def(domain, AF_INET); ret_val = netlbl_domhsh_remove_entry(entry, audit_info); if (ret_val && ret_val != -ENOENT) goto done; } if (family == AF_INET6 || family == AF_UNSPEC) { int ret_val2; if (domain) entry = netlbl_domhsh_search(domain, AF_INET6); else entry = netlbl_domhsh_search_def(domain, AF_INET6); ret_val2 = netlbl_domhsh_remove_entry(entry, audit_info); if (ret_val2 != -ENOENT) ret_val = ret_val2; } done: rcu_read_unlock(); return ret_val; } /** * netlbl_domhsh_remove_default - Removes the default entry from the table * @family: address family * @audit_info: NetLabel audit information * * Description: * Removes/resets the default entry corresponding to @family from the domain * hash table and handles any updates to the lower level protocol handler * (i.e. CIPSO). @family may be %AF_UNSPEC which removes all address family * entries. Returns zero on success, negative on failure. * */ int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info) { return netlbl_domhsh_remove(NULL, family, audit_info); } /** * netlbl_domhsh_getentry - Get an entry from the domain hash table * @domain: the domain name to search for * @family: address family * * Description: * Look through the domain hash table searching for an entry to match @domain, * with address family @family, return a pointer to a copy of the entry or * NULL. The caller is responsible for ensuring that rcu_read_[un]lock() is * called. 
* */ struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family) { if (family == AF_UNSPEC) return NULL; return netlbl_domhsh_search_def(domain, family); } /** * netlbl_domhsh_getentry_af4 - Get an entry from the domain hash table * @domain: the domain name to search for * @addr: the IP address to search for * * Description: * Look through the domain hash table searching for an entry to match @domain * and @addr, return a pointer to a copy of the entry or NULL. The caller is * responsible for ensuring that rcu_read_[un]lock() is called. * */ struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain, __be32 addr) { struct netlbl_dom_map *dom_iter; struct netlbl_af4list *addr_iter; dom_iter = netlbl_domhsh_search_def(domain, AF_INET); if (dom_iter == NULL) return NULL; if (dom_iter->def.type != NETLBL_NLTYPE_ADDRSELECT) return &dom_iter->def; addr_iter = netlbl_af4list_search(addr, &dom_iter->def.addrsel->list4); if (addr_iter == NULL) return NULL; return &(netlbl_domhsh_addr4_entry(addr_iter)->def); } #if IS_ENABLED(CONFIG_IPV6) /** * netlbl_domhsh_getentry_af6 - Get an entry from the domain hash table * @domain: the domain name to search for * @addr: the IP address to search for * * Description: * Look through the domain hash table searching for an entry to match @domain * and @addr, return a pointer to a copy of the entry or NULL. The caller is * responsible for ensuring that rcu_read_[un]lock() is called. 
* */ struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain, const struct in6_addr *addr) { struct netlbl_dom_map *dom_iter; struct netlbl_af6list *addr_iter; dom_iter = netlbl_domhsh_search_def(domain, AF_INET6); if (dom_iter == NULL) return NULL; if (dom_iter->def.type != NETLBL_NLTYPE_ADDRSELECT) return &dom_iter->def; addr_iter = netlbl_af6list_search(addr, &dom_iter->def.addrsel->list6); if (addr_iter == NULL) return NULL; return &(netlbl_domhsh_addr6_entry(addr_iter)->def); } #endif /* IPv6 */ /** * netlbl_domhsh_walk - Iterate through the domain mapping hash table * @skip_bkt: the number of buckets to skip at the start * @skip_chain: the number of entries to skip in the first iterated bucket * @callback: callback for each entry * @cb_arg: argument for the callback function * * Description: * Iterate over the domain mapping hash table, skipping the first @skip_bkt * buckets and @skip_chain entries. For each entry in the table call * @callback, if @callback returns a negative value stop 'walking' through the * table and return. Updates the values in @skip_bkt and @skip_chain on * return. Returns zero on success, negative values on failure. * */ int netlbl_domhsh_walk(u32 *skip_bkt, u32 *skip_chain, int (*callback) (struct netlbl_dom_map *entry, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 iter_bkt; struct list_head *iter_list; struct netlbl_dom_map *iter_entry; u32 chain_cnt = 0; rcu_read_lock(); for (iter_bkt = *skip_bkt; iter_bkt < rcu_dereference(netlbl_domhsh)->size; iter_bkt++, chain_cnt = 0) { iter_list = &rcu_dereference(netlbl_domhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iter_entry, iter_list, list) if (iter_entry->valid) { if (chain_cnt++ < *skip_chain) continue; ret_val = callback(iter_entry, cb_arg); if (ret_val < 0) { chain_cnt--; goto walk_return; } } } walk_return: rcu_read_unlock(); *skip_bkt = iter_bkt; *skip_chain = chain_cnt; return ret_val; }
182 24726 179 1459 1290 178 333 728 27521 423 1334 24838 1524 728 333 27656 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* I/O iterator iteration building functions. * * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. 
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_IOV_ITER_H
#define _LINUX_IOV_ITER_H

#include <linux/uio.h>
#include <linux/bvec.h>
#include <linux/folio_queue.h>

/*
 * Step function for segments presented as kernel addresses.  It is called
 * with the segment address, the amount of data iterated over before this
 * segment, the segment length and the two private pointers; the iterators
 * below treat the return value as the number of bytes of the segment that
 * were NOT processed.
 */
typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
			     void *priv, void *priv2);
/*
 * As iov_step_f, but the segment is presented as a __user address.
 */
typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
			      void *priv, void *priv2);

/*
 * Handle ITER_UBUF.
 *
 * The whole request is passed to @step in a single call starting at
 * iter->ubuf + iter->iov_offset.  The iterator is advanced by the number of
 * bytes the step function consumed, which is also the return value.
 */
static __always_inline
size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
		    iov_ustep_f step)
{
	void __user *base = iter->ubuf;
	size_t progress = 0, remain;

	remain = step(base + iter->iov_offset, 0, len, priv, priv2);
	progress = len - remain;
	iter->iov_offset += progress;
	iter->count -= progress;
	return progress;
}

/*
 * Handle ITER_IOVEC.
 *
 * Walk the iovec array from the current segment, calling @step once per
 * non-empty segment with at most the remainder of that segment.  Returns the
 * number of bytes consumed and leaves the iterator pointing at the first
 * unconsumed byte.
 */
static __always_inline
size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
		     iov_ustep_f step)
{
	const struct iovec *p = iter->__iov;
	size_t progress = 0, skip = iter->iov_offset;

	do {
		size_t remain, consumed;
		size_t part = min(len, p->iov_len - skip);

		/* zero-length parts are stepped over without calling @step */
		if (likely(part)) {
			remain = step(p->iov_base + skip, progress, part, priv, priv2);
			consumed = part - remain;
			progress += consumed;
			skip += consumed;
			len -= consumed;
			/* stay on this segment if it was not used up, either
			 * because the request ended inside it or because the
			 * step function stopped short */
			if (skip < p->iov_len)
				break;
		}
		p++;
		skip = 0;
	} while (len);

	/* write the cursor back into the iterator */
	iter->nr_segs	-= p - iter->__iov;
	iter->__iov	= p;
	iter->iov_offset = skip;
	iter->count	-= progress;
	return progress;
}

/*
 * Handle ITER_KVEC.
*/ static __always_inline size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct kvec *p = iter->kvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t part = min(len, p->iov_len - skip); if (likely(part)) { remain = step(p->iov_base + skip, progress, part, priv, priv2); consumed = part - remain; progress += consumed; skip += consumed; len -= consumed; if (skip < p->iov_len) break; } p++; skip = 0; } while (len); iter->nr_segs -= p - iter->kvec; iter->kvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_BVEC. */ static __always_inline size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct bio_vec *p = iter->bvec; size_t progress = 0, skip = iter->iov_offset; do { size_t remain, consumed; size_t offset = p->bv_offset + skip, part; void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE); part = min3(len, (size_t)(p->bv_len - skip), (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2); kunmap_local(kaddr); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; if (skip >= p->bv_len) { skip = 0; p++; } if (remain) break; } while (len); iter->nr_segs -= p - iter->bvec; iter->bvec = p; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_FOLIOQ. */ static __always_inline size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { const struct folio_queue *folioq = iter->folioq; unsigned int slot = iter->folioq_slot; size_t progress = 0, skip = iter->iov_offset; if (slot == folioq_nr_slots(folioq)) { /* The iterator may have been extended. 
*/ folioq = folioq->next; slot = 0; } do { struct folio *folio = folioq_folio(folioq, slot); size_t part, remain, consumed; size_t fsize; void *base; if (!folio) break; fsize = folioq_folio_size(folioq, slot); base = kmap_local_folio(folio, skip); part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; len -= consumed; progress += consumed; skip += consumed; if (skip >= fsize) { skip = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } if (remain) break; } while (len); iter->folioq_slot = slot; iter->folioq = folioq; iter->iov_offset = skip; iter->count -= progress; return progress; } /* * Handle ITER_XARRAY. */ static __always_inline size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { struct folio *folio; size_t progress = 0; loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t index = start / PAGE_SIZE; XA_STATE(xas, iter->xarray, index); rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { size_t remain, consumed, offset, part, flen; if (xas_retry(&xas, folio)) continue; if (WARN_ON(xa_is_value(folio))) break; if (WARN_ON(folio_test_hugetlb(folio))) break; offset = offset_in_folio(folio, start + progress); flen = min(folio_size(folio) - offset, len); while (flen) { void *base = kmap_local_folio(folio, offset); part = min_t(size_t, flen, PAGE_SIZE - offset_in_page(offset)); remain = step(base, progress, part, priv, priv2); kunmap_local(base); consumed = part - remain; progress += consumed; len -= consumed; if (remain || len == 0) goto out; flen -= consumed; offset += consumed; } } out: rcu_read_unlock(); iter->iov_offset += progress; iter->count -= progress; return progress; } /* * Handle ITER_DISCARD. 
*/ static __always_inline size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { size_t progress = len; iter->count -= progress; return progress; } /** * iterate_and_advance2 - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * Two step functions, @step and @ustep, must be provided, one for handling * mapped kernel addresses and the other is given user addresses which have the * potential to fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). 
*/ static __always_inline size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_ustep_f ustep, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (likely(iter_is_ubuf(iter))) return iterate_ubuf(iter, len, priv, priv2, ustep); if (likely(iter_is_iovec(iter))) return iterate_iovec(iter, len, priv, priv2, ustep); if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } /** * iterate_and_advance - Iterate over an iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @ustep: Function for UBUF/IOVEC iterators; given __user addresses. * @step: Function for other iterators; given kernel addresses. * * As iterate_and_advance2(), but priv2 is always NULL. */ static __always_inline size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv, iov_ustep_f ustep, iov_step_f step) { return iterate_and_advance2(iter, len, priv, NULL, ustep, step); } /** * iterate_and_advance_kernel - Iterate over a kernel-internal iterator * @iter: The iterator to iterate over. * @len: The amount to iterate over. * @priv: Data for the step functions. * @priv2: More data for the step functions. * @step: Function for other iterators; given kernel addresses. * * Iterate over the next part of an iterator, up to the specified length. The * buffer is presented in segments, which for kernel iteration are broken up by * physical pages and mapped, with the mapped address being presented. * * [!] 
Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type * iterators; it will not handle UBUF or IOVEC-type iterators. * * A step functions, @step, must be provided, one for handling mapped kernel * addresses and the other is given user addresses which have the potential to * fault since no pinning is performed. * * The step functions are passed the address and length of the segment, @priv, * @priv2 and the amount of data so far iterated over (which can, for example, * be added to @priv to point to the right part of a second buffer). The step * functions should return the amount of the segment they didn't process (ie. 0 * indicates complete processsing). * * This function returns the amount of data processed (ie. 0 means nothing was * processed and the value of @len means processes to completion). */ static __always_inline size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv, void *priv2, iov_step_f step) { if (unlikely(iter->count < len)) len = iter->count; if (unlikely(!len)) return 0; if (iov_iter_is_bvec(iter)) return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); if (iov_iter_is_folioq(iter)) return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); } #endif /* _LINUX_IOV_ITER_H */
68 66 49 66 66 68 68 68 67 68 68 68 66 49 49 49 66 66 49 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 
513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 
1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 
1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
 * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
 */

#include <crypto/curve25519.h>
#include <crypto/internal/kpp.h>

#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/scatterlist.h>

#include <asm/cpufeature.h>
#include <asm/processor.h>

/* Branch-free equality test: returns a mask with all 64 bits set when
 * a == b and zero otherwise.  x | -x has its top bit set exactly when
 * x = a ^ b is non-zero; that bit is moved to bit 0 and one is subtracted. */
static __always_inline u64 eq_mask(u64 a, u64 b)
{
	u64 x = a ^ b;
	u64 minus_x = ~x + (u64)1U;
	u64 x_or_minus_x = x | minus_x;
	u64 xnx = x_or_minus_x >> (u32)63U;
	return xnx - (u64)1U;
}

/* Branch-free unsigned comparison: returns a mask with all 64 bits set when
 * a >= b and zero otherwise.  Only xor, or, subtract and shift are used, so
 * the C code contains no data dependent branch. */
static __always_inline u64 gte_mask(u64 a, u64 b)
{
	u64 x = a;
	u64 y = b;
	u64 x_xor_y = x ^ y;
	u64 x_sub_y = x - y;
	u64 x_sub_y_xor_y = x_sub_y ^ y;
	u64 q = x_xor_y | x_sub_y_xor_y;
	u64 x_xor_q = x ^ q;
	u64 x_xor_q_ = x_xor_q >> (u32)63U;
	return x_xor_q_ - (u64)1U;
}

/* Computes the addition of four-element f1 with value in f2
 * and returns the carry (if any)
 *
 * Operand map of the asm below: %0 = f2 (read and written), %1 = carry_r,
 * %2 = out, %3 = f1.  r8-r11 are used as scratch and are listed as
 * clobbers. */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
	u64 carry_r;

	asm volatile(
		/* Clear registers to propagate the carry bit */
		" xor %%r8d, %%r8d;"
		" xor %%r9d, %%r9d;"
		" xor %%r10d, %%r10d;"
		" xor %%r11d, %%r11d;"
		" xor %k1, %k1;"

		/* Begin addition chain */
		" addq 0(%3), %0;"
		" movq %0, 0(%2);"
		" adcxq 8(%3), %%r8;"
		" movq %%r8, 8(%2);"
		" adcxq 16(%3), %%r9;"
		" movq %%r9, 16(%2);"
		" adcxq 24(%3), %%r10;"
		" movq %%r10, 24(%2);"

		/* Return the carry bit in a register */
		" adcx %%r11, %1;"
		: "+&r"(f2), "=&r"(carry_r)
		: "r"(out), "r"(f1)
		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");

	return carry_r;
}

/* Computes the field addition of two field elements
 *
 * Operand map of the asm below: %0 = f2, %1 = out, %2 = f1.  The register
 * holding f2 is reused as scratch for the constant 38 once all four limbs
 * have been loaded, which is why f2 is declared as a read-write operand. */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw addition of f1 + f2 */
		" movq 0(%0), %%r8;"
		" addq 0(%2), %%r8;"
		" movq 8(%0), %%r9;"
		" adcxq 8(%2), %%r9;"
		" movq 16(%0), %%r10;"
		" adcxq 16(%2), %%r10;"
		" movq 24(%0), %%r11;"
		" adcxq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		" mov $0, %%rax;"
		" mov $38, %0;"
		" cmovc %0, %%rax;"

		/* Step 2: Add carry*38 to the original sum */
		" xor %%ecx, %%ecx;"
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %0, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"
		: "+&r"(f2)
		: "r"(out), "r"(f1)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

/* Computes the field subtraction of two field elements
 *
 * Operand map of the asm below: %0 = out, %1 = f1, %2 = f2; all three are
 * input-only operands, the result is written through the out pointer. */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw subtraction of f1-f2 */
		" movq 0(%1), %%r8;"
		" subq 0(%2), %%r8;"
		" movq 8(%1), %%r9;"
		" sbbq 8(%2), %%r9;"
		" movq 16(%1), %%r10;"
		" sbbq 16(%2), %%r10;"
		" movq 24(%1), %%r11;"
		" sbbq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		" mov $0, %%rax;"
		" mov $38, %%rcx;"
		" cmovc %%rcx, %%rax;"

		/* Step 2: Subtract carry*38 from the original difference */
		" sub %%rax, %%r8;"
		" sbb $0, %%r9;"
		" sbb $0, %%r10;"
		" sbb $0, %%r11;"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rcx, %%rax;"
		" sub %%rax, %%r8;"

		/* Store the result */
		" movq %%r8, 0(%0);"
		" movq %%r9, 8(%0);"
		" movq %%r10, 16(%0);"
		" movq %%r11, 24(%0);"
		:
		: "r"(out), "r"(f1), "r"(f2)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

/* Computes a field multiplication: out <- f1 * f2 * Uses the 8-element buffer tmp for intermediate results */ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) { asm volatile( /* Compute the raw multiplication: tmp <- src1 * src2 */ /* Compute src1[0] * src2 */ " movq 0(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " 
mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 8(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%2), %%r8;" " movq %%r8, 8(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 16(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%2), %%r8;" " movq %%r8, 16(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 24(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%2), %%r8;" " movq %%r8, 24(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%2);" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%2);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%2);" /* Line up pointers */ " mov %2, %0;" " mov %3, %2;" /* Wrap the result back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%0), %%r8, %%r13;" " xor %k1, %k1;" " adoxq 0(%0), %%r8;" " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 8(%0), %%r9;" " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 16(%0), %%r10;" " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%0), %%r11;" " adcx %1, 
%%rax;" " adox %1, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %1, %%r9;" " movq %%r9, 8(%2);" " adcx %1, %%r10;" " movq %%r10, 16(%2);" " adcx %1, %%r11;" " movq %%r11, 24(%2);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%2);" : "+&r"(f1), "+&r"(f2), "+&r"(tmp) : "r"(out) : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "memory", "cc"); } /* Computes two field multiplications: * out[0] <- f1[0] * f2[0] * out[1] <- f1[1] * f2[1] * Uses the 16-element buffer tmp for intermediate results: */ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) { asm volatile( /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */ /* Compute src1[0] * src2 */ " movq 0(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 8(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%2), %%r8;" " movq %%r8, 8(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 16(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%2), %%r8;" " movq %%r8, 16(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 
24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 24(%0), %%rdx;" " mulxq 0(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%2), %%r8;" " movq %%r8, 24(%2);" " mulxq 8(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%2);" " mulxq 16(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%2);" " mov $0, %%r8;" " mulxq 24(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%2);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%2);" /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */ /* Compute src1[0] * src2 */ " movq 32(%0), %%rdx;" " mulxq 32(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%2);" " mulxq 40(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%2);" " mulxq 48(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " mulxq 56(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 40(%0), %%rdx;" " mulxq 32(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%2), %%r8;" " movq %%r8, 72(%2);" " mulxq 40(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%2);" " mulxq 48(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 48(%0), %%rdx;" " mulxq 32(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%2), %%r8;" " movq %%r8, 80(%2);" " mulxq 40(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%2);" " mulxq 48(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, 
%%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 56(%0), %%rdx;" " mulxq 32(%1), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%2), %%r8;" " movq %%r8, 88(%2);" " mulxq 40(%1), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%2);" " mulxq 48(%1), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%2);" " mov $0, %%r8;" " mulxq 56(%1), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%2);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%2);" /* Line up pointers */ " mov %2, %0;" " mov %3, %2;" /* Wrap the results back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%0), %%r8, %%r13;" " xor %k1, %k1;" " adoxq 0(%0), %%r8;" " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 8(%0), %%r9;" " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 16(%0), %%r10;" " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%0), %%r11;" " adcx %1, %%rax;" " adox %1, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %1, %%r9;" " movq %%r9, 8(%2);" " adcx %1, %%r10;" " movq %%r10, 16(%2);" " adcx %1, %%r11;" " movq %%r11, 24(%2);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%2);" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 96(%0), %%r8, %%r13;" " xor %k1, %k1;" " adoxq 64(%0), %%r8;" " mulxq 104(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 72(%0), %%r9;" " mulxq 112(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 80(%0), %%r10;" " mulxq 120(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 88(%0), %%r11;" " adcx %1, %%rax;" " adox %1, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %1, %%r9;" " movq %%r9, 40(%2);" " adcx 
%1, %%r10;" " movq %%r10, 48(%2);" " adcx %1, %%r11;" " movq %%r11, 56(%2);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 32(%2);" : "+&r"(f1), "+&r"(f2), "+&r"(tmp) : "r"(out) : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "memory", "cc"); } /* Computes the field multiplication of four-element f1 with value in f2 * Requires f2 to be smaller than 2^17 */ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2) { register u64 f2_r asm("rdx") = f2; asm volatile( /* Compute the raw multiplication of f1*f2 */ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */ " add %%rcx, %%r9;" " mov $0, %%rcx;" " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */ " adcx %%rbx, %%r10;" " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */ " adcx %%r13, %%r11;" " adcx %%rcx, %%rax;" /* Wrap the result back into the field */ /* Step 1: Compute carry*38 */ " mov $38, %%rdx;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 16(%1);" " adcx %%rcx, %%r11;" " movq %%r11, 24(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%1);" : "+&r"(f2_r) : "r"(out), "r"(f1) : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13", "memory", "cc"); } /* Computes p1 <- bit ? 
p2 : p1 in constant time */
/*
 * More precisely, cswap2 conditionally SWAPS the eight 64-bit words at p1
 * with the eight 64-bit words at p2 (byte offsets 0..56 of each pointer).
 *
 * Adding 2^64 - 1 to bit sets the carry flag exactly when bit is non-zero.
 * Every word pair is then loaded, exchanged with two cmovc instructions and
 * stored back to both locations, so the same loads and stores are executed
 * whatever the value of bit. The mov/cmov instructions in between do not
 * modify the flags, so the single CF produced at the top governs all eight
 * pairs.
 *
 * Asm operands: %0 = bit (modified by the add, hence "+&r"), %1 = p1,
 * %2 = p2.
 * NOTE(review): p1 and p2 are const-qualified in the signature although the
 * asm stores through both of them.
 */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
	asm volatile(
		/* Transfer bit into CF flag */
		" add $18446744073709551615, %0;"

		/* cswap p1[0], p2[0] */
		" movq 0(%1), %%r8;"
		" movq 0(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 0(%1);"
		" movq %%r9, 0(%2);"

		/* cswap p1[1], p2[1] */
		" movq 8(%1), %%r8;"
		" movq 8(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 8(%1);"
		" movq %%r9, 8(%2);"

		/* cswap p1[2], p2[2] */
		" movq 16(%1), %%r8;"
		" movq 16(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 16(%1);"
		" movq %%r9, 16(%2);"

		/* cswap p1[3], p2[3] */
		" movq 24(%1), %%r8;"
		" movq 24(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 24(%1);"
		" movq %%r9, 24(%2);"

		/* cswap p1[4], p2[4] */
		" movq 32(%1), %%r8;"
		" movq 32(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 32(%1);"
		" movq %%r9, 32(%2);"

		/* cswap p1[5], p2[5] */
		" movq 40(%1), %%r8;"
		" movq 40(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 40(%1);"
		" movq %%r9, 40(%2);"

		/* cswap p1[6], p2[6] */
		" movq 48(%1), %%r8;"
		" movq 48(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 48(%1);"
		" movq %%r9, 48(%2);"

		/* cswap p1[7], p2[7] */
		" movq 56(%1), %%r8;"
		" movq 56(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 56(%1);"
		" movq %%r9, 56(%2);"
	: "+&r"(bit)
	: "r"(p1), "r"(p2)
	: "%r8", "%r9", "%r10", "memory", "cc");
}

/*
 * Computes the square of a field element: out <- f * f
 * Uses the 8-element buffer tmp for intermediate results
 *
 * f and out each point at four 64-bit limbs; tmp receives the raw
 * eight-limb square at byte offsets 0..56.
 *
 * Asm operands on entry: %0 = f, %1 = tmp, %2 = out. Both pointer operands
 * are "+&r" because "Line up pointers" re-targets them (%0 <- tmp,
 * %1 <- out) before the reduction.
 *
 * The raw square is built from the six cross products f[i]*f[j] (i < j),
 * which are doubled by adding each accumulator register to itself, plus the
 * four squares f[i]^2. out is written only during the reduction, so out may
 * alias f (this file calls fsqr(o, o, tmp)).
 */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- f * f */

		/* Step 1: Compute all partial products */
		" movq 0(%0), %%rdx;"                                     /* f[0] */
		" mulxq 8(%0), %%r8, %%r14;" " xor %%r15d, %%r15d;"       /* f[1]*f[0] */
		" mulxq 16(%0), %%r9, %%r10;" " adcx %%r14, %%r9;"        /* f[2]*f[0] */
		" mulxq 24(%0), %%rax, %%rcx;" " adcx %%rax, %%r10;"      /* f[3]*f[0] */
		" movq 24(%0), %%rdx;"                                    /* f[3] */
		" mulxq 8(%0), %%r11, %%rbx;" " adcx %%rcx, %%r11;"       /* f[1]*f[3] */
		" mulxq 16(%0), %%rax, %%r13;" " adcx %%rax, %%rbx;"      /* f[2]*f[3] */
		" movq 8(%0), %%rdx;" " adcx %%r15, %%r13;"               /* f1 */
		" mulxq 16(%0), %%rax, %%rcx;" " mov $0, %%r14;"          /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		/* adox chain adds f[2]*f[1]; adcx chain doubles every accumulator */
		" xor %%r15d, %%r15d;"
		" adox %%rax, %%r10;"
		" adcx %%r8, %%r8;"
		" adox %%rcx, %%r11;"
		" adcx %%r9, %%r9;"
		" adox %%r15, %%rbx;"
		" adcx %%r10, %%r10;"
		" adox %%r15, %%r13;"
		" adcx %%r11, %%r11;"
		" adox %%r15, %%r14;"
		" adcx %%rbx, %%rbx;"
		" adcx %%r13, %%r13;"
		" adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		" movq 0(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"        /* f[0]^2 */
		" movq %%rax, 0(%1);"
		" add %%rcx, %%r8;" " movq %%r8, 8(%1);"
		" movq 8(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"        /* f[1]^2 */
		" adcx %%rax, %%r9;" " movq %%r9, 16(%1);"
		" adcx %%rcx, %%r10;" " movq %%r10, 24(%1);"
		" movq 16(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"       /* f[2]^2 */
		" adcx %%rax, %%r11;" " movq %%r11, 32(%1);"
		" adcx %%rcx, %%rbx;" " movq %%rbx, 40(%1);"
		" movq 24(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"       /* f[3]^2 */
		" adcx %%rax, %%r13;" " movq %%r13, 48(%1);"
		" adcx %%rcx, %%r14;" " movq %%r14, 56(%1);"

		/* Line up pointers */
		/* from here on %0 = tmp (raw square) and %1 = out */
		" mov %1, %0;"
		" mov %2, %1;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		/* rcx is zeroed and used as the zero addend of both carry chains */
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;" " xor %%ecx, %%ecx;" " adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%0), %%r11;"
		" adcx %%rcx, %%rax;" " adox %%rcx, %%rax;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;" " movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;" " movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;" " movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"
	: "+&r"(f), "+&r"(tmp)
	: "r"(out)
	: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "%r15", "memory", "cc");
}

/*
 * Computes two field squarings:
 *   out[0] <- f[0] * f[0]
 *   out[1] <- f[1] * f[1]
 * Uses the 16-element buffer tmp for intermediate results
 *
 * Here f and out each hold two four-limb elements back to back (byte
 * offsets 0 and 32); the two raw eight-limb squares are written to tmp at
 * byte offsets 0 and 64.
 *
 * Asm operands on entry: %0 = f, %1 = tmp, %2 = out; after "Line up
 * pointers" %0 = tmp and %1 = out. Both raw squares are computed before out
 * is written, so out may alias f (this file calls
 * fsqr2(nq_p1, nq_p1, tmp2)).
 */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* First element: tmp[0..7] <- f[0..3]^2 */

		/* Step 1: Compute all partial products */
		" movq 0(%0), %%rdx;"                                     /* f[0] */
		" mulxq 8(%0), %%r8, %%r14;" " xor %%r15d, %%r15d;"       /* f[1]*f[0] */
		" mulxq 16(%0), %%r9, %%r10;" " adcx %%r14, %%r9;"        /* f[2]*f[0] */
		" mulxq 24(%0), %%rax, %%rcx;" " adcx %%rax, %%r10;"      /* f[3]*f[0] */
		" movq 24(%0), %%rdx;"                                    /* f[3] */
		" mulxq 8(%0), %%r11, %%rbx;" " adcx %%rcx, %%r11;"       /* f[1]*f[3] */
		" mulxq 16(%0), %%rax, %%r13;" " adcx %%rax, %%rbx;"      /* f[2]*f[3] */
		" movq 8(%0), %%rdx;" " adcx %%r15, %%r13;"               /* f1 */
		" mulxq 16(%0), %%rax, %%rcx;" " mov $0, %%r14;"          /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		" xor %%r15d, %%r15d;"
		" adox %%rax, %%r10;"
		" adcx %%r8, %%r8;"
		" adox %%rcx, %%r11;"
		" adcx %%r9, %%r9;"
		" adox %%r15, %%rbx;"
		" adcx %%r10, %%r10;"
		" adox %%r15, %%r13;"
		" adcx %%r11, %%r11;"
		" adox %%r15, %%r14;"
		" adcx %%rbx, %%rbx;"
		" adcx %%r13, %%r13;"
		" adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		" movq 0(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"        /* f[0]^2 */
		" movq %%rax, 0(%1);"
		" add %%rcx, %%r8;" " movq %%r8, 8(%1);"
		" movq 8(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"        /* f[1]^2 */
		" adcx %%rax, %%r9;" " movq %%r9, 16(%1);"
		" adcx %%rcx, %%r10;" " movq %%r10, 24(%1);"
		" movq 16(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"       /* f[2]^2 */
		" adcx %%rax, %%r11;" " movq %%r11, 32(%1);"
		" adcx %%rcx, %%rbx;" " movq %%rbx, 40(%1);"
		" movq 24(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"       /* f[3]^2 */
		" adcx %%rax, %%r13;" " movq %%r13, 48(%1);"
		" adcx %%rcx, %%r14;" " movq %%r14, 56(%1);"

		/* Second element: tmp[8..15] <- f[4..7]^2 */

		/* Step 1: Compute all partial products */
		" movq 32(%0), %%rdx;"                                    /* f[0] */
		" mulxq 40(%0), %%r8, %%r14;" " xor %%r15d, %%r15d;"      /* f[1]*f[0] */
		" mulxq 48(%0), %%r9, %%r10;" " adcx %%r14, %%r9;"        /* f[2]*f[0] */
		" mulxq 56(%0), %%rax, %%rcx;" " adcx %%rax, %%r10;"      /* f[3]*f[0] */
		" movq 56(%0), %%rdx;"                                    /* f[3] */
		" mulxq 40(%0), %%r11, %%rbx;" " adcx %%rcx, %%r11;"      /* f[1]*f[3] */
		" mulxq 48(%0), %%rax, %%r13;" " adcx %%rax, %%rbx;"      /* f[2]*f[3] */
		" movq 40(%0), %%rdx;" " adcx %%r15, %%r13;"              /* f1 */
		" mulxq 48(%0), %%rax, %%rcx;" " mov $0, %%r14;"          /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		" xor %%r15d, %%r15d;"
		" adox %%rax, %%r10;"
		" adcx %%r8, %%r8;"
		" adox %%rcx, %%r11;"
		" adcx %%r9, %%r9;"
		" adox %%r15, %%rbx;"
		" adcx %%r10, %%r10;"
		" adox %%r15, %%r13;"
		" adcx %%r11, %%r11;"
		" adox %%r15, %%r14;"
		" adcx %%rbx, %%rbx;"
		" adcx %%r13, %%r13;"
		" adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		" movq 32(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"       /* f[0]^2 */
		" movq %%rax, 64(%1);"
		" add %%rcx, %%r8;" " movq %%r8, 72(%1);"
		" movq 40(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"       /* f[1]^2 */
		" adcx %%rax, %%r9;" " movq %%r9, 80(%1);"
		" adcx %%rcx, %%r10;" " movq %%r10, 88(%1);"
		" movq 48(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"       /* f[2]^2 */
		" adcx %%rax, %%r11;" " movq %%r11, 96(%1);"
		" adcx %%rcx, %%rbx;" " movq %%rbx, 104(%1);"
		" movq 56(%0), %%rdx;" " mulx %%rdx, %%rax, %%rcx;"       /* f[3]^2 */
		" adcx %%rax, %%r13;" " movq %%r13, 112(%1);"
		" adcx %%rcx, %%r14;" " movq %%r14, 120(%1);"

		/* Line up pointers */
		/* from here on %0 = tmp (raw squares) and %1 = out */
		" mov %1, %0;"
		" mov %2, %1;"

		/* First element: reduce tmp[0..7] into out[0..3] */
		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;" " xor %%ecx, %%ecx;" " adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%0), %%r11;"
		" adcx %%rcx, %%rax;" " adox %%rcx, %%rax;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;" " movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;" " movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;" " movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"

		/* Second element: reduce tmp[8..15] into out[4..7] */
		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		" mov $38, %%rdx;"
		" mulxq 96(%0), %%r8, %%r13;" " xor %%ecx, %%ecx;" " adoxq 64(%0), %%r8;"
		" mulxq 104(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" " adoxq 72(%0), %%r9;"
		" mulxq 112(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" " adoxq 80(%0), %%r10;"
		" mulxq 120(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 88(%0), %%r11;"
		" adcx %%rcx, %%rax;" " adox %%rcx, %%rax;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;" " movq %%r9, 40(%1);"
		" adcx %%rcx, %%r10;" " movq %%r10, 48(%1);"
		" adcx %%rcx, %%r11;" " movq %%r11, 56(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 32(%1);"
	: "+&r"(f), "+&r"(tmp)
	: "r"(out)
	: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "%r15", "memory", "cc");
}

/*
 * point_add_and_double: one combined step on the two working points stored
 * in p01_tmp1 (judging by the names, a differential addition plus a
 * doubling -- NOTE(review): confirm against the ladder formulas).
 *
 * q:        x1 is read from q[0..3] (used in the final fmul).
 * p01_tmp1: words  0..7   nq     = (x2, z2), overwritten with the result of
 *                                 the last fmul2;
 *           words  8..15  nq_p1  = (x3, z3), overwritten in place;
 *           words 16..31  tmp1, scratch split into a, b, d, c (4 words each;
 *                                 ab = first 8 words, dc = last 8 words).
 * tmp2:     scratch handed to fmul/fmul2/fsqr2 (fmul2 and fsqr2 document a
 *           16-element buffer).
 */
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
	u64 *nq = p01_tmp1;
	u64 *nq_p1 = p01_tmp1 + (u32)8U;
	u64 *tmp1 = p01_tmp1 + (u32)16U;
	u64 *x1 = q;
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *z3 = nq_p1 + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *ab = tmp1;
	u64 *dc = tmp1 + (u32)8U;
	u64 *x3;
	u64 *z31;
	u64 *d0;
	u64 *c0;
	u64 *a1;
	u64 *b1;
	u64 *d;
	u64 *c;
	u64 *ab1;
	u64 *dc1;
	/* a = x2 + z2, b = x2 - z2 */
	fadd(a, x2, z2);
	fsub(b, x2, 
z2); x3 = nq_p1; z31 = nq_p1 + (u32)4U; d0 = dc; c0 = dc + (u32)4U; fadd(c0, x3, z31); fsub(d0, x3, z31); fmul2(dc, dc, ab, tmp2); fadd(x3, d0, c0); fsub(z31, d0, c0); a1 = tmp1; b1 = tmp1 + (u32)4U; d = tmp1 + (u32)8U; c = tmp1 + (u32)12U; ab1 = tmp1; dc1 = tmp1 + (u32)8U; fsqr2(dc1, ab1, tmp2); fsqr2(nq_p1, nq_p1, tmp2); a1[0U] = c[0U]; a1[1U] = c[1U]; a1[2U] = c[2U]; a1[3U] = c[3U]; fsub(c, d, c); fmul_scalar(b1, c, (u64)121665U); fadd(b1, b1, d); fmul2(nq, dc1, ab1, tmp2); fmul(z3, z3, x1, tmp2); } static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2) { u64 *x2 = nq; u64 *z2 = nq + (u32)4U; u64 *a = tmp1; u64 *b = tmp1 + (u32)4U; u64 *d = tmp1 + (u32)8U; u64 *c = tmp1 + (u32)12U; u64 *ab = tmp1; u64 *dc = tmp1 + (u32)8U; fadd(a, x2, z2); fsub(b, x2, z2); fsqr2(dc, ab, tmp2); a[0U] = c[0U]; a[1U] = c[1U]; a[2U] = c[2U]; a[3U] = c[3U]; fsub(c, d, c); fmul_scalar(b, c, (u64)121665U); fadd(b, b, d); fmul2(nq, dc, ab, tmp2); } static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1) { u64 tmp2[16U] = { 0U }; u64 p01_tmp1_swap[33U] = { 0U }; u64 *p0 = p01_tmp1_swap; u64 *p01 = p01_tmp1_swap; u64 *p03 = p01; u64 *p11 = p01 + (u32)8U; u64 *x0; u64 *z0; u64 *p01_tmp1; u64 *p01_tmp11; u64 *nq10; u64 *nq_p11; u64 *swap1; u64 sw0; u64 *nq1; u64 *tmp1; memcpy(p11, init1, (u32)8U * sizeof(init1[0U])); x0 = p03; z0 = p03 + (u32)4U; x0[0U] = (u64)1U; x0[1U] = (u64)0U; x0[2U] = (u64)0U; x0[3U] = (u64)0U; z0[0U] = (u64)0U; z0[1U] = (u64)0U; z0[2U] = (u64)0U; z0[3U] = (u64)0U; p01_tmp1 = p01_tmp1_swap; p01_tmp11 = p01_tmp1_swap; nq10 = p01_tmp1_swap; nq_p11 = p01_tmp1_swap + (u32)8U; swap1 = p01_tmp1_swap + (u32)32U; cswap2((u64)1U, nq10, nq_p11); point_add_and_double(init1, p01_tmp11, tmp2); swap1[0U] = (u64)1U; { u32 i; for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) { u64 *p01_tmp12 = p01_tmp1_swap; u64 *swap2 = p01_tmp1_swap + (u32)32U; u64 *nq2 = p01_tmp12; u64 *nq_p12 = p01_tmp12 + (u32)8U; u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % 
(u32)8U & (u8)1U); u64 sw = swap2[0U] ^ bit; cswap2(sw, nq2, nq_p12); point_add_and_double(init1, p01_tmp12, tmp2); swap2[0U] = bit; } } sw0 = swap1[0U]; cswap2(sw0, nq10, nq_p11); nq1 = p01_tmp1; tmp1 = p01_tmp1 + (u32)16U; point_double(nq1, tmp1, tmp2); point_double(nq1, tmp1, tmp2); point_double(nq1, tmp1, tmp2); memcpy(out, p0, (u32)8U * sizeof(p0[0U])); memzero_explicit(tmp2, sizeof(tmp2)); memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap)); } static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1) { u32 i; fsqr(o, inp, tmp); for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U) fsqr(o, o, tmp); } static void finv(u64 *o, const u64 *i, u64 *tmp) { u64 t1[16U] = { 0U }; u64 *a0 = t1; u64 *b = t1 + (u32)4U; u64 *c = t1 + (u32)8U; u64 *t00 = t1 + (u32)12U; u64 *tmp1 = tmp; u64 *a; u64 *t0; fsquare_times(a0, i, tmp1, (u32)1U); fsquare_times(t00, a0, tmp1, (u32)2U); fmul(b, t00, i, tmp); fmul(a0, b, a0, tmp); fsquare_times(t00, a0, tmp1, (u32)1U); fmul(b, t00, b, tmp); fsquare_times(t00, b, tmp1, (u32)5U); fmul(b, t00, b, tmp); fsquare_times(t00, b, tmp1, (u32)10U); fmul(c, t00, b, tmp); fsquare_times(t00, c, tmp1, (u32)20U); fmul(t00, t00, c, tmp); fsquare_times(t00, t00, tmp1, (u32)10U); fmul(b, t00, b, tmp); fsquare_times(t00, b, tmp1, (u32)50U); fmul(c, t00, b, tmp); fsquare_times(t00, c, tmp1, (u32)100U); fmul(t00, t00, c, tmp); fsquare_times(t00, t00, tmp1, (u32)50U); fmul(t00, t00, b, tmp); fsquare_times(t00, t00, tmp1, (u32)5U); a = t1; t0 = t1 + (u32)12U; fmul(o, t0, a, tmp); } static void store_felem(u64 *b, u64 *f) { u64 f30 = f[3U]; u64 top_bit0 = f30 >> (u32)63U; u64 f31; u64 top_bit; u64 f0; u64 f1; u64 f2; u64 f3; u64 m0; u64 m1; u64 m2; u64 m3; u64 mask; u64 f0_; u64 f1_; u64 f2_; u64 f3_; u64 o0; u64 o1; u64 o2; u64 o3; f[3U] = f30 & (u64)0x7fffffffffffffffU; add_scalar(f, f, (u64)19U * top_bit0); f31 = f[3U]; top_bit = f31 >> (u32)63U; f[3U] = f31 & (u64)0x7fffffffffffffffU; add_scalar(f, f, (u64)19U * top_bit); f0 = f[0U]; f1 = 
f[1U]; f2 = f[2U]; f3 = f[3U]; m0 = gte_mask(f0, (u64)0xffffffffffffffedU); m1 = eq_mask(f1, (u64)0xffffffffffffffffU); m2 = eq_mask(f2, (u64)0xffffffffffffffffU); m3 = eq_mask(f3, (u64)0x7fffffffffffffffU); mask = ((m0 & m1) & m2) & m3; f0_ = f0 - (mask & (u64)0xffffffffffffffedU); f1_ = f1 - (mask & (u64)0xffffffffffffffffU); f2_ = f2 - (mask & (u64)0xffffffffffffffffU); f3_ = f3 - (mask & (u64)0x7fffffffffffffffU); o0 = f0_; o1 = f1_; o2 = f2_; o3 = f3_; b[0U] = o0; b[1U] = o1; b[2U] = o2; b[3U] = o3; } static void encode_point(u8 *o, const u64 *i) { const u64 *x = i; const u64 *z = i + (u32)4U; u64 tmp[4U] = { 0U }; u64 tmp_w[16U] = { 0U }; finv(tmp, z, tmp_w); fmul(tmp, tmp, x, tmp_w); store_felem((u64 *)o, tmp); } static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub) { u64 init1[8U] = { 0U }; u64 tmp[4U] = { 0U }; u64 tmp3; u64 *x; u64 *z; { u32 i; for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) { u64 *os = tmp; const u8 *bj = pub + i * (u32)8U; u64 u = *(u64 *)bj; u64 r = u; u64 x0 = r; os[i] = x0; } } tmp3 = tmp[3U]; tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU; x = init1; z = init1 + (u32)4U; z[0U] = (u64)1U; z[1U] = (u64)0U; z[2U] = (u64)0U; z[3U] = (u64)0U; x[0U] = tmp[0U]; x[1U] = tmp[1U]; x[2U] = tmp[2U]; x[3U] = tmp[3U]; montgomery_ladder(init1, priv, init1); encode_point(out, init1); } /* The below constants were generated using this sage script: * * #!/usr/bin/env sage * import sys * from sage.all import * * def limbs(n): * n = int(n) * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64) * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0]) * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0] * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s)) * print("static const u64 table_ladder[] = {") * p = ec.lift_x(9) * for i in range(252): * l = (p[0] + p[2]) / (p[0] - p[2]) * print(("\t%s" + ("," if i != 251 else "")) % limbs(l)) * p = p * 
2 * print("};") * */

/*
 * p_minus_s: four 64-bit limbs, least-significant limb first, of the first
 * coordinate of (ec.lift_x(9) - ec.lift_x(1)) as printed by the generator
 * script above.
 */
static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };

/*
 * table_ladder: per the generator script above, 252 entries of four limbs
 * each (least-significant limb first); entry i holds
 * (p[0] + p[2]) / (p[0] - p[2]) for p = 2^i * ec.lift_x(9).
 */
static const u64 table_ladder[] = {
	0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
	0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
	0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
	0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
	0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
	0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
	0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
	0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
	0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
	0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
	0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
	0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
	0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
	0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
	0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
	0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
	0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
	0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
	0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
	0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 
0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL, 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL, 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL, 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL, 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL, 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL, 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL, 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL, 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL, 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL, 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL, 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL, 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL, 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL, 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL, 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL, 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL, 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL, 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL, 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL, 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL, 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL, 
0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL, 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL, 0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL, 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL, 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL, 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL, 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL, 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL, 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL, 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL, 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL, 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL, 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL, 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL, 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL, 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL, 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL, 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL, 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL, 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL, 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL, 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 
0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL, 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL, 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL, 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL, 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL, 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL, 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL, 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL, 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL, 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL, 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL, 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL, 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL, 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL, 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL, 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL, 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL, 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL, 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL, 0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL, 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL, 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL, 
0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL, 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL, 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL, 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL, 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL, 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL, 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL, 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL, 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL, 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL, 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL, 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL, 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL, 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL, 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL, 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL, 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL, 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL, 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL, 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL, 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL, 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 
0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL, 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL, 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL, 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL, 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL, 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL, 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL, 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL, 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL, 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL, 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL, 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL, 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL, 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL, 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL, 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL, 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL, 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL, 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL, 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL, 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL, 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL, 
0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL, 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL, 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL, 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL, 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL, 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL, 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL, 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL, 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL, 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL, 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL, 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL, 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL, 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL, 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL, 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL, 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL, 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL, 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL, 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL, 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL, 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 
0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL, 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL, 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL, 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL, 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL, 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL, 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL, 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL, 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL, 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL, 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL, 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL, 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL, 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL, 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL, 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL, 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL, 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL, 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL, 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL, 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL, 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL, 
0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL, 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL, 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL, 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL, 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL, 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL, 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL, 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL, 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL, 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL, 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL, 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL, 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL, 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL, 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL, 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL, 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL, 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL, 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL, 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL, 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL, 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 
0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL, 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL, 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL, 0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL, 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL, 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL, 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL, 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL, 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL, 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL, 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL, 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL, 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL, 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL, 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL, 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL, 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL, 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL, 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL, 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL, 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL, 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL, 
0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL, 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL, 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL, 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL, 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL, 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL, 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL, 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL, 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL, 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL, 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL, 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL, 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL, 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL, 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL, 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL, 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL, 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL, 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL, 0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL, 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL, 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 
0x645b426f3d1d58acULL, 0x4804a82227a557bcULL, 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL, 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL, 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL, 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL, 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL, 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL, 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL, 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL, 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL, 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL, 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL, 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL, 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL, 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL, 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL, 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL, 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL }; static void curve25519_ever64_base(u8 *out, const u8 *priv) { u64 swap = 1; int i, j, k; u64 tmp[16 + 32 + 4]; u64 *x1 = &tmp[0]; u64 *z1 = &tmp[4]; u64 *x2 = &tmp[8]; u64 *z2 = &tmp[12]; u64 *xz1 = &tmp[0]; u64 *xz2 = &tmp[8]; u64 *a = &tmp[0 + 16]; u64 *b = &tmp[4 + 16]; u64 *c = &tmp[8 + 16]; u64 *ab = &tmp[0 + 16]; u64 *abcd = &tmp[0 + 16]; u64 *ef = &tmp[16 + 16]; u64 *efgh = 
&tmp[16 + 16]; u64 *key = &tmp[0 + 16 + 32]; memcpy(key, priv, 32); ((u8 *)key)[0] &= 248; ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64; x1[0] = 1, x1[1] = x1[2] = x1[3] = 0; z1[0] = 1, z1[1] = z1[2] = z1[3] = 0; z2[0] = 1, z2[1] = z2[2] = z2[3] = 0; memcpy(x2, p_minus_s, sizeof(p_minus_s)); j = 3; for (i = 0; i < 4; ++i) { while (j < (const int[]){ 64, 64, 64, 63 }[i]) { u64 bit = (key[i] >> j) & 1; k = (64 * i + j - 3); swap = swap ^ bit; cswap2(swap, xz1, xz2); swap = bit; fsub(b, x1, z1); fadd(a, x1, z1); fmul(c, &table_ladder[4 * k], b, ef); fsub(b, a, c); fadd(a, a, c); fsqr2(ab, ab, efgh); fmul2(xz1, xz2, ab, efgh); ++j; } j = 0; } point_double(xz1, abcd, efgh); point_double(xz1, abcd, efgh); point_double(xz1, abcd, efgh); encode_point(out, xz1); memzero_explicit(tmp, sizeof(tmp)); } static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx); void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE], const u8 basepoint[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&curve25519_use_bmi2_adx)) curve25519_ever64(mypublic, secret, basepoint); else curve25519_generic(mypublic, secret, basepoint); } EXPORT_SYMBOL(curve25519_arch); void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], const u8 secret[CURVE25519_KEY_SIZE]) { if (static_branch_likely(&curve25519_use_bmi2_adx)) curve25519_ever64_base(pub, secret); else curve25519_generic(pub, secret, curve25519_base_point); } EXPORT_SYMBOL(curve25519_base_arch); static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, unsigned int len) { u8 *secret = kpp_tfm_ctx(tfm); if (!len) curve25519_generate_secret(secret); else if (len == CURVE25519_KEY_SIZE && crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) memcpy(secret, buf, CURVE25519_KEY_SIZE); else return -EINVAL; return 0; } static int curve25519_generate_public_key(struct kpp_request *req) { struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); const u8 *secret = kpp_tfm_ctx(tfm); u8 
buf[CURVE25519_KEY_SIZE]; int copied, nbytes; if (req->src) return -EINVAL; curve25519_base_arch(buf, secret); /* might want less than we've got */ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, nbytes), buf, nbytes); if (copied != nbytes) return -EINVAL; return 0; } static int curve25519_compute_shared_secret(struct kpp_request *req) { struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); const u8 *secret = kpp_tfm_ctx(tfm); u8 public_key[CURVE25519_KEY_SIZE]; u8 buf[CURVE25519_KEY_SIZE]; int copied, nbytes; if (!req->src) return -EINVAL; copied = sg_copy_to_buffer(req->src, sg_nents_for_len(req->src, CURVE25519_KEY_SIZE), public_key, CURVE25519_KEY_SIZE); if (copied != CURVE25519_KEY_SIZE) return -EINVAL; curve25519_arch(buf, secret, public_key); /* might want less than we've got */ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, nbytes), buf, nbytes); if (copied != nbytes) return -EINVAL; return 0; } static unsigned int curve25519_max_size(struct crypto_kpp *tfm) { return CURVE25519_KEY_SIZE; } static struct kpp_alg curve25519_alg = { .base.cra_name = "curve25519", .base.cra_driver_name = "curve25519-x86", .base.cra_priority = 200, .base.cra_module = THIS_MODULE, .base.cra_ctxsize = CURVE25519_KEY_SIZE, .set_secret = curve25519_set_secret, .generate_public_key = curve25519_generate_public_key, .compute_shared_secret = curve25519_compute_shared_secret, .max_size = curve25519_max_size, }; static int __init curve25519_mod_init(void) { if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) static_branch_enable(&curve25519_use_bmi2_adx); else return 0; return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? 
crypto_register_kpp(&curve25519_alg) : 0; } static void __exit curve25519_mod_exit(void) { if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && static_branch_likely(&curve25519_use_bmi2_adx)) crypto_unregister_kpp(&curve25519_alg); } module_init(curve25519_mod_init); module_exit(curve25519_mod_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-x86"); MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/**
 * file phonet.h
 *
 * Phonet sockets kernel interface
 *
 * Copyright (C) 2008 Nokia Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA
 */

#ifndef _UAPILINUX_PHONET_H
#define _UAPILINUX_PHONET_H

#include <linux/types.h>
#include <linux/socket.h>

/* Automatic protocol selection */
#define PN_PROTO_TRANSPORT	0
/* Phonet datagram socket */
#define PN_PROTO_PHONET		1
/* Phonet pipe */
#define PN_PROTO_PIPE		2
#define PHONET_NPROTO		3

/* Socket options for SOL_PNPIPE level */
#define PNPIPE_ENCAP		1
#define PNPIPE_IFINDEX		2
#define PNPIPE_HANDLE		3
#define PNPIPE_INITSTATE	4

#define PNADDR_ANY		0
#define PNADDR_BROADCAST	0xFC
#define PNPORT_RESOURCE_ROUTING	0

/* Values for PNPIPE_ENCAP option */
#define PNPIPE_ENCAP_NONE	0
#define PNPIPE_ENCAP_IP		1

/* ioctls */
#define SIOCPNGETOBJECT		(SIOCPROTOPRIVATE + 0)
#define SIOCPNENABLEPIPE	(SIOCPROTOPRIVATE + 13)
#define SIOCPNADDRESOURCE	(SIOCPROTOPRIVATE + 14)
#define SIOCPNDELRESOURCE	(SIOCPROTOPRIVATE + 15)

/* Phonet protocol header */
struct phonethdr {
	__u8	pn_rdev;
	__u8	pn_sdev;
	__u8	pn_res;
	__be16	pn_length;
	__u8	pn_robj;
	__u8	pn_sobj;
} __attribute__((packed));

/* Common Phonet payload header */
struct phonetmsg {
	__u8	pn_trans_id;	/* transaction ID */
	__u8	pn_msg_id;	/* message type */
	union {
		struct {
			__u8	pn_submsg_id;	/* message subtype */
			__u8	pn_data[5];
		} base;
		struct {
			__u16	pn_e_res_id;	/* extended resource ID */
			__u8	pn_e_submsg_id;	/* message subtype */
			__u8	pn_e_data[3];
		} ext;
	} pn_msg_u;
};
#define PN_COMMON_MESSAGE	0xF0
#define PN_COMMGR		0x10
#define PN_PREFIX		0xE0 /* resource for extended messages */
#define pn_submsg_id		pn_msg_u.base.pn_submsg_id
#define pn_e_submsg_id		pn_msg_u.ext.pn_e_submsg_id
#define pn_e_res_id		pn_msg_u.ext.pn_e_res_id
#define pn_data			pn_msg_u.base.pn_data
#define pn_e_data		pn_msg_u.ext.pn_e_data

/* data for unreachable errors */
#define PN_COMM_SERVICE_NOT_IDENTIFIED_RESP	0x01
#define PN_COMM_ISA_ENTITY_NOT_REACHABLE_RESP	0x14
#define pn_orig_msg_id		pn_data[0]
#define pn_status		pn_data[1]
#define pn_e_orig_msg_id	pn_e_data[0]
#define pn_e_status		pn_e_data[1]

/* Phonet socket address structure */
struct sockaddr_pn {
	__kernel_sa_family_t spn_family;
	__u8 spn_obj;
	__u8 spn_dev;
	__u8 spn_resource;
	__u8 spn_zero[sizeof(struct sockaddr) - sizeof(__kernel_sa_family_t) - 3];
} __attribute__((packed));

/* Well known address */
#define PN_DEV_PC	0x10

/*
 * A 16-bit object handle is built from a device address in the upper byte
 * OR-ed with the low ten bits of a port; the helpers below compose a handle
 * and pick it apart again.
 */

/* Compose a handle: addr shifted into the high byte, OR-ed with port's low 10 bits. */
static inline __u16 pn_object(__u8 addr, __u16 port)
{
	__u16 low_bits = port & 0x3ff;

	return (addr << 8) | low_bits;
}

/* Low byte of a handle. */
static inline __u8 pn_obj(__u16 handle)
{
	return handle & 0xff;
}

/* High byte of a handle. */
static inline __u8 pn_dev(__u16 handle)
{
	return handle >> 8;
}

/* Low ten bits of a handle. */
static inline __u16 pn_port(__u16 handle)
{
	return handle & 0x3ff;
}

/* High byte of a handle with its two lowest bits masked off. */
static inline __u8 pn_addr(__u16 handle)
{
	__u16 high_byte = handle >> 8;

	return high_byte & 0xfc;
}

/* Replace the upper six bits of spn_dev with those of addr; keep the low two. */
static inline void pn_sockaddr_set_addr(struct sockaddr_pn *spn, __u8 addr)
{
	spn->spn_dev = (spn->spn_dev & 0x03) | (addr & 0xfc);
}

/*
 * Store a port: bits 8-9 go into the low two bits of spn_dev (its upper six
 * bits are kept), bits 0-7 go into spn_obj.
 */
static inline void pn_sockaddr_set_port(struct sockaddr_pn *spn, __u16 port)
{
	spn->spn_dev = (spn->spn_dev & 0xfc) | ((port >> 8) & 0x03);
	spn->spn_obj = port & 0xff;
}

/* Store a whole handle: high byte into spn_dev, low byte into spn_obj. */
static inline void pn_sockaddr_set_object(struct sockaddr_pn *spn,
						__u16 handle)
{
	spn->spn_obj = pn_obj(handle);
	spn->spn_dev = pn_dev(handle);
}

/* Store the resource byte. */
static inline void pn_sockaddr_set_resource(struct sockaddr_pn *spn,
						__u8 resource)
{
	spn->spn_resource = resource;
}

/* Upper six bits of spn_dev, low two bits cleared. */
static inline __u8 pn_sockaddr_get_addr(const struct sockaddr_pn *spn)
{
	return spn->spn_dev & 0xfc;
}

/* Ten-bit port: low two bits of spn_dev above the eight bits of spn_obj. */
static inline __u16 pn_sockaddr_get_port(const struct sockaddr_pn *spn)
{
	__u16 upper = (spn->spn_dev & 0x03) << 8;

	return upper | spn->spn_obj;
}

/* Handle rebuilt from spn_dev and spn_obj via pn_object(). */
static inline __u16 pn_sockaddr_get_object(const struct sockaddr_pn *spn)
{
	return pn_object(spn->spn_dev, spn->spn_obj);
}

/* Stored resource byte. */
static inline __u8 pn_sockaddr_get_resource(const struct sockaddr_pn *spn)
{
	return spn->spn_resource;
}

/* Phonet device ioctl requests */

#endif /* _UAPILINUX_PHONET_H */
2 13 44 15 49 45 1 2 2 78 37 52 5 1 18 61 36 73 56 126 134 113 6 68 2 2 80 2 59 54 6 48 47 103 30 332 1 209 142 62 186 37 36 131 19 12 36 34 3 24 36 36 26 26 26 26 122 4 2 1 1 1 2 6 1008 9 952 960 121 4 126 90 36 650 646 66 357 315 49 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 
468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 // SPDX-License-Identifier: GPL-2.0-or-later /* scm.c - Socket level control messages processing. * * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Alignment and value checking mods by Craig Metz */ #include <linux/module.h> #include <linux/signal.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/fcntl.h> #include <linux/net.h> #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/security.h> #include <linux/pid_namespace.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/io_uring.h> #include <linux/uaccess.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/compat.h> #include <net/scm.h> #include <net/cls_cgroup.h> #include <net/af_unix.h> /* * Only allow a user to send credentials, that they could set with * setu(g)id. 
*/ static __inline__ int scm_check_creds(struct ucred *creds) { const struct cred *cred = current_cred(); kuid_t uid = make_kuid(cred->user_ns, creds->uid); kgid_t gid = make_kgid(cred->user_ns, creds->gid); if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; if ((creds->pid == task_tgid_vnr(current) || ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) { return 0; } return -EPERM; } static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) { int *fdp = (int*)CMSG_DATA(cmsg); struct scm_fp_list *fpl = *fplp; struct file **fpp; int i, num; num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int); if (num <= 0) return 0; if (num > SCM_MAX_FD) return -EINVAL; if (!fpl) { fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT); if (!fpl) return -ENOMEM; *fplp = fpl; fpl->count = 0; fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; #if IS_ENABLED(CONFIG_UNIX) fpl->inflight = false; fpl->dead = false; fpl->edges = NULL; INIT_LIST_HEAD(&fpl->vertices); #endif } fpp = &fpl->fp[fpl->count]; if (fpl->count + num > fpl->max) return -EINVAL; /* * Verify the descriptors and increment the usage count. 
*/
	for (i=0; i< num; i++)
	{
		int fd = fdp[i];
		struct file *file;

		if (fd < 0 || !(file = fget_raw(fd)))
			return -EBADF;
		/* don't allow io_uring files */
		if (io_is_uring_fops(file)) {
			fput(file);
			return -EINVAL;
		}
		if (unix_get_socket(file))
			fpl->count_unix++;

		*fpp++ = file;
		fpl->count++;
	}

	if (!fpl->user)
		fpl->user = get_uid(current_user());

	return num;
}

/*
 * __scm_destroy - release the file list attached to an SCM cookie.
 * @scm: cookie whose ->fp list (if any) is torn down
 *
 * When @scm carries a file list, the list is first detached from the cookie
 * (scm->fp becomes NULL), then every file in it is fput() from the last slot
 * down to the first, the ->user reference is dropped with free_uid() and the
 * list itself is kfree()d.  Nothing happens when scm->fp is already NULL.
 */
void __scm_destroy(struct scm_cookie *scm)
{
	struct scm_fp_list *fpl = scm->fp;
	int i;

	if (fpl) {
		/* Detach first so the cookie no longer points at the list. */
		scm->fp = NULL;
		for (i=fpl->count-1; i>=0; i--)
			fput(fpl->fp[i]);
		free_uid(fpl->user);
		kfree(fpl);
	}
}
EXPORT_SYMBOL(__scm_destroy);

/*
 * __scm_send - parse the control messages of @msg into the cookie @p.
 * @sock: sending socket; only its ->ops (read once) is consulted
 * @msg:  message whose cmsghdr chain is walked
 * @p:    cookie that receives the file list and credentials
 *
 * Every control message must pass CMSG_OK(); messages whose level is not
 * SOL_SOCKET are otherwise ignored.  For SOL_SOCKET messages:
 *
 *  - SCM_RIGHTS is only accepted when sock->ops is set and its family is
 *    PF_UNIX; the payload is handed to scm_fp_copy() and a negative result
 *    is returned as the error.
 *  - SCM_CREDENTIALS must be exactly CMSG_LEN(sizeof(struct ucred)) long.
 *    The credentials are checked by scm_check_creds(), the pid reference in
 *    @p is replaced when it does not already match (-ESRCH if the pid cannot
 *    be found), and uid/gid are mapped through the current user namespace
 *    (-EINVAL if either mapping is invalid).
 *  - any other type is rejected with -EINVAL.
 *
 * Return: 0 on success.  On failure scm_destroy(@p) is called and a negative
 * errno is returned.
 */
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
{
	const struct proto_ops *ops = READ_ONCE(sock->ops);
	struct cmsghdr *cmsg;
	int err;

	for_each_cmsghdr(cmsg, msg) {
		/* Default error for every "goto error" that sets nothing else. */
		err = -EINVAL;

		/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
		/*
		 * The first check was omitted in <= 2.2.5. The reasoning was
		 * that parser checks cmsg_len in any case, so that
		 * additional check would be work duplication.
		 * But if cmsg_level is not SOL_SOCKET, we do not check
		 * for too short ancillary data object at all! Oops.
		 * OK, let's add it...
		 */
		if (!CMSG_OK(msg, cmsg))
			goto error;

		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;

		switch (cmsg->cmsg_type)
		{
		case SCM_RIGHTS:
			if (!ops || ops->family != PF_UNIX)
				goto error;
			err=scm_fp_copy(cmsg, &p->fp);
			if (err<0)
				goto error;
			break;
		case SCM_CREDENTIALS:
		{
			struct ucred creds;
			kuid_t uid;
			kgid_t gid;
			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
				goto error;
			memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
			err = scm_check_creds(&creds);
			if (err)
				goto error;

			p->creds.pid = creds.pid;
			/*
			 * Only look the pid up again when the cookie has no
			 * pid yet or holds one whose number differs.
			 */
			if (!p->pid || pid_vnr(p->pid) != creds.pid) {
				struct pid *pid;
				err = -ESRCH;
				pid = find_get_pid(creds.pid);
				if (!pid)
					goto error;
				/* Swap the old reference for the new one. */
				put_pid(p->pid);
				p->pid = pid;
			}

			err = -EINVAL;
			uid = make_kuid(current_user_ns(), creds.uid);
			gid = make_kgid(current_user_ns(), creds.gid);
			if (!uid_valid(uid) || !gid_valid(gid))
				goto error;

			p->creds.uid = uid;
			p->creds.gid = gid;
			break;
		}
		default:
			goto error;
		}
	}

	/* A file list that ended up with no entries is not kept around. */
	if (p->fp && !p->fp->count)
	{
		kfree(p->fp);
		p->fp = NULL;
	}
	return 0;

error:
	scm_destroy(p);
	return err;
}
EXPORT_SYMBOL(__scm_send);

/*
 * put_cmsg - append one control message to the control buffer of @msg.
 * @msg:   message whose msg_control / msg_controllen describe the buffer
 * @level: value stored in cmsg_level
 * @type:  value stored in cmsg_type
 * @len:   payload length in bytes
 * @data:  payload
 *
 * Messages flagged MSG_CMSG_COMPAT are delegated to put_cmsg_compat().
 * If there is no control buffer, or it cannot even hold a header, only
 * MSG_CTRUNC is set and 0 is returned.  If the buffer is shorter than
 * CMSG_LEN(@len), MSG_CTRUNC is set and the message is truncated to the
 * remaining space.  The header and payload are written either through the
 * user-access primitives or with plain stores/memcpy(), depending on
 * msg_control_is_user.  Afterwards the control pointer and remaining length
 * are advanced by min(CMSG_SPACE(@len), remaining length).
 *
 * Return: 0 on success (including the truncated cases above), -EFAULT when
 * the user-space write fails, or whatever put_cmsg_compat() returns.
 */
int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
{
	int cmlen = CMSG_LEN(len);

	if (msg->msg_flags & MSG_CMSG_COMPAT)
		return put_cmsg_compat(msg, level, type, len, data);

	if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) {
		msg->msg_flags |= MSG_CTRUNC;
		return 0; /* XXX: return error? check spec. */
	}
	if (msg->msg_controllen < cmlen) {
		msg->msg_flags |= MSG_CTRUNC;
		/* Emit as much of the message as still fits. */
		cmlen = msg->msg_controllen;
	}

	if (msg->msg_control_is_user) {
		struct cmsghdr __user *cm = msg->msg_control_user;

		check_object_size(data, cmlen - sizeof(*cm), true);

		if (!user_write_access_begin(cm, cmlen))
			goto efault;

		/* Each unsafe_* helper jumps to efault_end on failure. */
		unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
		unsafe_put_user(level, &cm->cmsg_level, efault_end);
		unsafe_put_user(type, &cm->cmsg_type, efault_end);
		unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
				    cmlen - sizeof(*cm), efault_end);
		user_write_access_end();
	} else {
		struct cmsghdr *cm = msg->msg_control;

		cm->cmsg_level = level;
		cm->cmsg_type = type;
		cm->cmsg_len = cmlen;
		memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm));
	}

	/* Consume the padded size, but never more than what is left. */
	cmlen = min(CMSG_SPACE(len), msg->msg_controllen);
	if (msg->msg_control_is_user)
		msg->msg_control_user += cmlen;
	else
		msg->msg_control += cmlen;
	msg->msg_controllen -= cmlen;
	return 0;

efault_end:
	/* The user-access section was opened; close it before bailing out. */
	user_write_access_end();
efault:
	return -EFAULT;
}
EXPORT_SYMBOL(put_cmsg);

/*
 * put_cmsg_notrunc - like put_cmsg(), but refuse to truncate.
 *
 * Return: -ETOOSMALL when there is no control buffer or fewer than
 * CMSG_LEN(@len) bytes remain in it; otherwise the result of put_cmsg().
 */
int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len,
		     void *data)
{
	/* Don't produce truncated CMSGs */
	if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len))
		return -ETOOSMALL;

	return put_cmsg(msg, level, type, len, data);
}

/*
 * put_cmsg_scm_timestamping64 - emit timestamps as SO_TIMESTAMPING_NEW.
 *
 * Copies tv_sec/tv_nsec of every entry of @tss_internal->ts into a local
 * struct scm_timestamping64 and passes it to put_cmsg() at level SOL_SOCKET.
 * The return value of put_cmsg() is not examined.
 */
void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
{
	struct scm_timestamping64 tss;
	int i;

	for (i = 0; i < ARRAY_SIZE(tss.ts); i++) {
		tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec;
		tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec;
	}

	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss);
}
EXPORT_SYMBOL(put_cmsg_scm_timestamping64);

/*
 * put_cmsg_scm_timestamping - emit timestamps as SO_TIMESTAMPING_OLD.
 *
 * Same as put_cmsg_scm_timestamping64() but fills a struct scm_timestamping
 * and uses the SO_TIMESTAMPING_OLD message type.
 */
void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
{
	struct scm_timestamping tss;
	int i;

	for (i = 0; i < ARRAY_SIZE(tss.ts); i++) {
		tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec;
		tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec;
	}

	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss);
}
EXPORT_SYMBOL(put_cmsg_scm_timestamping);

/*
 * scm_max_fds - number of int-sized slots that fit in the control buffer.
 *
 * Return: 0 when the remaining control length does not exceed one
 * struct cmsghdr, otherwise the space behind the header divided by
 * sizeof(int).
 */
static int scm_max_fds(struct msghdr *msg)
{
	if (msg->msg_controllen <= sizeof(struct cmsghdr))
		return 0;
	return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int);
}

/*
 * scm_detach_fds - pass the files held in @scm to the receiver of @msg.
 * @msg: message with a user-space control buffer
 * @scm: cookie whose ->fp list is consumed
 *
 * Up to min(scm_max_fds(@msg), scm->fp->count) files are handed, one by one,
 * to scm_recv_one_fd() together with the matching int slot in the control
 * message data area; the loop stops at the first failure.  If at least one
 * slot was filled, an SCM_RIGHTS header covering those slots is written and
 * the control pointer/length are advanced.  MSG_CTRUNC is set when not every
 * file of the list was processed.  The file list is always destroyed before
 * returning on the native path.
 *
 * Kernel-space control buffers trigger a one-time warning and are ignored;
 * MSG_CMSG_COMPAT messages are delegated to scm_detach_fds_compat().
 */
void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
{
	struct cmsghdr __user *cm =
		(__force struct cmsghdr __user *)msg->msg_control_user;
	unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
	int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count);
	int __user *cmsg_data = CMSG_USER_DATA(cm);
	int err = 0, i;

	/* no use for FD passing from kernel space callers */
	if (WARN_ON_ONCE(!msg->msg_control_is_user))
		return;

	if (msg->msg_flags & MSG_CMSG_COMPAT) {
		scm_detach_fds_compat(msg, scm);
		return;
	}

	for (i = 0; i < fdmax; i++) {
		err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
		if (err < 0)
			break;
	}

	/* i is now the number of slots that were filled successfully. */
	if (i > 0) {
		int cmlen = CMSG_LEN(i * sizeof(int));

		err = put_user(SOL_SOCKET, &cm->cmsg_level);
		if (!err)
			err = put_user(SCM_RIGHTS, &cm->cmsg_type);
		if (!err)
			err = put_user(cmlen, &cm->cmsg_len);
		if (!err) {
			/* Advance by the padded size, clamped to what is left. */
			cmlen = CMSG_SPACE(i * sizeof(int));
			if (msg->msg_controllen < cmlen)
				cmlen = msg->msg_controllen;
			msg->msg_control_user += cmlen;
			msg->msg_controllen -= cmlen;
		}
	}

	if (i < scm->fp->count || (scm->fp->count && fdmax <= 0))
		msg->msg_flags |= MSG_CTRUNC;

	/*
	 * All of the files that fit in the message have had their usage counts
	 * incremented, so we just free the list.
	 */
	__scm_destroy(scm);
}
EXPORT_SYMBOL(scm_detach_fds);

/*
 * scm_fp_dup - duplicate a file list.
 * @fpl: list to copy, may be NULL
 *
 * The copy covers the header plus the first fpl->count file pointers.  Each
 * file gets an extra reference via get_file(), ->max is set to the copied
 * count and ->user takes its own reference.  With CONFIG_UNIX enabled the
 * inflight/edges/vertices bookkeeping of the copy is reset.
 *
 * Return: the new list, or NULL when @fpl is NULL or the allocation fails.
 */
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
{
	struct scm_fp_list *new_fpl;
	int i;

	if (!fpl)
		return NULL;

	new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
			  GFP_KERNEL_ACCOUNT);
	if (new_fpl) {
		for (i = 0; i < fpl->count; i++)
			get_file(fpl->fp[i]);

		/* Only ->count entries were copied, so that is the capacity. */
		new_fpl->max = new_fpl->count;
		new_fpl->user = get_uid(fpl->user);
#if IS_ENABLED(CONFIG_UNIX)
		new_fpl->inflight = false;
		new_fpl->edges = NULL;
		INIT_LIST_HEAD(&new_fpl->vertices);
#endif
	}
	return new_fpl;
}
EXPORT_SYMBOL(scm_fp_dup);

#ifdef CONFIG_SECURITY_NETWORK
/*
 * scm_passec - emit an SCM_SECURITY control message if the socket asks for it.
 *
 * Only acts when sk->sk_scm_security is set.  The secid stored in @scm is
 * converted with security_secid_to_secctx(); on a non-negative result the
 * context is sent via put_cmsg() and then released.
 */
static void scm_passec(struct sock *sk, struct msghdr *msg,
		       struct scm_cookie *scm)
{
	struct lsm_context ctx;
	int err;

	if (sk->sk_scm_security) {
		err = security_secid_to_secctx(scm->secid, &ctx);

		if (err >= 0) {
			put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len,
				 ctx.context);

			security_release_secctx(&ctx);
		}
	}
}

/* Report whether the socket has sk_scm_security set. */
static bool scm_has_secdata(struct sock *sk)
{
	return sk->sk_scm_security;
}
#else
/* Without CONFIG_SECURITY_NETWORK there is no security data to pass. */
static void scm_passec(struct sock *sk, struct msghdr *msg,
		       struct scm_cookie *scm)
{
}

/* Without CONFIG_SECURITY_NETWORK this is always false. */
static bool scm_has_secdata(struct sock *sk)
{
	return false;
}
#endif

/*
 * scm_pidfd_recv - emit an SCM_PIDFD control message for the pid in @scm.
 *
 * The remaining control space is checked by hand first: if it cannot hold a
 * (native or compat) header plus one int, MSG_CTRUNC is set and nothing is
 * sent.  Nothing is sent either when @scm has no pid.  Otherwise the value
 * returned by pidfd_prepare() is sent as the payload.  If put_cmsg() fails,
 * a prepared descriptor is given back with put_unused_fd() and its file is
 * dropped; on success the file is installed with fd_install().
 */
static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
{
	struct file *pidfd_file = NULL;
	int len, pidfd;

	/* put_cmsg() doesn't return an error if CMSG is truncated,
	 * that's why we need to opencode these checks here.
	 */
	if (msg->msg_flags & MSG_CMSG_COMPAT)
		len = sizeof(struct compat_cmsghdr) + sizeof(int);
	else
		len = sizeof(struct cmsghdr) + sizeof(int);

	if (msg->msg_controllen < len) {
		msg->msg_flags |= MSG_CTRUNC;
		return;
	}

	if (!scm->pid)
		return;

	/*
	 * NOTE(review): pidfd_file presumably stays NULL when pidfd_prepare()
	 * fails, in which case pidfd holds the error value that is sent to
	 * user space below -- confirm against pidfd_prepare().
	 */
	pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);

	if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
		if (pidfd_file) {
			put_unused_fd(pidfd);
			fput(pidfd_file);
		}

		return;
	}

	if (pidfd_file)
		fd_install(pidfd, pidfd_file);
}

/*
 * __scm_recv_common - deliver the ancillary data both receive paths share.
 * @sk:    receiving socket, consulted for its sk_scm_* settings
 * @msg:   message being filled in
 * @scm:   cookie holding credentials, security id and file list
 * @flags: not referenced by this function
 *
 * Without a control buffer nothing can be delivered: MSG_CTRUNC is set if
 * any ancillary data would have been produced, the cookie is destroyed and
 * false is returned.  Otherwise the credentials (when sk_scm_credentials is
 * set, mapped into the current user namespace), the security context and
 * the file descriptors are emitted in that order and true is returned.
 */
static bool __scm_recv_common(struct sock *sk, struct msghdr *msg,
			      struct scm_cookie *scm, int flags)
{
	if (!msg->msg_control) {
		if (sk->sk_scm_credentials || sk->sk_scm_pidfd ||
		    scm->fp || scm_has_secdata(sk))
			msg->msg_flags |= MSG_CTRUNC;

		scm_destroy(scm);
		return false;
	}

	if (sk->sk_scm_credentials) {
		struct user_namespace *current_ns = current_user_ns();
		struct ucred ucreds = {
			.pid = scm->creds.pid,
			.uid = from_kuid_munged(current_ns, scm->creds.uid),
			.gid = from_kgid_munged(current_ns, scm->creds.gid),
		};

		put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
	}

	scm_passec(sk, msg, scm);

	if (scm->fp)
		scm_detach_fds(msg, scm);

	return true;
}

/*
 * scm_recv - deliver the ancillary data of @scm to @msg.
 *
 * Runs __scm_recv_common(); when that reports true, the credentials held by
 * the cookie are released with scm_destroy_cred().
 */
void scm_recv(struct socket *sock, struct msghdr *msg,
	      struct scm_cookie *scm, int flags)
{
	if (!__scm_recv_common(sock->sk, msg, scm, flags))
		return;

	scm_destroy_cred(scm);
}
EXPORT_SYMBOL(scm_recv);

/*
 * scm_recv_unix - scm_recv() variant that can also deliver a pidfd.
 *
 * Same as scm_recv(), except that scm_pidfd_recv() is called before the
 * credentials are released when the socket has sk_scm_pidfd set.
 */
void scm_recv_unix(struct socket *sock, struct msghdr *msg,
		   struct scm_cookie *scm, int flags)
{
	if (!__scm_recv_common(sock->sk, msg, scm, flags))
		return;

	if (sock->sk->sk_scm_pidfd)
		scm_pidfd_recv(msg, scm);

	scm_destroy_cred(scm);
}
56 55 56 56 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2019 ARM Ltd.
 *
 * Generic implementation of update_vsyscall and update_vsyscall_tz.
 *
 * Based on the x86 specific implementation.
 */

#include <linux/hrtimer.h>
#include <linux/timekeeper_internal.h>
#include <vdso/datapage.h>
#include <vdso/helpers.h>
#include <vdso/vsyscall.h>

#include "timekeeping_internal.h"

/*
 * update_vdso_time_data - refresh the high resolution parts of the vDSO data.
 * @vdata: vDSO time data page to fill
 * @tk:    timekeeper to copy from
 *
 * Copies the clocksource conversion parameters (cycle_last, mask, mult,
 * shift and, with CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT, max_cycles) for the
 * CS_HRES_COARSE and CS_RAW clock slots and recomputes the base timestamps
 * of CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_MONOTONIC_RAW and CLOCK_TAI.
 *
 * The nanosecond values handled here are scaled: plain tv_nsec values are
 * shifted left by tkr_mono.shift before being added to xtime_nsec, and one
 * second is therefore NSEC_PER_SEC << tkr_mono.shift.
 */
static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk)
{
	struct vdso_clock *vc = vdata->clock_data;
	struct vdso_timestamp *vdso_ts;
	u64 nsec, sec;

	vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
	vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
#endif
	vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
	vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
	vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
	vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
	vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
#endif
	vc[CS_RAW].mask = tk->tkr_raw.mask;
	vc[CS_RAW].mult = tk->tkr_raw.mult;
	vc[CS_RAW].shift = tk->tkr_raw.shift;

	/* CLOCK_MONOTONIC */
	vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
	vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;

	nsec = tk->tkr_mono.xtime_nsec;
	nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
	/* Carry whole (scaled) seconds out of nsec into sec. */
	while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
		nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
		vdso_ts->sec++;
	}
	vdso_ts->nsec = nsec;

	/* Copy MONOTONIC time for BOOTTIME */
	sec = vdso_ts->sec;
	/* Add the boot offset */
	sec += tk->monotonic_to_boot.tv_sec;
	nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift;

	/* CLOCK_BOOTTIME */
	vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
	vdso_ts->sec = sec;

	/* Same normalisation as above, now for the boot-adjusted value. */
	while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
		nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
		vdso_ts->sec++;
	}
	vdso_ts->nsec = nsec;

	/* CLOCK_MONOTONIC_RAW */
	vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
	vdso_ts->sec = tk->raw_sec;
	vdso_ts->nsec = tk->tkr_raw.xtime_nsec;

	/* CLOCK_TAI */
	vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI];
	vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
	vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
}

/*
 * update_vsyscall - publish the state of @tk in the vDSO data page.
 *
 * All stores happen between vdso_write_begin() and vdso_write_end().  The
 * clock mode, CLOCK_REALTIME and the two coarse clocks are always written;
 * the high resolution data is only refreshed when the current clocksource
 * has a vDSO clock mode other than VDSO_CLOCKMODE_NONE.  The architecture
 * hooks __arch_update_vsyscall() and __arch_sync_vdso_time_data() are
 * invoked inside and after the write section respectively.
 */
void update_vsyscall(struct timekeeper *tk)
{
	struct vdso_time_data *vdata = vdso_k_time_data;
	struct vdso_clock *vc = vdata->clock_data;
	struct vdso_timestamp *vdso_ts;
	s32 clock_mode;
	u64 nsec;

	/* copy vsyscall data */
	vdso_write_begin(vdata);

	clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
	vc[CS_HRES_COARSE].clock_mode = clock_mode;
	vc[CS_RAW].clock_mode = clock_mode;

	/* CLOCK_REALTIME also required for time() */
	vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
	vdso_ts->sec = tk->xtime_sec;
	vdso_ts->nsec = tk->tkr_mono.xtime_nsec;

	/* CLOCK_REALTIME_COARSE */
	vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
	vdso_ts->sec = tk->xtime_sec;
	vdso_ts->nsec = tk->coarse_nsec;

	/* CLOCK_MONOTONIC_COARSE */
	vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
	vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
	nsec = tk->coarse_nsec;
	nsec = nsec + tk->wall_to_monotonic.tv_nsec;
	/* Unlike the hres values, this sum is divided by plain NSEC_PER_SEC. */
	vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);

	/*
	 * Read without the seqlock held by clock_getres().
	 */
	WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution);

	/*
	 * If the current clocksource is not VDSO capable, then spare the
	 * update of the high resolution parts.
	 */
	if (clock_mode != VDSO_CLOCKMODE_NONE)
		update_vdso_time_data(vdata, tk);

	__arch_update_vsyscall(vdata);

	vdso_write_end(vdata);

	__arch_sync_vdso_time_data(vdata);
}

/*
 * update_vsyscall_tz - copy the system timezone into the vDSO data page.
 *
 * Stores sys_tz.tz_minuteswest and sys_tz.tz_dsttime and then calls the
 * architecture sync hook.  No vdso_write_begin()/vdso_write_end() pair is
 * used around these two stores.
 */
void update_vsyscall_tz(void)
{
	struct vdso_time_data *vdata = vdso_k_time_data;

	vdata->tz_minuteswest = sys_tz.tz_minuteswest;
	vdata->tz_dsttime = sys_tz.tz_dsttime;

	__arch_sync_vdso_time_data(vdata);
}

/**
 * vdso_update_begin - Start of a VDSO update section
 *
 * Allows architecture code to safely update the architecture specific VDSO
 * data. Disables interrupts, acquires timekeeper lock to serialize against
 * concurrent updates from timekeeping and invalidates the VDSO data
 * sequence counter to prevent concurrent readers from accessing
 * inconsistent data.
 *
 * Returns: Saved interrupt flags which need to be handed in to
 * vdso_update_end().
 */
unsigned long vdso_update_begin(void)
{
	struct vdso_time_data *vdata = vdso_k_time_data;
	unsigned long flags = timekeeper_lock_irqsave();

	vdso_write_begin(vdata);
	return flags;
}

/**
 * vdso_update_end - End of a VDSO update section
 * @flags:	Interrupt flags as returned from vdso_update_begin()
 *
 * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data
 * synchronization if the architecture requires it, drops timekeeper lock
 * and restores interrupt flags.
 */
void vdso_update_end(unsigned long flags)
{
	struct vdso_time_data *vdata = vdso_k_time_data;

	vdso_write_end(vdata);
	__arch_sync_vdso_time_data(vdata);
	timekeeper_unlock_irqrestore(flags);
}
3472 1095 1091 966 26 968 39 4 4942 4361 4362 3504 3508 504 505 222 222 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/interval_tree.c - interval tree for mapping->i_mmap
 *
 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>
#include <linux/interval_tree_generic.h>

/* First page offset covered by @v: its vm_pgoff. */
static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
{
	return v->vm_pgoff;
}

/* Last page offset covered by @v (inclusive): vm_pgoff + vma_pages(v) - 1. */
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
	return v->vm_pgoff + vma_pages(v) - 1;
}

/*
 * Instantiate the generic interval tree for VMAs, keyed by the two helpers
 * above, linked through shared.rb and augmented with shared.rb_subtree_last.
 * The generated functions carry the vma_interval_tree prefix and no extra
 * storage-class qualifier.
 */
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
		     unsigned long, shared.rb_subtree_last,
		     vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree)

/*
 * Insert node immediately after prev in the interval tree
 *
 * @node must start at the same page offset as @prev (checked with
 * VM_BUG_ON_VMA).  If @prev has no right child, @node becomes that child.
 * Otherwise @node is linked as the left child of the leftmost node in
 * @prev's right subtree; every node visited on the way down has its
 * rb_subtree_last raised to @node's last offset where it was smaller.
 * rb_insert_augmented() is then called with the vma_interval_tree
 * augmentation callbacks.
 */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
				    struct vm_area_struct *prev,
				    struct rb_root_cached *root)
{
	struct rb_node **link;
	struct vm_area_struct *parent;
	unsigned long last = vma_last_pgoff(node);

	VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);

	if (!prev->shared.rb.rb_right) {
		parent = prev;
		link = &prev->shared.rb.rb_right;
	} else {
		parent = rb_entry(prev->shared.rb.rb_right,
				  struct vm_area_struct, shared.rb);
		if (parent->shared.rb_subtree_last < last)
			parent->shared.rb_subtree_last = last;
		/* Descend to the leftmost node of prev's right subtree. */
		while (parent->shared.rb.rb_left) {
			parent = rb_entry(parent->shared.rb.rb_left,
				struct vm_area_struct, shared.rb);
			if (parent->shared.rb_subtree_last < last)
				parent->shared.rb_subtree_last = last;
		}
		link = &parent->shared.rb.rb_left;
	}

	/* A freshly linked node has no children, so its subtree ends at last. */
	node->shared.rb_subtree_last = last;
	rb_link_node(&node->shared.rb, &parent->shared.rb, link);
	rb_insert_augmented(&node->shared.rb, &root->rb_root,
			    &vma_interval_tree_augment);
}

/* First page offset of the VMA that @avc refers to. */
static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
{
	return vma_start_pgoff(avc->vma);
}

/* Last page offset (inclusive) of the VMA that @avc refers to. */
static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
{
	return vma_last_pgoff(avc->vma);
}

/*
 * Instantiate the generic interval tree for anon_vma_chain entries.  The
 * generated functions are static inline and carry a double-underscore
 * prefix; the public entry points below wrap them.
 */
INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
		     avc_start_pgoff, avc_last_pgoff,
		     static inline, __anon_vma_interval_tree)

/*
 * Insert @node into @root.  With CONFIG_DEBUG_VM_RB the start and last page
 * offsets are cached in the node first so that
 * anon_vma_interval_tree_verify() can compare against them later.
 */
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
				   struct rb_root_cached *root)
{
#ifdef CONFIG_DEBUG_VM_RB
	node->cached_vma_start = avc_start_pgoff(node);
	node->cached_vma_last = avc_last_pgoff(node);
#endif
	__anon_vma_interval_tree_insert(node, root);
}

/* Remove @node from @root; thin wrapper around the generated helper. */
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
				   struct rb_root_cached *root)
{
	__anon_vma_interval_tree_remove(node, root);
}

/*
 * Start an iteration over [@first, @last] in @root; forwards to the
 * generated __anon_vma_interval_tree_iter_first() and returns its result.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
				  unsigned long first, unsigned long last)
{
	return __anon_vma_interval_tree_iter_first(root, first, last);
}

/*
 * Continue an iteration over [@first, @last] after @node; forwards to the
 * generated __anon_vma_interval_tree_iter_next() and returns its result.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
				 unsigned long first, unsigned long last)
{
	return __anon_vma_interval_tree_iter_next(node, first, last);
}

#ifdef CONFIG_DEBUG_VM_RB
/*
 * Warn (once per check) if the offsets cached at insertion time no longer
 * match the offsets currently derived from the node's VMA.
 */
void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
{
	WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
	WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
}
#endif
28 28 254 267 254 13 268 268 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Authenc: Simple AEAD wrapper for IPsec
 *
 * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/internal/aead.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/skcipher.h>
#include <crypto/authenc.h>
#include <crypto/scatterwalk.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/*
 * Per-instance data: the two algorithm spawns plus the offset inside
 * authenc_request_ctx.tail at which the sub-request is placed.
 */
struct authenc_instance_ctx {
	struct crypto_ahash_spawn auth;
	struct crypto_skcipher_spawn enc;
	unsigned int reqoff;
};

/* Per-transform data: the instantiated hash and cipher transforms. */
struct crypto_authenc_ctx {
	struct crypto_ahash *auth;
	struct crypto_skcipher *enc;
};

/*
 * Per-request data: scratch scatterlists for the fast-forwarded source and
 * destination, followed by a variable-size tail.  The code below uses the
 * start of the tail for digest buffers and tail + reqoff for the ahash or
 * skcipher sub-request.
 */
struct authenc_request_ctx {
	struct scatterlist src[2];
	struct scatterlist dst[2];
	char tail[];
};

/* Complete @req with @err unless @err is -EINPROGRESS. */
static void authenc_request_complete(struct aead_request *req, int err)
{
	if (err != -EINPROGRESS)
		aead_request_complete(req, err);
}

/*
 * crypto_authenc_extractkeys - split an authenc key blob into its two keys.
 * @keys:   filled with pointers into @key and the two key lengths
 * @key:    blob starting with an rtattr of type CRYPTO_AUTHENC_KEYA_PARAM
 * @keylen: total length of @key
 *
 * The rtattr payload carries the big-endian encryption key length.  The
 * bytes following the rtattr are the authentication key immediately followed
 * by the encryption key.
 *
 * Return: 0 on success, -EINVAL when the rtattr is malformed, of the wrong
 * type or size, or when the blob is shorter than the encryption key.
 */
int crypto_authenc_extractkeys(struct crypto_authenc_keys *keys, const u8 *key,
			       unsigned int keylen)
{
	struct rtattr *rta = (struct rtattr *)key;
	struct crypto_authenc_key_param *param;

	if (!RTA_OK(rta, keylen))
		return -EINVAL;
	if (rta->rta_type != CRYPTO_AUTHENC_KEYA_PARAM)
		return -EINVAL;

	/*
	 * RTA_OK() didn't align the rtattr's payload when validating that it
	 * fits in the buffer.  Yet, the keys should start on the next 4-byte
	 * aligned boundary.  To avoid confusion, require that the rtattr
	 * payload be exactly the param struct, which has a 4-byte aligned size.
	 */
	if (RTA_PAYLOAD(rta) != sizeof(*param))
		return -EINVAL;
	BUILD_BUG_ON(sizeof(*param) % RTA_ALIGNTO);

	param = RTA_DATA(rta);
	keys->enckeylen = be32_to_cpu(param->enckeylen);

	/* Step over the rtattr; what remains is authkey || enckey. */
	key += rta->rta_len;
	keylen -= rta->rta_len;

	if (keylen < keys->enckeylen)
		return -EINVAL;

	keys->authkeylen = keylen - keys->enckeylen;
	keys->authkey = key;
	keys->enckey = key + keys->authkeylen;

	return 0;
}
EXPORT_SYMBOL_GPL(crypto_authenc_extractkeys);

/*
 * crypto_authenc_setkey - set the keys of both underlying transforms.
 *
 * Splits @key with crypto_authenc_extractkeys(), mirrors the request flags
 * of the AEAD transform onto the hash and the cipher, and sets the hash key
 * followed by the cipher key.  The local key descriptor is wiped with
 * memzero_explicit() on every exit path.
 *
 * Return: 0 on success, -EINVAL for a malformed blob, or the error returned
 * by either setkey call.
 */
static int crypto_authenc_setkey(struct crypto_aead *authenc, const u8 *key,
				 unsigned int keylen)
{
	struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
	struct crypto_ahash *auth = ctx->auth;
	struct crypto_skcipher *enc = ctx->enc;
	struct crypto_authenc_keys keys;
	int err = -EINVAL;

	if (crypto_authenc_extractkeys(&keys, key, keylen) != 0)
		goto out;

	crypto_ahash_clear_flags(auth, CRYPTO_TFM_REQ_MASK);
	crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc) &
				    CRYPTO_TFM_REQ_MASK);
	err = crypto_ahash_setkey(auth, keys.authkey, keys.authkeylen);
	if (err)
		goto out;

	crypto_skcipher_clear_flags(enc, CRYPTO_TFM_REQ_MASK);
	crypto_skcipher_set_flags(enc, crypto_aead_get_flags(authenc) &
				       CRYPTO_TFM_REQ_MASK);
	err = crypto_skcipher_setkey(enc, keys.enckey, keys.enckeylen);

out:
	memzero_explicit(&keys, sizeof(keys));
	return err;
}

/*
 * Completion callback of the digest started by crypto_authenc_genicv().
 * On success the digest result is copied into req->dst right behind the
 * associated data and ciphertext; the AEAD request is then completed with
 * @err.
 */
static void authenc_geniv_ahash_done(void *data, int err)
{
	struct aead_request *req = data;
	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
	struct aead_instance *inst = aead_alg_instance(authenc);
	struct authenc_instance_ctx *ictx = aead_instance_ctx(inst);
	struct authenc_request_ctx *areq_ctx = aead_request_ctx(req);
	struct ahash_request *ahreq = (void *)(areq_ctx->tail + ictx->reqoff);

	if (err)
		goto out;

	scatterwalk_map_and_copy(ahreq->result, req->dst,
				 req->assoclen + req->cryptlen,
				 crypto_aead_authsize(authenc), 1);

out:
	aead_request_complete(req, err);
}

/*
 * crypto_authenc_genicv - compute the authentication tag and append it.
 * @req:   AEAD request whose destination holds associated data + ciphertext
 * @flags: flags for the ahash request callback
 *
 * Digests assoclen + cryptlen bytes of req->dst into the start of the
 * request tail and, when the digest returns 0, copies authsize bytes of it
 * into req->dst behind the ciphertext.
 *
 * Return: 0 on success, otherwise the non-zero value returned by
 * crypto_ahash_digest(); in that case the copy is left to
 * authenc_geniv_ahash_done().
 */
static int crypto_authenc_genicv(struct aead_request *req, unsigned int flags)
{
	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
	struct aead_instance *inst = aead_alg_instance(authenc);
	struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
	struct authenc_instance_ctx *ictx = aead_instance_ctx(inst);
	struct crypto_ahash *auth = ctx->auth;
	struct authenc_request_ctx *areq_ctx = aead_request_ctx(req);
	struct ahash_request *ahreq = (void *)(areq_ctx->tail + ictx->reqoff);
	u8 *hash = areq_ctx->tail;
	int err;

	ahash_request_set_tfm(ahreq, auth);
	ahash_request_set_crypt(ahreq, req->dst, hash,
				req->assoclen + req->cryptlen);
	ahash_request_set_callback(ahreq, flags,
				   authenc_geniv_ahash_done, req);

	err = crypto_ahash_digest(ahreq);
	if (err)
		return err;

	scatterwalk_map_and_copy(hash, req->dst, req->assoclen + req->cryptlen,
				 crypto_aead_authsize(authenc), 1);

	return 0;
}

/*
 * Completion callback of the cipher started by crypto_authenc_encrypt().
 * On success the tag is generated with flags 0; the request is completed
 * unless the resulting status is -EINPROGRESS.
 */
static void crypto_authenc_encrypt_done(void *data, int err)
{
	struct aead_request *areq = data;

	if (err)
		goto out;

	err = crypto_authenc_genicv(areq, 0);

out:
	authenc_request_complete(areq, err);
}

/*
 * crypto_authenc_encrypt - encrypt the payload, then authenticate.
 *
 * The cipher runs over the data behind the associated data.  When source
 * and destination differ, the associated data is first copied to the
 * destination so that the later digest over req->dst covers it.  After the
 * cipher returns 0 the tag is generated by crypto_authenc_genicv().
 *
 * Return: the non-zero status of crypto_skcipher_encrypt(), otherwise the
 * result of crypto_authenc_genicv().
 */
static int crypto_authenc_encrypt(struct aead_request *req)
{
	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
	struct aead_instance *inst = aead_alg_instance(authenc);
	struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
	struct authenc_instance_ctx *ictx = aead_instance_ctx(inst);
	struct authenc_request_ctx *areq_ctx = aead_request_ctx(req);
	struct crypto_skcipher *enc = ctx->enc;
	unsigned int cryptlen = req->cryptlen;
	struct skcipher_request *skreq = (void *)(areq_ctx->tail +
						  ictx->reqoff);
	struct scatterlist *src, *dst;
	int err;

	/* Skip the associated data; in-place operation reuses src as dst. */
	src = scatterwalk_ffwd(areq_ctx->src, req->src, req->assoclen);
	dst = src;

	if (req->src != req->dst) {
		memcpy_sglist(req->dst, req->src, req->assoclen);
		dst = scatterwalk_ffwd(areq_ctx->dst, req->dst, req->assoclen);
	}

	skcipher_request_set_tfm(skreq, enc);
	skcipher_request_set_callback(skreq, aead_request_flags(req),
				      crypto_authenc_encrypt_done, req);
	skcipher_request_set_crypt(skreq, src, dst, cryptlen, req->iv);

	err = crypto_skcipher_encrypt(skreq);
	if (err)
		return err;

	return crypto_authenc_genicv(req, aead_request_flags(req));
}

/*
 * crypto_authenc_decrypt_tail - verify the tag, then decrypt.
 * @req:   AEAD request whose digest has already been computed
 * @flags: flags for the skcipher request callback
 *
 * The received tag is read from req->src at offset ahreq->nbytes into the
 * buffer directly behind the computed digest and compared with
 * crypto_memneq().  Note that ahreq and skreq point at the same storage
 * (tail + reqoff): everything needed from the hash request is read before
 * the cipher request is set up over it.
 *
 * Return: -EBADMSG on tag mismatch, otherwise the result of
 * crypto_skcipher_decrypt() over cryptlen - authsize bytes.
 */
static int crypto_authenc_decrypt_tail(struct aead_request *req,
				       unsigned int flags)
{
	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
	struct aead_instance *inst = aead_alg_instance(authenc);
	struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
	struct authenc_instance_ctx *ictx = aead_instance_ctx(inst);
	struct authenc_request_ctx *areq_ctx = aead_request_ctx(req);
	struct ahash_request *ahreq = (void *)(areq_ctx->tail + ictx->reqoff);
	struct skcipher_request *skreq = (void *)(areq_ctx->tail +
						  ictx->reqoff);
	unsigned int authsize = crypto_aead_authsize(authenc);
	u8 *ihash = ahreq->result + authsize;
	struct scatterlist *src, *dst;

	scatterwalk_map_and_copy(ihash, req->src, ahreq->nbytes, authsize, 0);

	if (crypto_memneq(ihash, ahreq->result, authsize))
		return -EBADMSG;

	src = scatterwalk_ffwd(areq_ctx->src, req->src, req->assoclen);
	dst = src;

	if (req->src != req->dst)
		dst = scatterwalk_ffwd(areq_ctx->dst, req->dst, req->assoclen);

	skcipher_request_set_tfm(skreq, ctx->enc);
	/* The cipher completes the AEAD request through its own callback. */
	skcipher_request_set_callback(skreq, flags,
				      req->base.complete, req->base.data);
	skcipher_request_set_crypt(skreq, src, dst,
				   req->cryptlen - authsize, req->iv);

	return crypto_skcipher_decrypt(skreq);
}

/*
 * Completion callback of the digest started by crypto_authenc_decrypt().
 * On success verification and decryption continue with flags 0; the request
 * is completed unless the resulting status is -EINPROGRESS.
 */
static void authenc_verify_ahash_done(void *data, int err)
{
	struct aead_request *req = data;

	if (err)
		goto out;

	err = crypto_authenc_decrypt_tail(req, 0);

out:
	authenc_request_complete(req, err);
}

/*
 * crypto_authenc_decrypt - authenticate, then decrypt the payload.
 *
 * Digests the associated data and ciphertext of req->src (everything except
 * the trailing authsize bytes) into the start of the request tail, then
 * hands over to crypto_authenc_decrypt_tail().
 *
 * Return: the non-zero status of crypto_ahash_digest(), otherwise the
 * result of crypto_authenc_decrypt_tail().
 */
static int crypto_authenc_decrypt(struct aead_request *req)
{
	struct crypto_aead *authenc = crypto_aead_reqtfm(req);
	unsigned int authsize = crypto_aead_authsize(authenc);
	struct aead_instance *inst = aead_alg_instance(authenc);
	struct crypto_authenc_ctx *ctx = crypto_aead_ctx(authenc);
	struct authenc_instance_ctx *ictx = aead_instance_ctx(inst);
	struct crypto_ahash *auth = ctx->auth;
	struct authenc_request_ctx *areq_ctx = aead_request_ctx(req);
	struct ahash_request *ahreq = (void *)(areq_ctx->tail + ictx->reqoff);
	u8 *hash = areq_ctx->tail;
	int err;

	ahash_request_set_tfm(ahreq, auth);
	ahash_request_set_crypt(ahreq, req->src, hash,
				req->assoclen + req->cryptlen - authsize);
	ahash_request_set_callback(ahreq, aead_request_flags(req),
				   authenc_verify_ahash_done, req);

	err = crypto_ahash_digest(ahreq);
	if (err)
		return err;

	return crypto_authenc_decrypt_tail(req, aead_request_flags(req));
}

/*
 * crypto_authenc_init_tfm - instantiate the hash and cipher for a transform.
 *
 * Stores both sub-transforms in the context and sizes the request context
 * as: the fixed authenc_request_ctx, plus reqoff bytes of digest space, plus
 * the larger of a complete ahash request and a complete skcipher request.
 *
 * Return: 0 on success or the error from either spawn; the hash is freed
 * again when spawning the cipher fails.
 */
static int crypto_authenc_init_tfm(struct crypto_aead *tfm)
{
	struct aead_instance *inst = aead_alg_instance(tfm);
	struct authenc_instance_ctx *ictx = aead_instance_ctx(inst);
	struct crypto_authenc_ctx *ctx = crypto_aead_ctx(tfm);
	struct crypto_ahash *auth;
	struct crypto_skcipher *enc;
	int err;

	auth = crypto_spawn_ahash(&ictx->auth);
	if (IS_ERR(auth))
		return PTR_ERR(auth);

	enc = crypto_spawn_skcipher(&ictx->enc);
	err = PTR_ERR(enc);
	if (IS_ERR(enc))
		goto err_free_ahash;

	ctx->auth = auth;
	ctx->enc = enc;

	crypto_aead_set_reqsize(
		tfm,
		sizeof(struct authenc_request_ctx) +
		ictx->reqoff +
		max_t(unsigned int,
		      crypto_ahash_reqsize(auth) +
		      sizeof(struct ahash_request),
		      sizeof(struct skcipher_request) +
		      crypto_skcipher_reqsize(enc)));

	return 0;

err_free_ahash:
	crypto_free_ahash(auth);
	return err;
}

/* Free the hash and cipher transforms held by @tfm's context. */
static void crypto_authenc_exit_tfm(struct crypto_aead *tfm)
{
	struct crypto_authenc_ctx *ctx = crypto_aead_ctx(tfm);

	crypto_free_ahash(ctx->auth);
	crypto_free_skcipher(ctx->enc);
}

/* Drop both spawns of @inst and free the instance itself. */
static void crypto_authenc_free(struct aead_instance *inst)
{
	struct authenc_instance_ctx *ctx = aead_instance_ctx(inst);

	crypto_drop_skcipher(&ctx->enc);
	crypto_drop_ahash(&ctx->auth);
	kfree(inst);
}

/*
 * crypto_authenc_create - build and register an "authenc(hash,cipher)"
 * AEAD instance.
 * @tmpl: the authenc template
 * @tb:   template attributes; tb[1] names the hash, tb[2] the cipher
 *
 * Grabs both algorithms, derives the instance name, driver name, priority
 * (cipher priority * 10 + hash priority), block size, alignment mask, IV
 * and chunk size from them, and registers the instance.  reqoff is set to
 * twice the digest size, which leaves room at the start of the request tail
 * for the two digest-sized buffers used during decryption.
 *
 * Return: 0 on success or a negative errno; -ENAMETOOLONG when either
 * generated name does not fit in CRYPTO_MAX_ALG_NAME.  On failure the
 * instance is released through crypto_authenc_free().
 */
static int crypto_authenc_create(struct crypto_template *tmpl,
				 struct rtattr **tb)
{
	u32 mask;
	struct aead_instance *inst;
	struct authenc_instance_ctx *ctx;
	struct skcipher_alg_common *enc;
	struct hash_alg_common *auth;
	struct crypto_alg *auth_base;
	int err;

	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask);
	if (err)
		return err;

	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
	if (!inst)
		return -ENOMEM;
	ctx = aead_instance_ctx(inst);

	err = crypto_grab_ahash(&ctx->auth, aead_crypto_instance(inst),
				crypto_attr_alg_name(tb[1]), 0, mask);
	if (err)
		goto err_free_inst;
	auth = crypto_spawn_ahash_alg(&ctx->auth);
	auth_base = &auth->base;

	err = crypto_grab_skcipher(&ctx->enc, aead_crypto_instance(inst),
				   crypto_attr_alg_name(tb[2]), 0, mask);
	if (err)
		goto err_free_inst;
	enc = crypto_spawn_skcipher_alg_common(&ctx->enc);

	ctx->reqoff = 2 * auth->digestsize;

	/* Both snprintf() checks below fail with this error. */
	err = -ENAMETOOLONG;
	if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
		     "authenc(%s,%s)", auth_base->cra_name,
		     enc->base.cra_name) >=
	    CRYPTO_MAX_ALG_NAME)
		goto err_free_inst;

	if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
		     "authenc(%s,%s)", auth_base->cra_driver_name,
		     enc->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
		goto err_free_inst;

	inst->alg.base.cra_priority = enc->base.cra_priority * 10 +
				      auth_base->cra_priority;
	inst->alg.base.cra_blocksize = enc->base.cra_blocksize;
	inst->alg.base.cra_alignmask = enc->base.cra_alignmask;
	inst->alg.base.cra_ctxsize = sizeof(struct crypto_authenc_ctx);

	inst->alg.ivsize = enc->ivsize;
	inst->alg.chunksize = enc->chunksize;
	inst->alg.maxauthsize = auth->digestsize;

	inst->alg.init = crypto_authenc_init_tfm;
	inst->alg.exit = crypto_authenc_exit_tfm;

	inst->alg.setkey = crypto_authenc_setkey;
	inst->alg.encrypt = crypto_authenc_encrypt;
	inst->alg.decrypt = crypto_authenc_decrypt;

	inst->free = crypto_authenc_free;

	err = aead_register_instance(tmpl, inst);
	if (err) {
		/* Shared failure exit: earlier errors jump into this block. */
err_free_inst:
		crypto_authenc_free(inst);
	}
	return err;
}

/* Template descriptor registered under the name "authenc". */
static struct crypto_template crypto_authenc_tmpl = {
	.name = "authenc",
	.create = crypto_authenc_create,
	.module = THIS_MODULE,
};

/* Module entry point: register the authenc template. */
static int __init crypto_authenc_module_init(void)
{
	return crypto_register_template(&crypto_authenc_tmpl);
}

/* Module exit point: unregister the authenc template. */
static void __exit crypto_authenc_module_exit(void)
{
	crypto_unregister_template(&crypto_authenc_tmpl);
}

module_init(crypto_authenc_module_init);
module_exit(crypto_authenc_module_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Simple AEAD wrapper for IPsec");
MODULE_ALIAS_CRYPTO("authenc");
35 12 14 14 14 7 1 6 2 2 2 2 9 9 9 10 10 1 9 8 12 8 2 3 1 7 2 12 12 12 2 12 12 12 12 3 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
// SPDX-License-Identifier: GPL-2.0
/*
 * FPU register's regset abstraction, for ptrace, core dumps, etc.
 */
#include <linux/sched/task_stack.h>
#include <linux/vmalloc.h>

#include <asm/fpu/api.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
#include <asm/prctl.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

/*
 * The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
 * as the "regset->n" for the xstate regset will be updated based on the feature
 * capabilities supported by the xsave.
 *
 * Both routines report the number of active registers; @target is not
 * consulted.
 */
int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
{
	return regset->n;
}

/* Report regset->n when the boot CPU has FXSR, otherwise 0. */
int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
{
	if (boot_cpu_has(X86_FEATURE_FXSR))
		return regset->n;
	else
		return 0;
}

/*
 * The regset get() functions are invoked from:
 *
 *   - coredump to dump the current task's fpstate. If the current task
 *     owns the FPU then the memory state has to be synchronized and the
 *     FPU register state preserved. Otherwise fpstate is already in sync.
 *
 *   - ptrace to dump fpstate of a stopped task, in which case the registers
 *     have already been saved to fpstate on context switch.
 */
static void sync_fpstate(struct fpu *fpu)
{
	/* Only the current task's FPU can be out of sync with memory. */
	if (fpu == x86_task_fpu(current))
		fpu_sync_fpstate(fpu);
}

/*
 * Invalidate cached FPU registers before modifying the stopped target
 * task's fpstate.
 *
 * This forces the target task on resume to restore the FPU registers from
 * modified fpstate. Otherwise the task might skip the restore and operate
 * with the cached FPU registers which discards the modifications.
 */
static void fpu_force_restore(struct fpu *fpu)
{
	/*
	 * Only stopped child tasks can be used to modify the FPU
	 * state in the fpstate buffer:
	 */
	WARN_ON_FPU(fpu == x86_task_fpu(current));

	__fpu_invalidate_fpregs_state(fpu);
}

/*
 * xfpregs_get - regset get() handler for the FXSAVE-format register set.
 *
 * Return: -ENODEV without FXSR support.  Without XSAVE the task's fxsave
 * area is written to @to and the result of membuf_write() is returned; with
 * XSAVE the state is produced by copy_xstate_to_uabi_buf() in
 * XSTATE_COPY_FX mode and 0 is returned.
 */
int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
		struct membuf to)
{
	struct fpu *fpu = x86_task_fpu(target);

	if (!cpu_feature_enabled(X86_FEATURE_FXSR))
		return -ENODEV;

	sync_fpstate(fpu);

	if (!use_xsave()) {
		return membuf_write(&to, &fpu->fpstate->regs.fxsave,
				    sizeof(fpu->fpstate->regs.fxsave));
	}

	copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX);
	return 0;
}

/*
 * xfpregs_set - regset set() handler for the FXSAVE-format register set.
 *
 * The new state must be written in one piece: @pos has to be 0 and @count
 * has to equal sizeof(struct fxregs_state).  The data is first copied into
 * a local buffer and only installed after the MXCSR value has been checked
 * against mxcsr_feature_mask.
 *
 * Return: 0 on success, -ENODEV without FXSR support, -EINVAL for a partial
 * or oversized write or an invalid MXCSR, or the error from
 * user_regset_copyin().
 */
int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
		unsigned int pos, unsigned int count,
		const void *kbuf, const void __user *ubuf)
{
	struct fpu *fpu = x86_task_fpu(target);
	struct fxregs_state newstate;
	int ret;

	if (!cpu_feature_enabled(X86_FEATURE_FXSR))
		return -ENODEV;

	/* No funny business with partial or oversized writes is permitted. */
	if (pos != 0 || count != sizeof(newstate))
		return -EINVAL;

	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1);
	if (ret)
		return ret;

	/* Do not allow an invalid MXCSR value. */
	if (newstate.mxcsr & ~mxcsr_feature_mask)
		return -EINVAL;

	fpu_force_restore(fpu);

	/* Copy the state  */
	memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate));

	/* Clear xmm8..15 for 32-bit callers */
	BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16);
	if (in_ia32_syscall())
		memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16);

	/* Mark FP and SSE as in use when XSAVE is enabled */
	if (use_xsave())
		fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;

	return 0;
}

/*
 * xstateregs_get - regset get() handler for the XSAVE-format register set.
 *
 * Return: -ENODEV without XSAVE support; otherwise the state is produced by
 * copy_xstate_to_uabi_buf() in XSTATE_COPY_XSAVE mode and 0 is returned.
 */
int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
		struct membuf to)
{
	if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
		return -ENODEV;

	sync_fpstate(x86_task_fpu(target));

	copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE);
	return 0;
}

/*
 * xstateregs_set - regset set() handler for the XSAVE-format register set.
 *
 * Accepts only a complete buffer (@pos == 0 and @count equal to
 * fpu_user_cfg.max_size).  When no kernel buffer is supplied, the user
 * buffer is first copied into a temporary vmalloc() allocation, which is
 * freed again on every path after the allocation succeeded.
 *
 * Return: -ENODEV without XSAVE support, -EFAULT for a wrong position/size
 * or a failed user copy, -ENOMEM when the temporary buffer cannot be
 * allocated, otherwise the result of copy_uabi_from_kernel_to_xstate().
 */
int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
		  unsigned int pos, unsigned int count,
		  const void *kbuf, const void __user *ubuf)
{
	struct fpu *fpu = x86_task_fpu(target);
	struct xregs_state *tmpbuf = NULL;
	int ret;

	if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
		return -ENODEV;

	/*
	 * A whole standard-format XSAVE buffer is needed:
	 */
	if (pos != 0 || count != fpu_user_cfg.max_size)
		return -EFAULT;

	if (!kbuf) {
		tmpbuf = vmalloc(count);
		if (!tmpbuf)
			return -ENOMEM;

		if (copy_from_user(tmpbuf, ubuf, count)) {
			ret = -EFAULT;
			goto out;
		}
	}

	fpu_force_restore(fpu);
	/* Use the caller's kernel buffer if there is one, else the copy. */
	ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru);

out:
	/* tmpbuf is still NULL here when a kernel buffer was supplied. */
	vfree(tmpbuf);
	return ret;
}

#ifdef CONFIG_X86_USER_SHADOW_STACK
/*
 * Report regset->n when @target has ARCH_SHSTK_SHSTK set in its thread
 * features, otherwise 0.
 */
int ssp_active(struct task_struct *target, const struct user_regset *regset)
{
	if (target->thread.features & ARCH_SHSTK_SHSTK)
		return regset->n;

	return 0;
}

int ssp_get(struct task_struct *target, const struct user_regset *regset,
	    struct membuf to)
{
	struct fpu *fpu = x86_task_fpu(target);
	struct cet_user_state *cetregs;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
	    !ssp_active(target, regset))
		return -ENODEV;

	sync_fpstate(fpu);
	cetregs = 
get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER); if (WARN_ON(!cetregs)) { /* * This shouldn't ever be NULL because shadow stack was * verified to be enabled above. This means * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so * XFEATURE_CET_USER should not be in the init state. */ return -ENODEV; } return membuf_write(&to, (unsigned long *)&cetregs->user_ssp, sizeof(cetregs->user_ssp)); } int ssp_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = x86_task_fpu(target); struct xregs_state *xsave = &fpu->fpstate->regs.xsave; struct cet_user_state *cetregs; unsigned long user_ssp; int r; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !ssp_active(target, regset)) return -ENODEV; if (pos != 0 || count != sizeof(user_ssp)) return -EINVAL; r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1); if (r) return r; /* * Some kernel instructions (IRET, etc) can cause exceptions in the case * of disallowed CET register values. Just prevent invalid values. */ if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8)) return -EINVAL; fpu_force_restore(fpu); cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER); if (WARN_ON(!cetregs)) { /* * This shouldn't ever be NULL because shadow stack was * verified to be enabled above. This means * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so * XFEATURE_CET_USER should not be in the init state. */ return -ENODEV; } cetregs->user_ssp = user_ssp; return 0; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* * FPU tag word conversions. */ static inline unsigned short twd_i387_to_fxsr(unsigned short twd) { unsigned int tmp; /* to avoid 16 bit prefixes in the code */ /* Transform each pair of bits into 01 (valid) or 00 (empty) */ tmp = ~twd; tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ /* and move the valid bits to the lower byte. 
*/ tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ return tmp; } #define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) #define FP_EXP_TAG_VALID 0 #define FP_EXP_TAG_ZERO 1 #define FP_EXP_TAG_SPECIAL 2 #define FP_EXP_TAG_EMPTY 3 static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) { struct _fpxreg *st; u32 tos = (fxsave->swd >> 11) & 7; u32 twd = (unsigned long) fxsave->twd; u32 tag; u32 ret = 0xffff0000u; int i; for (i = 0; i < 8; i++, twd >>= 1) { if (twd & 0x1) { st = FPREG_ADDR(fxsave, (i - tos) & 7); switch (st->exponent & 0x7fff) { case 0x7fff: tag = FP_EXP_TAG_SPECIAL; break; case 0x0000: if (!st->significand[0] && !st->significand[1] && !st->significand[2] && !st->significand[3]) tag = FP_EXP_TAG_ZERO; else tag = FP_EXP_TAG_SPECIAL; break; default: if (st->significand[3] & 0x8000) tag = FP_EXP_TAG_VALID; else tag = FP_EXP_TAG_SPECIAL; break; } } else { tag = FP_EXP_TAG_EMPTY; } ret |= tag << (2 * i); } return ret; } /* * FXSR floating point environment conversions. */ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk, struct fxregs_state *fxsave) { struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; env->cwd = fxsave->cwd | 0xffff0000u; env->swd = fxsave->swd | 0xffff0000u; env->twd = twd_fxsr_to_i387(fxsave); #ifdef CONFIG_X86_64 env->fip = fxsave->rip; env->foo = fxsave->rdp; /* * should be actually ds/cs at fpu exception time, but * that information is not available in 64bit mode. 
*/ env->fcs = task_pt_regs(tsk)->cs; if (tsk == current) { savesegment(ds, env->fos); } else { env->fos = tsk->thread.ds; } env->fos |= 0xffff0000; #else env->fip = fxsave->fip; env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); env->foo = fxsave->foo; env->fos = fxsave->fos; #endif for (i = 0; i < 8; ++i) memcpy(&to[i], &from[i], sizeof(to[0])); } void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { __convert_from_fxsr(env, tsk, &x86_task_fpu(tsk)->fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env) { struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; fxsave->cwd = env->cwd; fxsave->swd = env->swd; fxsave->twd = twd_i387_to_fxsr(env->twd); fxsave->fop = (u16) ((u32) env->fcs >> 16); #ifdef CONFIG_X86_64 fxsave->rip = env->fip; fxsave->rdp = env->foo; /* cs and ds ignored */ #else fxsave->fip = env->fip; fxsave->fcs = (env->fcs & 0xffff); fxsave->foo = env->foo; fxsave->fos = env->fos; #endif for (i = 0; i < 8; ++i) memcpy(&to[i], &from[i], sizeof(from[0])); } int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; struct fxregs_state fxsave, *fx; sync_fpstate(fpu); if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, to); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { return membuf_write(&to, &fpu->fpstate->regs.fsave, sizeof(struct fregs_state)); } if (use_xsave()) { struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) }; /* Handle init state optimized xstate correctly */ copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); fx = &fxsave; } else { fx = &fpu->fpstate->regs.fxsave; } __convert_from_fxsr(&env, target, fx); return membuf_write(&to, &env, sizeof(env)); } int fpregs_set(struct task_struct *target, const struct user_regset *regset, 
unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; int ret; /* No funny business with partial or oversized writes is permitted. */ if (pos != 0 || count != sizeof(struct user_i387_ia32_struct)) return -EINVAL; if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); if (ret) return ret; fpu_force_restore(fpu); if (cpu_feature_enabled(X86_FEATURE_FXSR)) convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env); else memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env)); /* * Update the header bit in the xsave header, indicating the * presence of FP. */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP; return 0; } #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
8 1 1 22 148 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * inet6 interface/address list definitions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IF_INET6_H #define _NET_IF_INET6_H #include <net/snmp.h> #include <linux/ipv6.h> #include <linux/refcount.h> /* inet6_dev.if_flags */ #define IF_RA_OTHERCONF 0x80 #define IF_RA_MANAGED 0x40 #define IF_RA_RCVD 0x20 #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, INET6_IFADDR_STATE_POSTDAD, INET6_IFADDR_STATE_ERRDAD, INET6_IFADDR_STATE_DEAD, }; struct inet6_ifaddr { struct in6_addr addr; __u32 prefix_len; __u32 rt_priority; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. 
*/ __u32 valid_lft; __u32 prefered_lft; refcount_t refcnt; spinlock_t lock; int state; __u32 flags; __u8 dad_probes; __u8 stable_privacy_retry; __u16 scope; __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ struct delayed_work dad_work; struct inet6_dev *idev; struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; /* * Used to safely traverse idev->addr_list in process context * if the idev->lock needed to protect idev->addr_list cannot be held. * In that case, add the items to this list temporarily and iterate * without holding idev->lock. * See addrconf_ifdown and dev_forward_change. */ struct list_head if_list_aux; struct list_head tmp_list; struct inet6_ifaddr *ifpub; int regen_count; bool tokenized; u8 ifa_proto; struct rcu_head rcu; struct in6_addr peer_addr; }; struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; struct in6_addr sl_addr[] __counted_by(sl_max); }; #define IP6_SFBLOCK 10 /* allocate this many at once */ struct ipv6_mc_socklist { struct in6_addr addr; int ifindex; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist __rcu *next; struct ip6_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip6_sf_list { struct ip6_sf_list __rcu *sf_next; struct in6_addr sf_addr; unsigned long sf_count[2]; /* include/exclude counts */ unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. 
left to send */ struct rcu_head rcu; }; #define MAF_TIMER_RUNNING 0x01 #define MAF_LAST_REPORTER 0x02 #define MAF_LOADED 0x04 #define MAF_NOREPORT 0x08 #define MAF_GSQUERY 0x10 struct ifmcaddr6 { struct in6_addr mca_addr; struct inet6_dev *idev; struct ifmcaddr6 __rcu *next; struct ip6_sf_list __rcu *mca_sources; struct ip6_sf_list __rcu *mca_tomb; unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; struct delayed_work mca_work; unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; unsigned long mca_cstamp; unsigned long mca_tstamp; struct rcu_head rcu; }; /* Anycast stuff */ struct ipv6_ac_socklist { struct in6_addr acl_addr; int acl_ifindex; struct ipv6_ac_socklist *acl_next; }; struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; struct ifacaddr6 __rcu *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; unsigned long aca_cstamp; unsigned long aca_tstamp; struct rcu_head rcu; }; #define IFA_HOST IPV6_ADDR_LOOPBACK #define IFA_LINK IPV6_ADDR_LINKLOCAL #define IFA_SITE IPV6_ADDR_SITELOCAL struct ipv6_devstat { struct proc_dir_entry *proc_dir_entry; DEFINE_SNMP_STAT(struct ipstats_mib, ipv6); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6_mib_device, icmpv6dev); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib_device, icmpv6msgdev); }; struct inet6_dev { struct net_device *dev; netdevice_tracker dev_tracker; struct list_head addr_list; struct ifmcaddr6 __rcu *mc_list; struct ifmcaddr6 __rcu *mc_tomb; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; unsigned char mc_ifc_count; unsigned char mc_dad_count; unsigned long mc_v1_seen; /* Max time we stay in MLDv1 mode */ unsigned long mc_qi; /* Query Interval */ unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; struct delayed_work mc_gq_work; /* general query work */ struct delayed_work mc_ifc_work; /* interface change work */ struct delayed_work mc_dad_work; /* dad complete mc work */ 
struct delayed_work mc_query_work; /* mld query work */ struct delayed_work mc_report_work; /* mld report work */ struct sk_buff_head mc_query_queue; /* mld query queue */ struct sk_buff_head mc_report_queue; /* mld report queue */ spinlock_t mc_query_lock; /* mld query queue lock */ spinlock_t mc_report_lock; /* mld query report lock */ struct mutex mc_lock; /* mld global lock */ struct ifacaddr6 __rcu *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; int dead; u32 desync_factor; struct list_head tempaddr_list; struct in6_addr token; struct neigh_parms *nd_parms; struct ipv6_devconf cnf; struct ipv6_devstat stats; struct timer_list rs_timer; __s32 rs_interval; /* in jiffies */ __u8 rs_probes; unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; unsigned int ra_mtu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) { /* * +-------+-------+-------+-------+-------+-------+ * | 33 | 33 | DST13 | DST14 | DST15 | DST16 | * +-------+-------+-------+-------+-------+-------+ */ buf[0]= 0x33; buf[1]= 0x33; memcpy(buf + 2, &addr->s6_addr32[3], sizeof(__u32)); } static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf) { buf[0] = 0x00; } static inline void ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 0x60; /* IPv6 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; memcpy(buf + 10, addr->s6_addr + 6, 10); } static inline int ipv6_ipgre_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) { memcpy(buf, broadcast, 4); } else { /* v4mapped? 
*/ if ((addr->s6_addr32[0] | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x0000ffff))) != 0) return -EINVAL; memcpy(buf, &addr->s6_addr32[3], 4); } return 0; } #endif
349 40 193 2 21 21 21 155 195 109 153 53 53 106 218 193 68 68 79 1 78 28 78 4 85 3 87 87 18 16 8 6 2 13 2 8 3 3 16 10 6 98 18 22 24 23 2 37 35 53 31 33 54 37 7 42 29 18 1 8 59 60 4 6 6 85 43 10 34 54 18 54 36 60 81 80 21 8 13 4 9 13 37 48 85 12 46 32 7 28 17 68 13 8 4 2 16 1 16 10 5 11 4 27 27 9 9 9 27 7 7 26 3 3 27 27 5 5 27 6 6 27 27 10 12 12 7 3 5 1 2 4 7 10 5 5 14 8 6 14 2 12 9 1 1 2 7 57 16 1 9 7 3 9 9 21 2 24 21 2 20 20 20 2 2 3 5 11 38 50 5 4 34 14 27 3 1 2 3 2 113 5 10 9 4 129 128 231 18 56 1 50 56 9 9 59 60 18 50 50 110 1 102 2 6 47 51 61 61 14 9 1 90 36 12 8 1 13 9 4 21 15 8 46 25 7 122 1 13 8 112 110 92 8 8 52 60 60 73 1494 1310 189 189 687 353 312 42 27 27 27 27 26 16 27 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 
353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 
853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 
1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 
1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 
2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 
2482 2483 2484 // SPDX-License-Identifier: GPL-2.0
/*
 * linux/ipc/sem.c
 * Copyright (C) 1992 Krishna Balasubramanian
 * Copyright (C) 1995 Eric Schenk, Bruno Haible
 *
 * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 *
 * SMP-threaded, sysctl's added
 * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
 * Enforced range limit on SEM_UNDO
 * (c) 2001 Red Hat Inc
 * Lockless wakeup
 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
 * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
 * Further wakeup optimizations, documentation
 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
 *
 * support for audit of ipc object properties and permission changes
 * Dustin Kirkland <dustin.kirkland@us.ibm.com>
 *
 * namespaces support
 * OpenVZ, SWsoft Inc.
 * Pavel Emelianov <xemul@openvz.org>
 *
 * Implementation notes: (May 2010)
 * This file implements System V semaphores.
 *
 * User space visible behavior:
 * - FIFO ordering for semop() operations (just FIFO, not starvation
 *   protection)
 * - multiple semaphore operations that alter the same semaphore in
 *   one semop() are handled.
 * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
 *   SETALL calls.
 * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
 * - undo adjustments at process exit are limited to 0..SEMVMX.
 * - namespaces are supported.
 * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
 *   to /proc/sys/kernel/sem.
 * - statistics about the usage are reported in /proc/sysvipc/sem.
 *
 * Internals:
 * - scalability:
 *   - all global variables are read-mostly.
 *   - semop() calls and semctl(RMID) are synchronized by RCU.
 *   - most operations do write operations (actually: spin_lock calls) to
 *     the per-semaphore array structure.
 *   Thus: Perfect SMP scaling between independent semaphore arrays.
 *         If multiple semaphores in one array are used, then cache line
 *         thrashing on the semaphore array spinlock will limit the scaling.
 * - semncnt and semzcnt are calculated on demand in count_semcnt()
 * - the task that performs a successful semop() scans the list of all
 *   sleeping tasks and completes any pending operations that can be fulfilled.
 *   Semaphores are actively given to waiting tasks (necessary for FIFO).
 *   (see update_queue())
 * - To improve the scalability, the actual wake-up calls are performed after
 *   dropping all locks. (see wake_up_sem_queue_prepare())
 * - All work is done by the waker, the woken up task does not have to do
 *   anything - not even acquiring a lock or dropping a refcount.
 * - A woken up task may not even touch the semaphore array anymore, it may
 *   have been destroyed already by a semctl(RMID).
 * - UNDO values are stored in an array (one per process and per
 *   semaphore array, lazily allocated). For backwards compatibility, multiple
 *   modes for the UNDO variables are supported (per process, per thread)
 *   (see copy_semundo, CLONE_SYSVSEM)
 * - There are two lists of the pending operations: a per-array list
 *   and per-semaphore list (stored in the array). This allows to achieve FIFO
 *   ordering without always scanning all pending operations.
 *   The worst-case behavior is nevertheless O(N^2) for N wakeups.
 */

#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/time.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/sched/wake_q.h>
#include <linux/nospec.h>
#include <linux/rhashtable.h>

#include <linux/uaccess.h>
#include "util.h"

/* One semaphore structure for each semaphore in the system. */
struct sem {
	int	semval;		/* current value */
	/*
	 * PID of the process that last modified the semaphore. For
	 * Linux, specifically these are:
	 *  - semop
	 *  - semctl, via SETVAL and SETALL.
	 *  - at task exit when performing undo adjustments (see exit_sem).
	 */
	struct pid *sempid;
	spinlock_t	lock;	/* spinlock for fine-grained semtimedop */
	struct list_head pending_alter; /* pending single-sop operations */
					/* that alter the semaphore */
	struct list_head pending_const; /* pending single-sop operations */
					/* that do not alter the semaphore */
	time64_t	 sem_otime;	/* candidate for sem_otime: */
					/* get_semotime() reports the latest */
					/* value found across all semaphores */
} ____cacheline_aligned_in_smp;

/* One sem_array data structure for each set of semaphores in the system. */
struct sem_array {
	struct kern_ipc_perm	sem_perm;	/* permissions .. see ipc.h */
	time64_t		sem_ctime;	/* create/last semctl() time */
	struct list_head	pending_alter;	/* pending operations */
						/* that alter the array */
	struct list_head	pending_const;	/* pending complex operations */
						/* that do not alter semvals */
	struct list_head	list_id;	/* undo requests on this array */
	int			sem_nsems;	/* no. of semaphores in array */
	int			complex_count;	/* pending complex operations */
	unsigned int		use_global_lock;/* >0: global lock required */

	struct sem		sems[];		/* sem_nsems entries */
} __randomize_layout;

/* One queue for each sleeping process in the system. */
struct sem_queue {
	struct list_head	list;	 /* queue of pending operations */
	struct task_struct	*sleeper; /* this process */
	struct sem_undo		*undo;	 /* undo structure */
	struct pid		*pid;	 /* process id of requesting process */
	int			status;	 /* completion status of operation */
	struct sembuf		*sops;	 /* array of pending operations */
	struct sembuf		*blocking; /* the operation that blocked */
	int			nsops;	 /* number of operations */
	bool			alter;	 /* does *sops alter the array? */
	bool			dupsop;	 /* sops on more than one sem_num */
};

/* Each task has a list of undo requests. They are executed automatically
 * when the process exits.
*/
struct sem_undo {
	struct list_head	list_proc;	/* per-process list:
						 * all undos from one process
						 * rcu protected */
	struct rcu_head		rcu;		/* rcu struct for sem_undo */
	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
	struct list_head	list_id;	/* per semaphore array list:
						 * all undos for one array */
	int			semid;		/* semaphore set identifier */
	short			semadj[];	/* array of adjustments */
						/* one per semaphore */
};

/* sem_undo_list controls shared access to the list of sem_undo structures
 * that may be shared among all tasks of a CLONE_SYSVSEM task group.
 */
struct sem_undo_list {
	refcount_t		refcnt;
	spinlock_t		lock;
	struct list_head	list_proc;
};


/* Shorthand for the semaphore id table of an ipc namespace. */
#define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])

static int newary(struct ipc_namespace *, struct ipc_params *);
static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
#ifdef CONFIG_PROC_FS
static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#endif

#define SEMMSL_FAST	256 /* 512 bytes on stack */
#define SEMOPM_FAST	64  /* ~ 372 bytes on stack */

/*
 * Switching from the mode suitable for simple ops
 * to the mode for complex ops is costly. Therefore:
 * use some hysteresis
 */
#define USE_GLOBAL_LOCK_HYSTERESIS	10

/*
 * Locking:
 * a) global sem_lock() for read/write
 *	sem_undo.id_next,
 *	sem_array.complex_count,
 *	sem_array.pending{_alter,_const},
 *	sem_array.sem_undo
 *
 *	NOTE(review): sem_undo.id_next and sem_array.sem_undo are not fields
 *	of the structures in this file; presumably the per-array undo list
 *	(list_id in both structures) is meant - confirm.
 *
 * b) global or semaphore sem_lock() for read/write:
 *	sem_array.sems[i].pending_{const,alter}:
 *
 * c) special:
 *	sem_undo_list.list_proc:
 *	* undo_list->lock for write
 *	* rcu for read
 *	use_global_lock:
 *	* global sem_lock() for write
 *	* either local or global sem_lock() for read.
 *
 * Memory ordering:
 * Most ordering is enforced by using spin_lock() and spin_unlock().
 *
 * Exceptions:
 * 1) use_global_lock: (SEM_BARRIER_1)
 * Setting it from non-zero to 0 is a RELEASE, this is ensured by
 * using smp_store_release(): Immediately after setting it to 0,
 * a simple op can start.
 * Testing if it is non-zero is an ACQUIRE, this is ensured by using
 * smp_load_acquire().
 * Setting it from 0 to non-zero must be ordered with regard to
 * this smp_load_acquire(), this is guaranteed because the smp_load_acquire()
 * is inside a spin_lock() and after a write from 0 to non-zero a
 * spin_lock()+spin_unlock() is done.
 * To prevent the compiler/cpu temporarily writing 0 to use_global_lock,
 * READ_ONCE()/WRITE_ONCE() is used.
 *
 * 2) queue.status: (SEM_BARRIER_2)
 * Initialization is done while holding sem_lock(), so no further barrier is
 * required.
 * Setting it to a result code is a RELEASE, this is ensured by both a
 * smp_store_release() (for case a) and while holding sem_lock()
 * (for case b).
 * The ACQUIRE when reading the result code without holding sem_lock() is
 * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep().
 * (case a above).
 * Reading the result code while holding sem_lock() needs no further barriers,
 * the locks inside sem_lock() enforce ordering (case b above)
 *
 * 3) current->state:
 * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock().
 * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may
 * happen immediately after calling wake_q_add. As wake_q_add_safe() is called
 * when holding sem_lock(), no further barriers are required.
 *
 * See also ipc/mqueue.c for more details on the covered races.
*/ #define sc_semmsl sem_ctls[0] #define sc_semmns sem_ctls[1] #define sc_semopm sem_ctls[2] #define sc_semmni sem_ctls[3] void sem_init_ns(struct ipc_namespace *ns) { ns->sc_semmsl = SEMMSL; ns->sc_semmns = SEMMNS; ns->sc_semopm = SEMOPM; ns->sc_semmni = SEMMNI; ns->used_sems = 0; ipc_init_ids(&ns->ids[IPC_SEM_IDS]); } #ifdef CONFIG_IPC_NS void sem_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &sem_ids(ns), freeary); idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht); } #endif void __init sem_init(void) { sem_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/sem", " key semid perms nsems uid gid cuid cgid otime ctime\n", IPC_SEM_IDS, sysvipc_sem_proc_show); } /** * unmerge_queues - unmerge queues, if possible. * @sma: semaphore array * * The function unmerges the wait queues if complex_count is 0. * It must be called prior to dropping the global semaphore array lock. */ static void unmerge_queues(struct sem_array *sma) { struct sem_queue *q, *tq; /* complex operations still around? */ if (sma->complex_count) return; /* * We will switch back to simple mode. * Move all pending operation back into the per-semaphore * queues. */ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { struct sem *curr; curr = &sma->sems[q->sops[0].sem_num]; list_add_tail(&q->list, &curr->pending_alter); } INIT_LIST_HEAD(&sma->pending_alter); } /** * merge_queues - merge single semop queues into global queue * @sma: semaphore array * * This function merges all per-semaphore queues into the global queue. * It is necessary to achieve FIFO ordering for the pending single-sop * operations when a multi-semop operation must sleep. * Only the alter operations must be moved, the const operations can stay. 
*/
static void merge_queues(struct sem_array *sma)
{
	int i;
	for (i = 0; i < sma->sem_nsems; i++) {
		struct sem *sem = &sma->sems[i];

		/* Move the per-semaphore alter queue onto the global one. */
		list_splice_init(&sem->pending_alter, &sma->pending_alter);
	}
}

/*
 * RCU callback that releases a semaphore array: frees the security data
 * attached to sem_perm, then the array itself.
 */
static void sem_rcu_free(struct rcu_head *head)
{
	struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu);
	struct sem_array *sma = container_of(p, struct sem_array, sem_perm);

	security_sem_free(&sma->sem_perm);
	kvfree(sma);
}

/*
 * Enter the mode suitable for non-simple operations:
 * Caller must own sem_perm.lock.
 */
static void complexmode_enter(struct sem_array *sma)
{
	int i;
	struct sem *sem;

	if (sma->use_global_lock > 0)  {
		/*
		 * We are already in global lock mode.
		 * Nothing to do, just reset the
		 * counter until we return to simple mode.
		 */
		WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS);
		return;
	}
	WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS);

	/*
	 * Take and release every per-semaphore lock once. This is the
	 * spin_lock()+spin_unlock() after the 0 -> non-zero write that
	 * SEM_BARRIER_1 refers to.
	 */
	for (i = 0; i < sma->sem_nsems; i++) {
		sem = &sma->sems[i];
		spin_lock(&sem->lock);
		spin_unlock(&sem->lock);
	}
}

/*
 * Try to leave the mode that disallows simple operations:
 * Caller must own sem_perm.lock.
 */
static void complexmode_tryleave(struct sem_array *sma)
{
	if (sma->complex_count)  {
		/* Complex ops are sleeping.
		 * We must stay in complex mode
		 */
		return;
	}
	if (sma->use_global_lock == 1) {

		/* See SEM_BARRIER_1 for purpose/pairing */
		smp_store_release(&sma->use_global_lock, 0);
	} else {
		/* Hysteresis: count down by one per call. */
		WRITE_ONCE(sma->use_global_lock,
				sma->use_global_lock-1);
	}
}

/* Value returned by sem_lock() when the whole array is locked. */
#define SEM_GLOBAL_LOCK	(-1)
/*
 * If the request contains only one semaphore operation, and there are
 * no complex transactions pending, lock only the semaphore involved.
 * Otherwise, lock the entire semaphore array, since we either have
 * multiple semaphores in our own semops, or we need to look at
 * semaphores from other pending complex operations.
*/
static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,
			      int nsops)
{
	struct sem *sem;
	int idx;

	if (nsops != 1) {
		/* Complex operation - acquire a full lock */
		ipc_lock_object(&sma->sem_perm);

		/* Prevent parallel simple ops */
		complexmode_enter(sma);
		return SEM_GLOBAL_LOCK;
	}

	/*
	 * Only one semaphore affected - try to optimize locking.
	 * Optimized locking is possible if no complex operation
	 * is either enqueued or processed right now.
	 *
	 * Both facts are tracked by use_global_lock.
	 */
	idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
	sem = &sma->sems[idx];

	/*
	 * Initial check for use_global_lock. Just an optimization,
	 * no locking, no memory barrier.
	 */
	if (!READ_ONCE(sma->use_global_lock)) {
		/*
		 * It appears that no complex operation is around.
		 * Acquire the per-semaphore lock.
		 */
		spin_lock(&sem->lock);

		/* see SEM_BARRIER_1 for purpose/pairing */
		if (!smp_load_acquire(&sma->use_global_lock)) {
			/* fast path successful! */
			return sops->sem_num;
		}
		spin_unlock(&sem->lock);
	}

	/* slow path: acquire the full lock */
	ipc_lock_object(&sma->sem_perm);

	if (sma->use_global_lock == 0) {
		/*
		 * The use_global_lock mode ended while we waited for
		 * sma->sem_perm.lock. Thus we must switch to locking
		 * with sem->lock.
		 * Unlike in the fast path, there is no need to recheck
		 * sma->use_global_lock after we have acquired sem->lock:
		 * We own sma->sem_perm.lock, thus use_global_lock cannot
		 * change.
		 */
		spin_lock(&sem->lock);

		ipc_unlock_object(&sma->sem_perm);
		return sops->sem_num;
	} else {
		/*
		 * Not a false alarm, thus continue to use the global lock
		 * mode. No need for complexmode_enter(), this was done by
		 * the caller that has set use_global_lock to non-zero.
		 */
		return SEM_GLOBAL_LOCK;
	}
}

/*
 * Release what sem_lock() acquired. @locknum is the value sem_lock()
 * returned: SEM_GLOBAL_LOCK for the whole array, otherwise the index of
 * the single semaphore whose lock is held.
 */
static inline void sem_unlock(struct sem_array *sma, int locknum)
{
	if (locknum == SEM_GLOBAL_LOCK) {
		unmerge_queues(sma);
		complexmode_tryleave(sma);
		ipc_unlock_object(&sma->sem_perm);
	} else {
		struct sem *sem = &sma->sems[locknum];
		spin_unlock(&sem->lock);
	}
}

/*
 * sem_obtain_object(_check) routines are called in the paths where the rwsem
 * is not held.
 *
 * The caller holds the RCU read lock.
 */
static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct sem_array, sem_perm);
}

static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns,
							int id)
{
	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&sem_ids(ns), id);

	if (IS_ERR(ipcp))
		return ERR_CAST(ipcp);

	return container_of(ipcp, struct sem_array, sem_perm);
}

/*
 * Take the global array lock (nsops == -1 selects the complex path of
 * sem_lock()) and drop one reference on the array.
 */
static inline void sem_lock_and_putref(struct sem_array *sma)
{
	sem_lock(sma, NULL, -1);
	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
}

/* Remove the array from the semaphore id table of @ns. */
static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
{
	ipc_rmid(&sem_ids(ns), &s->sem_perm);
}

/*
 * Allocate a zeroed sem_array with room for @nsems semaphores.
 * Returns NULL if the size would exceed INT_MAX or the allocation fails.
 */
static struct sem_array *sem_alloc(size_t nsems)
{
	struct sem_array *sma;

	if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0]))
		return NULL;

	sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT);
	if (unlikely(!sma))
		return NULL;

	return sma;
}

/**
 * newary - Create a new semaphore set
 * @ns: namespace
 * @params: ptr to the structure that contains key, semflg and nsems
 *
 * Called with sem_ids.rwsem held (as a writer)
 *
 * Returns the id of the new set, or a negative error code:
 * -EINVAL if nsems is 0, -ENOSPC if the namespace-wide semaphore limit
 * would be exceeded, -ENOMEM if the allocation fails, or the error
 * reported by security_sem_alloc() / ipc_addid().
 */
static int newary(struct ipc_namespace *ns, struct ipc_params *params)
{
	int retval;
	struct sem_array *sma;
	key_t key = params->key;
	int nsems = params->u.nsems;
	int semflg = params->flg;
	int i;

	if (!nsems)
		return -EINVAL;
	if (ns->used_sems + nsems > ns->sc_semmns)
		return -ENOSPC;

	sma = sem_alloc(nsems);
	if (!sma)
		return -ENOMEM;

	sma->sem_perm.mode = (semflg & S_IRWXUGO);
	sma->sem_perm.key = key;

	sma->sem_perm.security = NULL;
	retval = security_sem_alloc(&sma->sem_perm);
	if (retval) {
		kvfree(sma);
		return retval;
	}

	for (i = 0; i < nsems; i++) {
		INIT_LIST_HEAD(&sma->sems[i].pending_alter);
		INIT_LIST_HEAD(&sma->sems[i].pending_const);
		spin_lock_init(&sma->sems[i].lock);
	}

	sma->complex_count = 0;
	/* A new array starts in global lock mode. */
	sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
	INIT_LIST_HEAD(&sma->pending_alter);
	INIT_LIST_HEAD(&sma->pending_const);
	INIT_LIST_HEAD(&sma->list_id);
	sma->sem_nsems = nsems;
	sma->sem_ctime = ktime_get_real_seconds();

	/* ipc_addid() locks sma upon success. */
	retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
	if (retval < 0) {
		ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
		return retval;
	}
	ns->used_sems += nsems;

	sem_unlock(sma, -1);
	rcu_read_unlock();

	return sma->sem_perm.id;
}


/*
 * Called with sem_ids.rwsem and ipcp locked.
 *
 * Rejects a request that asks for more semaphores than the existing
 * set contains.
 */
static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params)
{
	struct sem_array *sma;

	sma = container_of(ipcp, struct sem_array, sem_perm);
	if (params->u.nsems > sma->sem_nsems)
		return -EINVAL;

	return 0;
}

/*
 * Kernel-side implementation of semget(): validates nsems against the
 * namespace's SEMMSL limit and hands the request to ipcget().
 */
long ksys_semget(key_t key, int nsems, int semflg)
{
	struct ipc_namespace *ns;
	static const struct ipc_ops sem_ops = {
		.getnew = newary,
		.associate = security_sem_associate,
		.more_checks = sem_more_checks,
	};
	struct ipc_params sem_params;

	ns = current->nsproxy->ipc_ns;

	if (nsems < 0 || nsems > ns->sc_semmsl)
		return -EINVAL;

	sem_params.key = key;
	sem_params.flg = semflg;
	sem_params.u.nsems = nsems;

	return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
}

SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
{
	return ksys_semget(key, nsems, semflg);
}

/**
 * perform_atomic_semop[_slow] - Attempt to perform semaphore
 *                               operations on a given array.
 * @sma: semaphore array
 * @q: struct sem_queue that describes the operation
 *
 * The caller blocks in the following cases, based on the value
 * indicated by the semaphore operation (sem_op):
 *
 * (1) >0 never blocks.
* (2)  0 (wait-for-zero operation): semval is non-zero.
 * (3) <0 attempting to decrement semval to a value smaller than zero.
 *
 * Returns 0 if the operation was possible.
 * Returns 1 if the operation is impossible, the caller must sleep.
 * Returns <0 for error codes.
 */
static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
{
	int result, sem_op, nsops;
	struct pid *pid;
	struct sembuf *sop;
	struct sem *curr;
	struct sembuf *sops;
	struct sem_undo *un;

	sops = q->sops;
	nsops = q->nsops;
	un = q->undo;

	/* Apply the operations one by one; roll back on the first failure. */
	for (sop = sops; sop < sops + nsops; sop++) {
		int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);
		curr = &sma->sems[idx];
		sem_op = sop->sem_op;
		result = curr->semval;

		if (!sem_op && result)
			goto would_block;

		result += sem_op;
		if (result < 0)
			goto would_block;
		if (result > SEMVMX)
			goto out_of_range;

		if (sop->sem_flg & SEM_UNDO) {
			int undo = un->semadj[sop->sem_num] - sem_op;
			/* Exceeding the undo range is an error. */
			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
				goto out_of_range;
			un->semadj[sop->sem_num] = undo;
		}

		curr->semval = result;
	}

	/* All operations succeeded: record the pid on every touched semaphore. */
	sop--;
	pid = q->pid;
	while (sop >= sops) {
		ipc_update_pid(&sma->sems[sop->sem_num].sempid, pid);
		sop--;
	}

	return 0;

out_of_range:
	result = -ERANGE;
	goto undo;

would_block:
	q->blocking = sop;

	if (sop->sem_flg & IPC_NOWAIT)
		result = -EAGAIN;
	else
		result = 1;

undo:
	/* Revert, in reverse order, the operations applied before the failing one. */
	sop--;
	while (sop >= sops) {
		sem_op = sop->sem_op;
		sma->sems[sop->sem_num].semval -= sem_op;
		if (sop->sem_flg & SEM_UNDO)
			un->semadj[sop->sem_num] += sem_op;
		sop--;
	}

	return result;
}

/*
 * Variant of perform_atomic_semop_slow() that checks all operations
 * before writing anything. Falls back to the slow variant when q->dupsop
 * is set. Return values are those of perform_atomic_semop_slow().
 */
static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
{
	int result, sem_op, nsops;
	struct sembuf *sop;
	struct sem *curr;
	struct sembuf *sops;
	struct sem_undo *un;

	sops = q->sops;
	nsops = q->nsops;
	un = q->undo;

	if (unlikely(q->dupsop))
		return perform_atomic_semop_slow(sma, q);

	/*
	 * We scan the semaphore set twice, first to ensure that the entire
	 * operation can succeed, therefore avoiding any pointless writes
	 * to shared memory and having to undo such changes in order to block
	 * until the operations can go through.
	 */
	for (sop = sops; sop < sops + nsops; sop++) {
		int idx = array_index_nospec(sop->sem_num, sma->sem_nsems);

		curr = &sma->sems[idx];
		sem_op = sop->sem_op;
		result = curr->semval;

		if (!sem_op && result)
			goto would_block; /* wait-for-zero */

		result += sem_op;
		if (result < 0)
			goto would_block;

		if (result > SEMVMX)
			return -ERANGE;

		if (sop->sem_flg & SEM_UNDO) {
			int undo = un->semadj[sop->sem_num] - sem_op;

			/* Exceeding the undo range is an error. */
			if (undo < (-SEMAEM - 1) || undo > SEMAEM)
				return -ERANGE;
		}
	}

	/* Second pass: every check passed, now commit the changes. */
	for (sop = sops; sop < sops + nsops; sop++) {
		curr = &sma->sems[sop->sem_num];
		sem_op = sop->sem_op;

		if (sop->sem_flg & SEM_UNDO) {
			int undo = un->semadj[sop->sem_num] - sem_op;

			un->semadj[sop->sem_num] = undo;
		}
		curr->semval += sem_op;
		ipc_update_pid(&curr->sempid, q->pid);
	}

	return 0;

would_block:
	q->blocking = sop;
	return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
}

/*
 * Publish the result code @error in q->status and queue the sleeping task
 * on @wake_q. The wakeup itself is issued later by the caller's wake_up_q().
 */
static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
					     struct wake_q_head *wake_q)
{
	struct task_struct *sleeper;

	sleeper = get_task_struct(q->sleeper);

	/* see SEM_BARRIER_2 for purpose/pairing */
	smp_store_release(&q->status, error);

	wake_q_add_safe(wake_q, sleeper);
}

/* Remove @q from its pending list; complex (multi-sop) entries are counted. */
static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
{
	list_del(&q->list);
	if (q->nsops > 1)
		sma->complex_count--;
}

/**
 * check_restart - decide whether update_queue() must rescan its list
 * @sma: semaphore array
 * @q: the operation that just completed
 *
 * update_queue is O(N^2) when it restarts scanning the whole queue of
 * waiting operations. Therefore this function checks if the restart is
 * really necessary. It is called after a previously waiting operation
 * modified the array.
 * Note that wait-for-zero operations are handled without restart.
*/
static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
{
	/*
	 * Two situations are too difficult to analyse and force a rescan:
	 *  - q was itself a sleeping complex (multi-sop) operation;
	 *  - complex alter operations are pending on the array-wide queue.
	 */
	if (q->nsops > 1 || !list_empty(&sma->pending_alter))
		return 1;

	/*
	 * Otherwise nobody can be waiting for the new value:
	 * - complex operations always restart (handled above).
	 * - wait-for-zero operations are handled separately.
	 * - q is a previously sleeping simple operation that altered the
	 *   array, so it must be a decrement: simple increments never sleep.
	 * - Older (higher priority) decrements still in the queue have seen
	 *   the original semval and could not proceed. q decremented the
	 *   value, so they cannot proceed now either.
	 */
	return 0;
}

/**
 * wake_const_ops - wake up non-alter tasks
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @wake_q: lockless wake-queue head.
 *
 * wake_const_ops must be called after a semaphore in a semaphore array
 * was set to 0. If complex const operations are pending, wake_const_ops must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @wake_q. The return code
 * is stored in q->status.
 * The function returns 1 if at least one operation was completed successfully.
*/
static int wake_const_ops(struct sem_array *sma, int semnum,
			  struct wake_q_head *wake_q)
{
	struct sem_queue *q, *tmp;
	struct list_head *pending_list;
	int semop_completed = 0;

	/* semnum == -1 selects the array-wide queue of complex const ops. */
	if (semnum == -1)
		pending_list = &sma->pending_const;
	else
		pending_list = &sma->sems[semnum].pending_const;

	list_for_each_entry_safe(q, tmp, pending_list, list) {
		int error = perform_atomic_semop(sma, q);

		/* error > 0: the operation would still block, leave it queued */
		if (error > 0)
			continue;
		/* operation completed, remove from queue & wakeup */
		unlink_queue(sma, q);

		wake_up_sem_queue_prepare(q, error, wake_q);
		if (error == 0)
			semop_completed = 1;
	}

	return semop_completed;
}

/**
 * do_smart_wakeup_zero - wakeup all wait for zero tasks
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @wake_q: lockless wake-queue head
 *
 * Checks all required queue for wait-for-zero operations, based
 * on the actual changes that were performed on the semaphore array.
 * The function returns 1 if at least one operation was completed successfully.
 */
static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
				int nsops, struct wake_q_head *wake_q)
{
	int i;
	int semop_completed = 0;
	int got_zero = 0;

	/* first: the per-semaphore queues, if known */
	if (sops) {
		for (i = 0; i < nsops; i++) {
			int num = sops[i].sem_num;

			if (sma->sems[num].semval == 0) {
				got_zero = 1;
				semop_completed |= wake_const_ops(sma, num, wake_q);
			}
		}
	} else {
		/*
		 * No sops means modified semaphores not known.
		 * Assume all were changed.
		 */
		for (i = 0; i < sma->sem_nsems; i++) {
			if (sma->sems[i].semval == 0) {
				got_zero = 1;
				semop_completed |= wake_const_ops(sma, i, wake_q);
			}
		}
	}
	/*
	 * If one of the modified semaphores got 0,
	 * then check the global queue, too.
	 */
	if (got_zero)
		semop_completed |= wake_const_ops(sma, -1, wake_q);

	return semop_completed;
}


/**
 * update_queue - look for tasks that can be completed.
 * @sma: semaphore array.
 * @semnum: semaphore that was modified.
 * @wake_q: lockless wake-queue head.
*
 * update_queue must be called after a semaphore in a semaphore array
 * was modified. If multiple semaphores were modified, update_queue must
 * be called with semnum = -1, as well as with the number of each modified
 * semaphore.
 * The tasks that must be woken up are added to @wake_q. The return code
 * is stored in q->status.
 * The function internally checks if const operations can now succeed.
 *
 * The function returns 1 if at least one semop was completed successfully.
 */
static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
{
	struct sem_queue *q, *tmp;
	struct list_head *pending_list;
	int semop_completed = 0;

	if (semnum == -1)
		pending_list = &sma->pending_alter;
	else
		pending_list = &sma->sems[semnum].pending_alter;

again:
	list_for_each_entry_safe(q, tmp, pending_list, list) {
		int error, restart;

		/* If we are scanning the single sop, per-semaphore list of
		 * one semaphore and that semaphore is 0, then it is not
		 * necessary to scan further: simple increments
		 * that affect only one entry succeed immediately and cannot
		 * be in the per semaphore pending queue, and decrements
		 * cannot be successful if the value is already 0.
		 */
		if (semnum != -1 && sma->sems[semnum].semval == 0)
			break;

		error = perform_atomic_semop(sma, q);

		/* Does q->sleeper still need to sleep? */
		if (error > 0)
			continue;

		unlink_queue(sma, q);

		if (error) {
			/* The operation failed and changed nothing: no rescan. */
			restart = 0;
		} else {
			semop_completed = 1;
			do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
			restart = check_restart(sma, q);
		}

		wake_up_sem_queue_prepare(q, error, wake_q);
		if (restart)
			goto again;
	}
	return semop_completed;
}

/**
 * set_semotime - set sem_otime
 * @sma: semaphore array
 * @sops: operations that modified the array, may be NULL
 *
 * sem_otime is replicated to avoid cache line thrashing.
 * This function sets one instance to the current time.
*/
static void set_semotime(struct sem_array *sma, struct sembuf *sops)
{
	if (sops == NULL) {
		/* Modified semaphores unknown: use the instance of semaphore 0. */
		sma->sems[0].sem_otime = ktime_get_real_seconds();
	} else {
		sma->sems[sops[0].sem_num].sem_otime =
						ktime_get_real_seconds();
	}
}

/**
 * do_smart_update - optimized update_queue
 * @sma: semaphore array
 * @sops: operations that were performed
 * @nsops: number of operations
 * @otime: force setting otime
 * @wake_q: lockless wake-queue head
 *
 * do_smart_update() does the required calls to update_queue and wakeup_zero,
 * based on the actual changes that were performed on the semaphore array.
 * Note that the function does not do the actual wake-up: the caller is
 * responsible for calling wake_up_q().
 * It is safe to perform the wake_up_q() call after dropping all locks.
 */
static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
			    int otime, struct wake_q_head *wake_q)
{
	int i;

	otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);

	if (!list_empty(&sma->pending_alter)) {
		/* semaphore array uses the global queue - just process it. */
		otime |= update_queue(sma, -1, wake_q);
	} else {
		if (!sops) {
			/*
			 * No sops, thus the modified semaphores are not
			 * known. Check all.
			 */
			for (i = 0; i < sma->sem_nsems; i++)
				otime |= update_queue(sma, i, wake_q);
		} else {
			/*
			 * Check the semaphores that were increased:
			 * - No complex ops, thus all sleeping ops are
			 *   decrease.
			 * - if we decreased the value, then any sleeping
			 *   semaphore ops won't be able to run: If the
			 *   previous value was too small, then the new
			 *   value will be too small, too.
			 */
			for (i = 0; i < nsops; i++) {
				if (sops[i].sem_op > 0) {
					otime |= update_queue(sma,
							      sops[i].sem_num, wake_q);
				}
			}
		}
	}
	if (otime)
		set_semotime(sma, sops);
}

/*
 * check_qop: Test if a queued operation sleeps on the semaphore semnum
 *
 * Returns 1 if the blocking operation of @q targets @semnum and is of the
 * requested kind (wait-for-zero when @count_zero, decrement otherwise),
 * 0 if not.
 */
static int check_qop(struct sem_array *sma, int semnum, struct sem_queue *q,
			bool count_zero)
{
	struct sembuf *sop = q->blocking;

	/*
	 * Linux always (since 0.99.10) reported a task as sleeping on all
	 * semaphores. This violates SUS, therefore it was changed to the
	 * standard compliant behavior.
	 * Give the administrators a chance to notice that an application
	 * might misbehave because it relies on the Linux behavior.
	 */
	pr_info_once("semctl(GETNCNT/GETZCNT) is since 3.16 Single Unix Specification compliant.\n"
			"The task %s (%d) triggered the difference, watch for misbehavior.\n",
			current->comm, task_pid_nr(current));

	if (sop->sem_num != semnum)
		return 0;

	if (count_zero && sop->sem_op == 0)
		return 1;
	if (!count_zero && sop->sem_op < 0)
		return 1;

	return 0;
}

/* The following counts are associated to each semaphore:
 *   semncnt        number of tasks waiting on semval being nonzero
 *   semzcnt        number of tasks waiting on semval being zero
 *
 * Per definition, a task waits only on the semaphore of the first semop
 * that cannot proceed, even if additional operation would block, too.
 */
static int count_semcnt(struct sem_array *sma, ushort semnum,
			bool count_zero)
{
	struct list_head *l;
	struct sem_queue *q;
	int semcnt;

	semcnt = 0;
	/* First: check the simple operations. They are easy to evaluate */
	if (count_zero)
		l = &sma->sems[semnum].pending_const;
	else
		l = &sma->sems[semnum].pending_alter;

	list_for_each_entry(q, l, list) {
		/* all task on a per-semaphore list sleep on exactly
		 * that semaphore
		 */
		semcnt++;
	}

	/* Then: check the complex operations. */
	list_for_each_entry(q, &sma->pending_alter, list) {
		semcnt += check_qop(sma, semnum, q, count_zero);
	}
	if (count_zero) {
		list_for_each_entry(q, &sma->pending_const, list) {
			semcnt += check_qop(sma, semnum, q, count_zero);
		}
	}
	return semcnt;
}

/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked
 * as a writer and the spinlock for this semaphore set held. sem_ids.rwsem
 * remains locked on exit.
*/
static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{
	struct sem_undo *un, *tu;
	struct sem_queue *q, *tq;
	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
	int i;
	DEFINE_WAKE_Q(wake_q);

	/* Free the existing undo structures for this semaphore set.  */
	ipc_assert_locked_object(&sma->sem_perm);
	list_for_each_entry_safe(un, tu, &sma->list_id, list_id) {
		list_del(&un->list_id);
		spin_lock(&un->ulp->lock);
		/* semid == -1 marks the undo structure as detached. */
		un->semid = -1;
		list_del_rcu(&un->list_proc);
		spin_unlock(&un->ulp->lock);
		kvfree_rcu(un, rcu);
	}

	/* Wake up all pending processes and let them fail with EIDRM. */
	list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
		unlink_queue(sma, q);
		wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
	}

	list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
		unlink_queue(sma, q);
		wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
	}
	for (i = 0; i < sma->sem_nsems; i++) {
		struct sem *sem = &sma->sems[i];
		list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
			unlink_queue(sma, q);
			wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
		}
		list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
			unlink_queue(sma, q);
			wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
		}
		ipc_update_pid(&sem->sempid, NULL);
	}

	/* Remove the semaphore set from the IDR */
	sem_rmid(ns, sma);
	sem_unlock(sma, -1);
	rcu_read_unlock();

	/* The actual wakeups happen only after all locks were dropped. */
	wake_up_q(&wake_q);
	ns->used_sems -= sma->sem_nsems;
	ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
}

/*
 * Copy a semid64_ds to user space in the layout selected by @version:
 * as is for IPC_64, converted to the old struct semid_ds for IPC_OLD.
 * Returns the copy_to_user() result, or -EINVAL for an unknown version.
 */
static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version)
{
	switch (version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct semid_ds out;

		/* Zero the structure first so that no stale stack data is copied out. */
		memset(&out, 0, sizeof(out));

		ipc64_perm_to_ipc_perm(&in->sem_perm, &out.sem_perm);

		out.sem_otime	= in->sem_otime;
		out.sem_ctime	= in->sem_ctime;
		out.sem_nsems	= in->sem_nsems;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}

/* Return the largest of the per-semaphore sem_otime values of @sma. */
static time64_t get_semotime(struct sem_array *sma)
{
	int i;
	time64_t res;

	res = sma->sems[0].sem_otime;
	for (i = 1; i < sma->sem_nsems; i++) {
		time64_t to = sma->sems[i].sem_otime;

		if (to > res)
			res = to;
	}
	return res;
}

/*
 * Fill @semid64 for the IPC_STAT, SEM_STAT and SEM_STAT_ANY commands.
 *
 * Returns 0 for IPC_STAT, the full id of the array for SEM_STAT and
 * SEM_STAT_ANY, or a negative error code.
 */
static int semctl_stat(struct ipc_namespace *ns, int semid,
			 int cmd, struct semid64_ds *semid64)
{
	struct sem_array *sma;
	time64_t semotime;
	int err;

	memset(semid64, 0, sizeof(*semid64));

	rcu_read_lock();
	if (cmd == SEM_STAT || cmd == SEM_STAT_ANY) {
		sma = sem_obtain_object(ns, semid);
		if (IS_ERR(sma)) {
			err = PTR_ERR(sma);
			goto out_unlock;
		}
	} else { /* IPC_STAT */
		sma = sem_obtain_object_check(ns, semid);
		if (IS_ERR(sma)) {
			err = PTR_ERR(sma);
			goto out_unlock;
		}
	}

	/* see comment for SHM_STAT_ANY */
	if (cmd == SEM_STAT_ANY)
		audit_ipc_obj(&sma->sem_perm);
	else {
		err = -EACCES;
		if (ipcperms(ns, &sma->sem_perm, S_IRUGO))
			goto out_unlock;
	}

	err = security_sem_semctl(&sma->sem_perm, cmd);
	if (err)
		goto out_unlock;

	ipc_lock_object(&sma->sem_perm);

	/* The array may have been removed before the lock was taken. */
	if (!ipc_valid_object(&sma->sem_perm)) {
		ipc_unlock_object(&sma->sem_perm);
		err = -EIDRM;
		goto out_unlock;
	}

	kernel_to_ipc64_perm(&sma->sem_perm, &semid64->sem_perm);
	semotime = get_semotime(sma);
	semid64->sem_otime = semotime;
	semid64->sem_ctime = sma->sem_ctime;
#ifndef CONFIG_64BIT
	semid64->sem_otime_high = semotime >> 32;
	semid64->sem_ctime_high = sma->sem_ctime >> 32;
#endif
	semid64->sem_nsems = sma->sem_nsems;

	if (cmd == IPC_STAT) {
		/*
		 * As defined in SUS:
		 * Return 0 on success
		 */
		err = 0;
	} else {
		/*
		 * SEM_STAT and SEM_STAT_ANY (both Linux specific)
		 * Return the full id, including the sequence number
		 */
		err = sma->sem_perm.id;
	}
	ipc_unlock_object(&sma->sem_perm);
out_unlock:
	rcu_read_unlock();
	return err;
}

/*
 * Report the semaphore limits of @ns to user space. For SEM_INFO the
 * semusz and semaem fields carry current usage (arrays in use, semaphores
 * in use); for any other command they carry the constants SEMUSZ and SEMAEM.
 *
 * Returns the highest index in use (0 if there is none), or a negative
 * error code.
 */
static int semctl_info(struct ipc_namespace *ns, int semid,
			 int cmd, void __user *p)
{
	struct seminfo seminfo;
	int max_idx;
	int err;

	err = security_sem_semctl(NULL, cmd);
	if (err)
		return err;

	memset(&seminfo, 0, sizeof(seminfo));
	seminfo.semmni = ns->sc_semmni;
	seminfo.semmns = ns->sc_semmns;
	seminfo.semmsl = ns->sc_semmsl;
	seminfo.semopm = ns->sc_semopm;
	seminfo.semvmx = SEMVMX;
	seminfo.semmnu = SEMMNU;
	seminfo.semmap = SEMMAP;
	seminfo.semume = SEMUME;
	down_read(&sem_ids(ns).rwsem);
	if (cmd == SEM_INFO) {
		seminfo.semusz = sem_ids(ns).in_use;
		seminfo.semaem = ns->used_sems;
	} else {
		seminfo.semusz = SEMUSZ;
		seminfo.semaem = SEMAEM;
	}
	max_idx = ipc_get_maxidx(&sem_ids(ns));
	up_read(&sem_ids(ns).rwsem);
	if (copy_to_user(p, &seminfo, sizeof(struct seminfo)))
		return -EFAULT;
	return (max_idx < 0) ? 0 : max_idx;
}

/*
 * Implement SETVAL: set semaphore @semnum of array @semid to @val, clear
 * the matching undo adjustment of every undo structure on the array and
 * complete any sleeping operation that can now proceed.
 *
 * Returns 0 on success, -ERANGE if @val is outside 0..SEMVMX, -EINVAL for
 * a bad @semnum, -EACCES on a permission failure, -EIDRM if the array
 * was removed, or the error from looking up @semid.
 */
static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
		int val)
{
	struct sem_undo *un;
	struct sem_array *sma;
	struct sem *curr;
	int err;
	DEFINE_WAKE_Q(wake_q);

	if (val > SEMVMX || val < 0)
		return -ERANGE;

	rcu_read_lock();
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return PTR_ERR(sma);
	}

	if (semnum < 0 || semnum >= sma->sem_nsems) {
		rcu_read_unlock();
		return -EINVAL;
	}


	if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) {
		rcu_read_unlock();
		return -EACCES;
	}

	/*
	 * NOTE(review): the error code of the security hook is replaced by
	 * -EACCES here, whereas semctl_main() passes it through unchanged -
	 * confirm whether this is intended.
	 */
	err = security_sem_semctl(&sma->sem_perm, SETVAL);
	if (err) {
		rcu_read_unlock();
		return -EACCES;
	}

	sem_lock(sma, NULL, -1);

	if (!ipc_valid_object(&sma->sem_perm)) {
		sem_unlock(sma, -1);
		rcu_read_unlock();
		return -EIDRM;
	}

	semnum = array_index_nospec(semnum, sma->sem_nsems);
	curr = &sma->sems[semnum];

	ipc_assert_locked_object(&sma->sem_perm);
	list_for_each_entry(un, &sma->list_id, list_id)
		un->semadj[semnum] = 0;

	curr->semval = val;
	ipc_update_pid(&curr->sempid, task_tgid(current));
	sma->sem_ctime = ktime_get_real_seconds();
	/* maybe some queued-up processes were waiting for this */
	do_smart_update(sma, NULL, 0, 0, &wake_q);
	sem_unlock(sma, -1);
	rcu_read_unlock();
	wake_up_q(&wake_q);
	return 0;
}

/*
 * Implement the GETALL, SETALL, GETVAL, GETPID, GETNCNT and GETZCNT
 * commands.
 *
 * Arrays with up to SEMMSL_FAST semaphores use an on-stack buffer for
 * GETALL/SETALL; larger ones use a temporary allocation, which is made
 * with all locks dropped while a reference keeps the array alive.
 *
 * Returns the requested value (GETVAL, GETPID, GETNCNT, GETZCNT), 0
 * (GETALL, SETALL) or a negative error code.
 */
static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
		int cmd, void __user *p)
{
	struct sem_array *sma;
	struct sem *curr;
	int err, nsems;
	ushort fast_sem_io[SEMMSL_FAST];
	ushort *sem_io = fast_sem_io;
	DEFINE_WAKE_Q(wake_q);

	rcu_read_lock();
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return PTR_ERR(sma);
	}

	nsems = sma->sem_nsems;

	err = -EACCES;
	if (ipcperms(ns, &sma->sem_perm, cmd == SETALL ? S_IWUGO : S_IRUGO))
		goto out_rcu_wakeup;

	err = security_sem_semctl(&sma->sem_perm, cmd);
	if (err)
		goto out_rcu_wakeup;

	switch (cmd) {
	case GETALL:
	{
		ushort __user *array = p;
		int i;

		sem_lock(sma, NULL, -1);
		if (!ipc_valid_object(&sma->sem_perm)) {
			err = -EIDRM;
			goto out_unlock;
		}
		if (nsems > SEMMSL_FAST) {
			/* Pin the array, then allocate without holding locks. */
			if (!ipc_rcu_getref(&sma->sem_perm)) {
				err = -EIDRM;
				goto out_unlock;
			}
			sem_unlock(sma, -1);
			rcu_read_unlock();
			sem_io = kvmalloc_array(nsems, sizeof(ushort),
						GFP_KERNEL);
			if (sem_io == NULL) {
				ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				return -ENOMEM;
			}

			rcu_read_lock();
			sem_lock_and_putref(sma);
			/* The array may have been removed while unlocked. */
			if (!ipc_valid_object(&sma->sem_perm)) {
				err = -EIDRM;
				goto out_unlock;
			}
		}
		for (i = 0; i < sma->sem_nsems; i++)
			sem_io[i] = sma->sems[i].semval;
		sem_unlock(sma, -1);
		rcu_read_unlock();
		err = 0;
		if (copy_to_user(array, sem_io, nsems*sizeof(ushort)))
			err = -EFAULT;
		goto out_free;
	}
	case SETALL:
	{
		int i;
		struct sem_undo *un;

		/* Pin the array: the user copy below runs without locks. */
		if (!ipc_rcu_getref(&sma->sem_perm)) {
			err = -EIDRM;
			goto out_rcu_wakeup;
		}
		rcu_read_unlock();

		if (nsems > SEMMSL_FAST) {
			sem_io = kvmalloc_array(nsems, sizeof(ushort),
						GFP_KERNEL);
			if (sem_io == NULL) {
				ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				return -ENOMEM;
			}
		}

		if (copy_from_user(sem_io, p, nsems*sizeof(ushort))) {
			ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
			err = -EFAULT;
			goto out_free;
		}

		for (i = 0; i < nsems; i++) {
			if (sem_io[i] > SEMVMX) {
				ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
				err = -ERANGE;
				goto out_free;
			}
		}
		rcu_read_lock();
		sem_lock_and_putref(sma);
		if (!ipc_valid_object(&sma->sem_perm)) {
			err = -EIDRM;
			goto out_unlock;
		}

		for (i = 0; i < nsems; i++) {
			sma->sems[i].semval = sem_io[i];
			ipc_update_pid(&sma->sems[i].sempid, task_tgid(current));
		}

		ipc_assert_locked_object(&sma->sem_perm);
		list_for_each_entry(un, &sma->list_id, list_id) {
			for (i = 0; i < nsems; i++)
				un->semadj[i] = 0;
		}
		sma->sem_ctime = ktime_get_real_seconds();
		/* maybe some queued-up processes were waiting for this */
		do_smart_update(sma, NULL, 0, 0, &wake_q);
		err = 0;
		goto out_unlock;
	}
	/* GETVAL, GETPID, GETNCNT, GETZCNT: fall-through */
	}
	err = -EINVAL;
	if (semnum < 0 || semnum >= nsems)
		goto out_rcu_wakeup;

	sem_lock(sma, NULL, -1);
	if (!ipc_valid_object(&sma->sem_perm)) {
		err = -EIDRM;
		goto out_unlock;
	}

	semnum = array_index_nospec(semnum, nsems);
	curr = &sma->sems[semnum];

	switch (cmd) {
	case GETVAL:
		err = curr->semval;
		goto out_unlock;
	case GETPID:
		err = pid_vnr(curr->sempid);
		goto out_unlock;
	case GETNCNT:
		err = count_semcnt(sma, semnum, 0);
		goto out_unlock;
	case GETZCNT:
		err = count_semcnt(sma, semnum, 1);
		goto out_unlock;
	}

out_unlock:
	sem_unlock(sma, -1);
out_rcu_wakeup:
	rcu_read_unlock();
	wake_up_q(&wake_q);
out_free:
	if (sem_io != fast_sem_io)
		kvfree(sem_io);
	return err;
}

/*
 * Read a semid_ds from user space in the layout selected by @version.
 * For IPC_OLD only the uid, gid and mode of sem_perm are transferred.
 * Returns 0 on success, -EFAULT if the copy fails, -EINVAL for an
 * unknown version.
 */
static inline unsigned long
copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
{
	switch (version) {
	case IPC_64:
		if (copy_from_user(out, buf, sizeof(*out)))
			return -EFAULT;
		return 0;
	case IPC_OLD:
	    {
		struct semid_ds tbuf_old;

		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
			return -EFAULT;

		out->sem_perm.uid	= tbuf_old.sem_perm.uid;
		out->sem_perm.gid	= tbuf_old.sem_perm.gid;
		out->sem_perm.mode	= tbuf_old.sem_perm.mode;

		return 0;
	    }
	default:
		return -EINVAL;
	}
}

/*
 * This function handles some semctl commands which require the rwsem
 * to be held in write mode.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
*/
/*
 * Handles IPC_RMID and IPC_SET; any other @cmd yields -EINVAL.
 * @semid64 is only read for IPC_SET (its sem_perm is handed to
 * ipcctl_obtain_check() and ipc_update_perm()).
 *
 * Exit labels, from most to least state held:
 *   out_unlock0 - sem_lock, rcu and rwsem held
 *   out_unlock1 - rcu and rwsem held
 *   out_up      - only the rwsem held (freeary() already dropped the rest)
 */
static int semctl_down(struct ipc_namespace *ns, int semid,
		       int cmd, struct semid64_ds *semid64)
{
	struct sem_array *sma;
	int err;
	struct kern_ipc_perm *ipcp;

	down_write(&sem_ids(ns).rwsem);
	rcu_read_lock();

	ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd,
				      &semid64->sem_perm, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;
	}

	sma = container_of(ipcp, struct sem_array, sem_perm);

	err = security_sem_semctl(&sma->sem_perm, cmd);
	if (err)
		goto out_unlock1;

	switch (cmd) {
	case IPC_RMID:
		sem_lock(sma, NULL, -1);
		/* freeary unlocks the ipc object and rcu */
		freeary(ns, ipcp);
		goto out_up;
	case IPC_SET:
		sem_lock(sma, NULL, -1);
		err = ipc_update_perm(&semid64->sem_perm, ipcp);
		if (err)
			goto out_unlock0;
		sma->sem_ctime = ktime_get_real_seconds();
		break;
	default:
		err = -EINVAL;
		goto out_unlock1;
	}

out_unlock0:
	sem_unlock(sma, -1);
out_unlock1:
	rcu_read_unlock();
out_up:
	up_write(&sem_ids(ns).rwsem);
	return err;
}

/*
 * ksys_semctl - native semctl dispatcher.
 *
 * Rejects negative ids, then routes @cmd to the matching helper using the
 * caller's IPC namespace.  @arg is a user pointer for every command except
 * SETVAL, where it carries the integer value itself.  @version (IPC_64 or
 * IPC_OLD) selects the user-space layout of the semid structure.
 */
static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version)
{
	struct ipc_namespace *ns;
	void __user *p = (void __user *)arg;
	struct semid64_ds semid64;
	int err;

	if (semid < 0)
		return -EINVAL;

	ns = current->nsproxy->ipc_ns;

	switch (cmd) {
	case IPC_INFO:
	case SEM_INFO:
		return semctl_info(ns, semid, cmd, p);
	case IPC_STAT:
	case SEM_STAT:
	case SEM_STAT_ANY:
		/* a non-negative result is kept and returned after the copy */
		err = semctl_stat(ns, semid, cmd, &semid64);
		if (err < 0)
			return err;
		if (copy_semid_to_user(p, &semid64, version))
			err = -EFAULT;
		return err;
	case GETALL:
	case GETVAL:
	case GETPID:
	case GETNCNT:
	case GETZCNT:
	case SETALL:
		return semctl_main(ns, semid, semnum, cmd, p);
	case SETVAL: {
		int val;
#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
		/* big-endian 64bit */
		val = arg >> 32;
#else
		/* 32bit or little-endian 64bit */
		val = arg;
#endif
		return semctl_setval(ns, semid, semnum, val);
	}
	case IPC_SET:
		/* any copy failure, whatever its code, is reported as -EFAULT */
		if (copy_semid_from_user(&semid64, p, version))
			return -EFAULT;
		fallthrough;
	case IPC_RMID:
		return semctl_down(ns, semid, cmd, &semid64);
	default:
		return -EINVAL;
	}
}

/* semctl(2): always uses the IPC_64 structure layout. */
SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
{
	return ksys_semctl(semid, semnum, cmd, arg, IPC_64);
}

#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION
/*
 * Variant that derives the layout version from @cmd via ipc_parse_version(),
 * which receives &cmd and may therefore modify it.
 */
long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg)
{
	int version = ipc_parse_version(&cmd);

	return ksys_semctl(semid, semnum, cmd, arg, version);
}

SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg)
{
	return ksys_old_semctl(semid, semnum, cmd, arg);
}
#endif

#ifdef CONFIG_COMPAT

/*
 * Compat layout of the old-style semid_ds.  The copy helpers below only
 * touch sem_perm, sem_otime, sem_ctime and sem_nsems.
 */
struct compat_semid_ds {
	struct compat_ipc_perm sem_perm;
	old_time32_t sem_otime;
	old_time32_t sem_ctime;
	compat_uptr_t sem_base;
	compat_uptr_t sem_pending;
	compat_uptr_t sem_pending_last;
	compat_uptr_t undo;
	unsigned short sem_nsems;
};

/*
 * Zero @out and read only the permission part of the compat structure at
 * @buf, using the layout selected by @version.  Returns whatever the
 * get_compat_ipc*_perm() helper returns.
 */
static int copy_compat_semid_from_user(struct semid64_ds *out, void __user *buf,
					int version)
{
	memset(out, 0, sizeof(*out));
	if (version == IPC_64) {
		struct compat_semid64_ds __user *p = buf;

		return get_compat_ipc64_perm(&out->sem_perm, &p->sem_perm);
	} else {
		struct compat_semid_ds __user *p = buf;

		return get_compat_ipc_perm(&out->sem_perm, &p->sem_perm);
	}
}

/*
 * Convert @in to the compat layout selected by @version and copy it to @buf.
 * The IPC_64 layout stores each timestamp as separate low/high 32-bit
 * halves; the old layout stores it in a single old_time32_t field.
 * Returns the copy_to_user() result (non-zero on failure).
 */
static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in,
					int version)
{
	if (version == IPC_64) {
		struct compat_semid64_ds v;

		memset(&v, 0, sizeof(v));
		to_compat_ipc64_perm(&v.sem_perm, &in->sem_perm);
		v.sem_otime	 = lower_32_bits(in->sem_otime);
		v.sem_otime_high = upper_32_bits(in->sem_otime);
		v.sem_ctime	 = lower_32_bits(in->sem_ctime);
		v.sem_ctime_high = upper_32_bits(in->sem_ctime);
		v.sem_nsems = in->sem_nsems;
		return copy_to_user(buf, &v, sizeof(v));
	} else {
		struct compat_semid_ds v;

		memset(&v, 0, sizeof(v));
		to_compat_ipc_perm(&v.sem_perm, &in->sem_perm);
		v.sem_otime = in->sem_otime;
		v.sem_ctime = in->sem_ctime;
		v.sem_nsems = in->sem_nsems;
		return copy_to_user(buf, &v, sizeof(v));
	}
}

/*
 * compat_ksys_semctl - compat counterpart of ksys_semctl().
 *
 * @arg is a 32-bit value: converted with compat_ptr() for pointer commands
 * and passed through unchanged as the value for SETVAL.
 *
 * NOTE(review): the switch dispatches on cmd with IPC_64 masked off, but the
 * unmasked cmd is what gets forwarded to the helpers, which compare it
 * against the plain command values.  Presumably IPC_64 is never set here
 * (compat_ipc_parse_version() receives &cmd) - confirm against the callers.
 */
static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version)
{
	void __user *p = compat_ptr(arg);
	struct ipc_namespace *ns;
	struct semid64_ds semid64;
	int err;

	ns = current->nsproxy->ipc_ns;

	if (semid < 0)
		return -EINVAL;

	switch (cmd & (~IPC_64)) {
	case IPC_INFO:
	case SEM_INFO:
		return semctl_info(ns, semid, cmd, p);
	case IPC_STAT:
	case SEM_STAT:
	case SEM_STAT_ANY:
		err = semctl_stat(ns, semid, cmd, &semid64);
		if (err < 0)
			return err;
		if (copy_compat_semid_to_user(p, &semid64, version))
			err = -EFAULT;
		return err;
	case GETVAL:
	case GETPID:
	case GETNCNT:
	case GETZCNT:
	case GETALL:
	case SETALL:
		return semctl_main(ns, semid, semnum, cmd, p);
	case SETVAL:
		return semctl_setval(ns, semid, semnum, arg);
	case IPC_SET:
		if (copy_compat_semid_from_user(&semid64, p, version))
			return -EFAULT;
		fallthrough;
	case IPC_RMID:
		return semctl_down(ns, semid, cmd, &semid64);
	default:
		return -EINVAL;
	}
}

/* compat semctl(2): always uses the IPC_64 structure layout. */
COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg)
{
	return compat_ksys_semctl(semid, semnum, cmd, arg, IPC_64);
}

#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION
/* Compat variant that derives the layout version from @cmd. */
long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg)
{
	int version = compat_ipc_parse_version(&cmd);

	return compat_ksys_semctl(semid, semnum, cmd, arg, version);
}

COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg)
{
	return compat_ksys_old_semctl(semid, semnum, cmd, arg);
}
#endif
#endif

/* If the task doesn't already have an undo_list, then allocate one
 * here.  We guarantee there is only one thread using this undo list,
 * and current is THE ONE
 *
 * If this allocation and assignment succeeds, but later
 * portions of this code fail, there is no need to free the sem_undo_list.
 * Just let it stay associated with the task, and it'll be freed later
 * at exit time.
 *
 * This can block, so callers must hold no locks.
*/
/*
 * Stores current's undo list in *@undo_listp, creating it (refcount 1) on
 * first use.  Returns 0, or -ENOMEM if the allocation fails.
 */
static inline int get_undo_list(struct sem_undo_list **undo_listp)
{
	struct sem_undo_list *undo_list;

	undo_list = current->sysvsem.undo_list;
	if (!undo_list) {
		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT);
		if (undo_list == NULL)
			return -ENOMEM;
		spin_lock_init(&undo_list->lock);
		refcount_set(&undo_list->refcnt, 1);
		INIT_LIST_HEAD(&undo_list->list_proc);

		current->sysvsem.undo_list = undo_list;
	}
	*undo_listp = undo_list;
	return 0;
}

/*
 * Walk @ulp's per-process list for the undo entry belonging to @semid.
 * Returns the entry or NULL.  The list is traversed with the RCU iterator;
 * holding ulp->lock is passed as the alternative lockdep condition.
 */
static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
{
	struct sem_undo *un;

	list_for_each_entry_rcu(un, &ulp->list_proc, list_proc,
				spin_is_locked(&ulp->lock)) {
		if (un->semid == semid)
			return un;
	}
	return NULL;
}

/*
 * Like __lookup_undo(), but requires ulp->lock and moves a hit to the front
 * of the list.
 */
static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid)
{
	struct sem_undo *un;

	assert_spin_locked(&ulp->lock);

	un = __lookup_undo(ulp, semid);
	if (un) {
		list_del_rcu(&un->list_proc);
		list_add_rcu(&un->list_proc, &ulp->list_proc);
	}
	return un;
}

/**
 * find_alloc_undo - lookup (and if not present create) undo array
 * @ns: namespace
 * @semid: semaphore array id
 *
 * The function looks up (and if not present creates) the undo structure.
 * The size of the undo structure depends on the size of the semaphore
 * array, thus the alloc path is not that straightforward.
 * Lifetime-rules: sem_undo is rcu-protected, on success, the function
 * performs a rcu_read_lock().
 *
 * Return: the undo structure, or an ERR_PTR() (-ENOMEM, -EIDRM, or the
 * lookup error); on every error return the rcu read lock has been dropped.
 */
static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
{
	struct sem_array *sma;
	struct sem_undo_list *ulp;
	struct sem_undo *un, *new;
	int nsems, error;

	error = get_undo_list(&ulp);
	if (error)
		return ERR_PTR(error);

	rcu_read_lock();
	spin_lock(&ulp->lock);
	un = lookup_undo(ulp, semid);
	spin_unlock(&ulp->lock);
	if (likely(un != NULL))
		goto out;

	/* no undo structure around - allocate one. */
	/* step 1: figure out the size of the semaphore array */
	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		return ERR_CAST(sma);
	}

	nsems = sma->sem_nsems;
	/* pin the array so it can be used after rcu is dropped for the alloc */
	if (!ipc_rcu_getref(&sma->sem_perm)) {
		rcu_read_unlock();
		un = ERR_PTR(-EIDRM);
		goto out;
	}
	rcu_read_unlock();

	/* step 2: allocate new undo structure */
	new = kvzalloc(struct_size(new, semadj, nsems), GFP_KERNEL_ACCOUNT);
	if (!new) {
		ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
		return ERR_PTR(-ENOMEM);
	}

	/* step 3: Acquire the lock on semaphore array */
	rcu_read_lock();
	sem_lock_and_putref(sma);
	if (!ipc_valid_object(&sma->sem_perm)) {
		sem_unlock(sma, -1);
		rcu_read_unlock();
		kvfree(new);
		un = ERR_PTR(-EIDRM);
		goto out;
	}
	spin_lock(&ulp->lock);

	/*
	 * step 4: check for races: did someone else allocate the undo struct?
	 */
	un = lookup_undo(ulp, semid);
	if (un) {
		spin_unlock(&ulp->lock);
		kvfree(new);
		goto success;
	}
	/* step 5: initialize & link new undo structure */
	new->ulp = ulp;
	new->semid = semid;
	assert_spin_locked(&ulp->lock);
	list_add_rcu(&new->list_proc, &ulp->list_proc);
	ipc_assert_locked_object(&sma->sem_perm);
	list_add(&new->list_id, &sma->list_id);
	un = new;
	spin_unlock(&ulp->lock);
success:
	sem_unlock(sma, -1);
out:
	return un;
}

/*
 * __do_semtimedop - perform @nsops semaphore operations on array @semid.
 *
 * @sops is a kernel-space array.  @timeout, if non-NULL, is a relative
 * timeout that is converted to an absolute expiry; NULL means wait without
 * a timeout.
 *
 * The operations are first attempted with perform_atomic_semop().  A result
 * of 0 completes the call and a negative result fails it.  Otherwise the
 * request is queued and the task sleeps until queue.status is changed by
 * someone else, a signal is pending (-EINTR) or the timeout fires (-EAGAIN).
 *
 * Return: 0 on success or a negative errno (-EINVAL, -E2BIG, -EFBIG,
 * -EACCES, -EIDRM, -EINTR, -EAGAIN, or an error from the lookup, the undo
 * allocation or the security hook).
 */
long __do_semtimedop(int semid, struct sembuf *sops,
		unsigned nsops, const struct timespec64 *timeout,
		struct ipc_namespace *ns)
{
	int error = -EINVAL;
	struct sem_array *sma;
	struct sembuf *sop;
	struct sem_undo *un;
	int max, locknum;
	bool undos = false, alter = false, dupsop = false;
	struct sem_queue queue;
	unsigned long dup = 0;
	ktime_t expires, *exp = NULL;
	bool timed_out = false;

	if (nsops < 1 || semid < 0)
		return -EINVAL;
	if (nsops > ns->sc_semopm)
		return -E2BIG;

	if (timeout) {
		if (!timespec64_valid(timeout))
			return -EINVAL;
		expires = ktime_add_safe(ktime_get(),
				timespec64_to_ktime(*timeout));
		exp = &expires;
	}

	/*
	 * Single pass over the operations: find the highest semaphore index
	 * and note whether any op uses SEM_UNDO, alters a value, or may
	 * touch a semaphore that an earlier altering op already touched.
	 */
	max = 0;
	for (sop = sops; sop < sops + nsops; sop++) {
		unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);

		if (sop->sem_num >= max)
			max = sop->sem_num;
		if (sop->sem_flg & SEM_UNDO)
			undos = true;
		if (dup & mask) {
			/*
			 * There was a previous alter access that appears
			 * to have accessed the same semaphore, thus use
			 * the dupsop logic. "appears", because the detection
			 * can only check % BITS_PER_LONG.
			 */
			dupsop = true;
		}
		if (sop->sem_op != 0) {
			alter = true;
			dup |= mask;
		}
	}

	if (undos) {
		/* On success, find_alloc_undo takes the rcu_read_lock */
		un = find_alloc_undo(ns, semid);
		if (IS_ERR(un)) {
			error = PTR_ERR(un);
			goto out;
		}
	} else {
		un = NULL;
		rcu_read_lock();
	}

	sma = sem_obtain_object_check(ns, semid);
	if (IS_ERR(sma)) {
		rcu_read_unlock();
		error = PTR_ERR(sma);
		goto out;
	}

	error = -EFBIG;
	if (max >= sma->sem_nsems) {
		rcu_read_unlock();
		goto out;
	}

	error = -EACCES;
	if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
		rcu_read_unlock();
		goto out;
	}

	error = security_sem_semop(&sma->sem_perm, sops, nsops, alter);
	if (error) {
		rcu_read_unlock();
		goto out;
	}

	error = -EIDRM;
	locknum = sem_lock(sma, sops, nsops);
	/*
	 * We eventually might perform the following check in a lockless
	 * fashion, considering ipc_valid_object() locking constraints.
	 * If nsops == 1 and there is no contention for sem_perm.lock, then
	 * only a per-semaphore lock is held and it's OK to proceed with the
	 * check below. More details on the fine grained locking scheme
	 * entangled here and why it's RMID race safe on comments at sem_lock()
	 */
	if (!ipc_valid_object(&sma->sem_perm))
		goto out_unlock;
	/*
	 * semid identifiers are not unique - find_alloc_undo may have
	 * allocated an undo structure, it was invalidated by an RMID
	 * and now a new array has received the same id. Check and fail.
	 * This case can be detected checking un->semid. The existence of
	 * "un" itself is guaranteed by rcu.
	 */
	if (un && un->semid == -1)
		goto out_unlock;

	queue.sops = sops;
	queue.nsops = nsops;
	queue.undo = un;
	queue.pid = task_tgid(current);
	queue.alter = alter;
	queue.dupsop = dupsop;

	error = perform_atomic_semop(sma, &queue);
	if (error == 0) { /* non-blocking successful path */
		DEFINE_WAKE_Q(wake_q);

		/*
		 * If the operation was successful, then do
		 * the required updates.
		 */
		if (alter)
			do_smart_update(sma, sops, nsops, 1, &wake_q);
		else
			set_semotime(sma, sops);

		sem_unlock(sma, locknum);
		rcu_read_unlock();
		wake_up_q(&wake_q);

		goto out;
	}
	if (error < 0) /* non-blocking error path */
		goto out_unlock;

	/*
	 * We need to sleep on this operation, so we put the current
	 * task into the pending queue and go to sleep.
	 */
	if (nsops == 1) {
		struct sem *curr;
		int idx = array_index_nospec(sops->sem_num, sma->sem_nsems);
		curr = &sma->sems[idx];

		if (alter) {
			if (sma->complex_count) {
				list_add_tail(&queue.list,
						&sma->pending_alter);
			} else {

				list_add_tail(&queue.list,
						&curr->pending_alter);
			}
		} else {
			list_add_tail(&queue.list, &curr->pending_const);
		}
	} else {
		if (!sma->complex_count)
			merge_queues(sma);

		if (alter)
			list_add_tail(&queue.list, &sma->pending_alter);
		else
			list_add_tail(&queue.list, &sma->pending_const);

		sma->complex_count++;
	}

	do {
		/* memory ordering ensured by the lock in sem_lock() */
		WRITE_ONCE(queue.status, -EINTR);
		queue.sleeper = current;

		/* memory ordering is ensured by the lock in sem_lock() */
		__set_current_state(TASK_INTERRUPTIBLE);
		sem_unlock(sma, locknum);
		rcu_read_unlock();

		timed_out = !schedule_hrtimeout_range(exp,
				current->timer_slack_ns, HRTIMER_MODE_ABS);

		/*
		 * fastpath: the semop has completed, either successfully or
		 * not, from the syscall pov, is quite irrelevant to us at this
		 * point; we're done.
		 *
		 * We _do_ care, nonetheless, about being awoken by a signal or
		 * spuriously.  The queue.status is checked again in the
		 * slowpath (aka after taking sem_lock), such that we can detect
		 * scenarios where we were awakened externally, during the
		 * window between wake_q_add() and wake_up_q().
		 */
		rcu_read_lock();
		error = READ_ONCE(queue.status);
		if (error != -EINTR) {
			/* see SEM_BARRIER_2 for purpose/pairing */
			smp_acquire__after_ctrl_dep();
			rcu_read_unlock();
			goto out;
		}

		locknum = sem_lock(sma, sops, nsops);

		if (!ipc_valid_object(&sma->sem_perm))
			goto out_unlock;

		/*
		 * No necessity for any barrier: We are protected by sem_lock()
		 */
		error = READ_ONCE(queue.status);

		/*
		 * If queue.status != -EINTR we are woken up by another process.
		 * Leave without unlink_queue(), but with sem_unlock().
		 */
		if (error != -EINTR)
			goto out_unlock;

		/*
		 * If an interrupt occurred we have to clean up the queue.
		 */
		if (timed_out)
			error = -EAGAIN;
	} while (error == -EINTR && !signal_pending(current)); /* spurious */

	unlink_queue(sma, &queue);

out_unlock:
	sem_unlock(sma, locknum);
	rcu_read_unlock();
out:
	return error;
}

/*
 * do_semtimedop - copy the operation array from user space and run it.
 *
 * Up to SEMOPM_FAST operations are staged in an on-stack buffer; larger
 * requests use kvmalloc_array().  @nsops is bounded by ns->sc_semopm before
 * anything is allocated.
 */
static long do_semtimedop(int semid, struct sembuf __user *tsops,
			  unsigned nsops, const struct timespec64 *timeout)
{
	struct sembuf fast_sops[SEMOPM_FAST];
	struct sembuf *sops = fast_sops;
	struct ipc_namespace *ns;
	int ret;

	ns = current->nsproxy->ipc_ns;
	if (nsops > ns->sc_semopm)
		return -E2BIG;
	if (nsops < 1)
		return -EINVAL;

	if (nsops > SEMOPM_FAST) {
		sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
		if (sops == NULL)
			return -ENOMEM;
	}

	if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
		ret =  -EFAULT;
		goto out_free;
	}

	ret = __do_semtimedop(semid, sops, nsops, timeout, ns);

out_free:
	if (sops != fast_sops)
		kvfree(sops);

	return ret;
}

/*
 * Reads the optional user-space timeout (-EFAULT if that fails) and calls
 * do_semtimedop().
 */
long ksys_semtimedop(int semid, struct sembuf __user *tsops,
		     unsigned int nsops, const struct __kernel_timespec __user *timeout)
{
	if (timeout) {
		struct timespec64 ts;
		if (get_timespec64(&ts, timeout))
			return -EFAULT;
		return do_semtimedop(semid, tsops, nsops, &ts);
	}
	return do_semtimedop(semid, tsops, nsops, NULL);
}

SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
		unsigned int, nsops, const struct __kernel_timespec __user *, timeout)
{
	return ksys_semtimedop(semid, tsops, nsops, timeout);
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/* Same as ksys_semtimedop(), but the timeout is a struct old_timespec32. */
long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems,
			    unsigned int nsops,
			    const struct old_timespec32 __user *timeout)
{
	if (timeout) {
		struct timespec64 ts;
		if (get_old_timespec32(&ts, timeout))
			return -EFAULT;
		return do_semtimedop(semid, tsems, nsops, &ts);
	}
	return do_semtimedop(semid, tsems, nsops, NULL);
}

SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems,
		       unsigned int, nsops,
		       const struct old_timespec32 __user *, timeout)
{
	return compat_ksys_semtimedop(semid, tsems, nsops, timeout);
}
#endif

/* semop(2): semtimedop without a timeout. */
SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
		unsigned, nsops)
{
	return do_semtimedop(semid, tsops, nsops, NULL);
}

/* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between
 * parent and child tasks.
 *
 * Without CLONE_SYSVSEM the child starts with no undo list.  Returns 0, or
 * the error from get_undo_list().
 */

int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
{
	struct sem_undo_list *undo_list;
	int error;

	if (clone_flags & CLONE_SYSVSEM) {
		error = get_undo_list(&undo_list);
		if (error)
			return error;
		refcount_inc(&undo_list->refcnt);
		tsk->sysvsem.undo_list = undo_list;
	} else
		tsk->sysvsem.undo_list = NULL;

	return 0;
}

/*
 * add semadj values to semaphores, free undo structures.
 * undo structures are not freed when semaphore arrays are destroyed
 * so some of them may be out of date.
 * IMPLEMENTATION NOTE: There is some confusion over whether the
 * set of adjustments that needs to be done should be done in an atomic
 * manner or not. That is, if we are attempting to decrement the semval
 * should we queue up and wait until we can do so legally?
 * The original implementation attempted to do this (queue and wait).
 * The current implementation does not do so. The POSIX standard
 * and SVID should be consulted to determine what behavior is mandated.
*/
/*
 * Detaches @tsk from its undo list.  Only when this drops the last
 * reference are the entries processed: each loop iteration takes the first
 * entry of the list, applies its adjustments to the owning array and frees
 * it, until the list is empty; then the list itself is freed.
 */
void exit_sem(struct task_struct *tsk)
{
	struct sem_undo_list *ulp;

	ulp = tsk->sysvsem.undo_list;
	if (!ulp)
		return;
	tsk->sysvsem.undo_list = NULL;

	if (!refcount_dec_and_test(&ulp->refcnt))
		return;

	for (;;) {
		struct sem_array *sma;
		struct sem_undo *un;
		int semid, i;
		DEFINE_WAKE_Q(wake_q);

		cond_resched();

		rcu_read_lock();
		un = list_entry_rcu(ulp->list_proc.next,
				    struct sem_undo, list_proc);
		/* the list head pointing back at itself means the list is empty */
		if (&un->list_proc == &ulp->list_proc) {
			/*
			 * We must wait for freeary() before freeing this ulp,
			 * in case we raced with last sem_undo. There is a small
			 * possibility where we exit while freeary() didn't
			 * finish unlocking sem_undo_list.
			 */
			spin_lock(&ulp->lock);
			spin_unlock(&ulp->lock);
			rcu_read_unlock();
			break;
		}
		spin_lock(&ulp->lock);
		semid = un->semid;
		spin_unlock(&ulp->lock);

		/* exit_sem raced with IPC_RMID, nothing to do */
		if (semid == -1) {
			rcu_read_unlock();
			continue;
		}

		sma = sem_obtain_object_check(tsk->nsproxy->ipc_ns, semid);
		/* exit_sem raced with IPC_RMID, nothing to do */
		if (IS_ERR(sma)) {
			rcu_read_unlock();
			continue;
		}

		sem_lock(sma, NULL, -1);
		/* exit_sem raced with IPC_RMID, nothing to do */
		if (!ipc_valid_object(&sma->sem_perm)) {
			sem_unlock(sma, -1);
			rcu_read_unlock();
			continue;
		}
		/* look the entry up again now that the array is locked */
		un = __lookup_undo(ulp, semid);
		if (un == NULL) {
			/* exit_sem raced with IPC_RMID+semget() that created
			 * exactly the same semid. Nothing to do.
			 */
			sem_unlock(sma, -1);
			rcu_read_unlock();
			continue;
		}

		/* remove un from the linked lists */
		ipc_assert_locked_object(&sma->sem_perm);
		list_del(&un->list_id);

		spin_lock(&ulp->lock);
		list_del_rcu(&un->list_proc);
		spin_unlock(&ulp->lock);

		/* perform adjustments registered in un */
		for (i = 0; i < sma->sem_nsems; i++) {
			struct sem *semaphore = &sma->sems[i];
			if (un->semadj[i]) {
				semaphore->semval += un->semadj[i];
				/*
				 * Range checks of the new semaphore value,
				 * not defined by sus:
				 * - Some unices ignore the undo entirely
				 *   (e.g. HP UX 11i 11.22, Tru64 V5.1)
				 * - some cap the value (e.g. FreeBSD caps
				 *   at 0, but doesn't enforce SEMVMX)
				 *
				 * Linux caps the semaphore value, both at 0
				 * and at SEMVMX.
				 *
				 *	Manfred <manfred@colorfullife.com>
				 */
				if (semaphore->semval < 0)
					semaphore->semval = 0;
				if (semaphore->semval > SEMVMX)
					semaphore->semval = SEMVMX;
				ipc_update_pid(&semaphore->sempid, task_tgid(current));
			}
		}
		/* maybe some queued-up processes were waiting for this */
		do_smart_update(sma, NULL, 0, 1, &wake_q);
		sem_unlock(sma, -1);
		rcu_read_unlock();
		wake_up_q(&wake_q);

		kvfree_rcu(un, rcu);
	}
	kfree(ulp);
}

#ifdef CONFIG_PROC_FS
/*
 * seq_file show callback: prints one line per semaphore array with key, id,
 * mode, nsems, the owner and creator uid/gid (mapped into the reader's user
 * namespace), and the sem_otime and sem_ctime timestamps.  @it is the
 * array's kern_ipc_perm.
 */
static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
{
	struct user_namespace *user_ns = seq_user_ns(s);
	struct kern_ipc_perm *ipcp = it;
	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
	time64_t sem_otime;

	/*
	 * The proc interface isn't aware of sem_lock(), it calls
	 * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock).
	 * (in sysvipc_find_ipc)
	 * In order to stay compatible with sem_lock(), we must
	 * enter / leave complex_mode.
	 */
	complexmode_enter(sma);

	sem_otime = get_semotime(sma);

	seq_printf(s,
		   "%10d %10d %4o %10u %5u %5u %5u %5u %10llu %10llu\n",
		   sma->sem_perm.key,
		   sma->sem_perm.id,
		   sma->sem_perm.mode,
		   sma->sem_nsems,
		   from_kuid_munged(user_ns, sma->sem_perm.uid),
		   from_kgid_munged(user_ns, sma->sem_perm.gid),
		   from_kuid_munged(user_ns, sma->sem_perm.cuid),
		   from_kgid_munged(user_ns, sma->sem_perm.cgid),
		   sem_otime,
		   sma->sem_ctime);

	complexmode_tryleave(sma);

	return 0;
}
#endif
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
// SPDX-License-Identifier: GPL-2.0
/*
 * Hibernation support for x86
 *
 * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
 * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
 */
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/suspend.h>
#include <linux/scatterlist.h>
#include <linux/kdebug.h>
#include <linux/cpu.h>
#include <linux/pgtable.h>
#include <linux/types.h>
#include <linux/crc32.h>

#include <asm/e820/api.h>
#include <asm/init.h>
#include <asm/proto.h>
#include <asm/page.h>
#include <asm/mtrr.h>
#include <asm/sections.h>
#include <asm/suspend.h>
#include <asm/tlbflush.h>

/*
 * Address to jump to in the last phase of restore in order to get to the image
 * kernel's text (this value is passed in the image header).
 */
unsigned long restore_jump_address __visible;
unsigned long jump_address_phys;

/*
 * Value of the cr3 register from before the hibernation (this value is passed
 * in the image header).
 */
unsigned long restore_cr3 __visible;
unsigned long temp_pgt __visible;
/* Address of the safe page that relocate_restore_code() copies the code to. */
unsigned long relocated_restore_code __visible;

/**
 *	pfn_is_nosave - check if given pfn is in the 'nosave' section
 *	@pfn: the page frame number to check.
 *
 *	The range runs from the frame holding __nosave_begin up to, but not
 *	including, the frame at __nosave_end rounded up to a page boundary.
 *
 *	Return: non-zero if @pfn lies inside that range, 0 otherwise.
 */
int pfn_is_nosave(unsigned long pfn)
{
	unsigned long nosave_begin_pfn;
	unsigned long nosave_end_pfn;

	nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
	nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;

	return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn;
}

/*
 * Architecture specific part of the hibernation image header, written by
 * arch_hibernation_header_save() and read back by
 * arch_hibernation_header_restore().
 */
struct restore_data_record {
	unsigned long jump_address;	/* address of restore_registers */
	unsigned long jump_address_phys; /* __pa_symbol() of the same symbol */
	unsigned long cr3;		/* restore_cr3 with the PCID bits cleared */
	unsigned long magic;		/* RESTORE_MAGIC */
	unsigned long e820_checksum;	/* crc32 of e820_table_firmware */
};

/**
 * compute_e820_crc32 - calculate crc32 of a given e820 table
 *
 * @table: the e820 table to be calculated
 *
 * The checksum covers the table header (everything up to the entries array)
 * plus the first table->nr_entries entries.
 *
 * Return: the resulting checksum
 */
static inline u32 compute_e820_crc32(struct e820_table *table)
{
	int size = offsetof(struct e820_table, entries) +
		sizeof(struct e820_entry) * table->nr_entries;

	return ~crc32_le(~0, (unsigned char const *)table, size);
}

/* Header format marker; the value differs between 64-bit and 32-bit builds. */
#ifdef CONFIG_X86_64
#define RESTORE_MAGIC	0x23456789ABCDEF02UL
#else
#define RESTORE_MAGIC	0x12345679UL
#endif

/**
 * arch_hibernation_header_save - populate the architecture specific part
 *	of a hibernation image header
 * @addr: address where architecture specific header data will be saved.
 * @max_size: maximum size of architecture specific data in hibernation header.
 *
 * Return: 0 on success, -EOVERFLOW if max_size is insufficient.
 */
int arch_hibernation_header_save(void *addr, unsigned int max_size)
{
	struct restore_data_record *rdr = addr;

	if (max_size < sizeof(struct restore_data_record))
		return -EOVERFLOW;
	rdr->magic = RESTORE_MAGIC;
	rdr->jump_address = (unsigned long)restore_registers;
	rdr->jump_address_phys = __pa_symbol(restore_registers);

	/*
	 * The restore code fixes up CR3 and CR4 in the following sequence:
	 *
	 * [in hibernation asm]
	 * 1. CR3 <= temporary page tables
	 * 2. CR4 <= mmu_cr4_features (from the kernel that restores us)
	 * 3. CR3 <= rdr->cr3
	 * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel)
	 * [in restore_processor_state()]
	 * 5. CR4 <= saved CR4
	 * 6. CR3 <= saved CR3
	 *
	 * Our mmu_cr4_features has CR4.PCIDE=0, and toggling
	 * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so
	 * rdr->cr3 needs to point to valid page tables but must not
	 * have any of the PCID bits set.
	 */
	rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;

	rdr->e820_checksum = compute_e820_crc32(e820_table_firmware);
	return 0;
}

/**
 * arch_hibernation_header_restore - read the architecture specific data
 *	from the hibernation image header
 * @addr: address to read the data from
 *
 * The jump addresses and cr3 value are copied into the globals above before
 * the e820 checksum is compared, so they are already set when -ENODEV is
 * returned.
 *
 * Return: 0 on success, -EINVAL if the magic does not match, -ENODEV if the
 * stored e820 checksum differs from that of the current firmware table.
 */
int arch_hibernation_header_restore(void *addr)
{
	struct restore_data_record *rdr = addr;

	if (rdr->magic != RESTORE_MAGIC) {
		pr_crit("Unrecognized hibernate image header format!\n");
		return -EINVAL;
	}

	restore_jump_address = rdr->jump_address;
	jump_address_phys = rdr->jump_address_phys;
	restore_cr3 = rdr->cr3;

	if (rdr->e820_checksum != compute_e820_crc32(e820_table_firmware)) {
		pr_crit("Hibernate inconsistent memory map detected!\n");
		return -ENODEV;
	}

	return 0;
}

/*
 * relocate_restore_code - copy core_restore_code to a safe page and make
 * that page executable.
 *
 * One page (PAGE_SIZE bytes) is copied.  The page tables referenced by the
 * current CR3 are then walked for the new address and _PAGE_NX is cleared on
 * the first leaf entry found (p4d, pud, pmd, or finally the pte), followed
 * by a full TLB flush.
 *
 * Return: 0 on success, -ENOMEM if no safe page could be obtained.
 */
int relocate_restore_code(void)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	relocated_restore_code = get_safe_page(GFP_ATOMIC);
	if (!relocated_restore_code)
		return -ENOMEM;

	__memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);

	/* Make the page containing the relocated code executable */
	pgd = (pgd_t *)__va(read_cr3_pa()) +
		pgd_index(relocated_restore_code);
	p4d = p4d_offset(pgd, relocated_restore_code);
	if (p4d_leaf(*p4d)) {
		set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
		goto out;
	}
	pud = pud_offset(p4d, relocated_restore_code);
	if (pud_leaf(*pud)) {
		set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
		goto out;
	}
	pmd = pmd_offset(pud, relocated_restore_code);
	if (pmd_leaf(*pmd)) {
		set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
		goto out;
	}
	pte = pte_offset_kernel(pmd, relocated_restore_code);
	set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
out:
	__flush_tlb_all();
	return 0;
}

/*
 * arch_resume_nosmt - re-apply a disabled SMT state after resume.
 *
 * CPU hotplug is enabled for the duration of the call and disabled again on
 * every exit path.  Nothing else is done unless cpu_smt_control is
 * CPU_SMT_DISABLED or CPU_SMT_FORCE_DISABLED.
 *
 * Return: 0 on success, or the error from cpuhp_smt_enable() or
 * cpuhp_smt_disable().
 */
int arch_resume_nosmt(void)
{
	int ret = 0;
	/*
	 * We reached this while coming out of hibernation. This means
	 * that SMT siblings are sleeping in hlt, as mwait is not safe
	 * against control transition during resume (see comment in
	 * hibernate_resume_nonboot_cpu_disable()).
	 *
	 * If the resumed kernel has SMT disabled, we have to take all the
	 * SMT siblings out of hlt, and offline them again so that they
	 * end up in mwait proper.
	 *
	 * Called with hotplug disabled.
	 */
	cpu_hotplug_enable();
	if (cpu_smt_control == CPU_SMT_DISABLED ||
			cpu_smt_control == CPU_SMT_FORCE_DISABLED) {
		/* remember which of the two disabled states to go back to */
		enum cpuhp_smt_control old = cpu_smt_control;

		ret = cpuhp_smt_enable();
		if (ret)
			goto out;
		ret = cpuhp_smt_disable(old);
		if (ret)
			goto out;
	}
out:
	cpu_hotplug_disable();
	return ret;
}
42 5 14 6 30 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_HUGETLB_H
#define _ASM_GENERIC_HUGETLB_H

#include <linux/swap.h>
#include <linux/swapops.h>

/*
 * Generic huge-page PTE helpers.
 *
 * Each helper forwards to the pte-level primitive of the matching name.
 * Helpers wrapped in "#ifndef __HAVE_ARCH_..." are defaults: when the macro
 * is already defined at the point this header is included, the default is
 * left out.  The first few helpers have no such guard and are always
 * defined here.
 */

static inline unsigned long huge_pte_write(pte_t pte)
{
	return pte_write(pte);
}

static inline unsigned long huge_pte_dirty(pte_t pte)
{
	return pte_dirty(pte);
}

static inline pte_t huge_pte_mkwrite(pte_t pte)
{
	return pte_mkwrite_novma(pte);
}

#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
static inline pte_t huge_pte_wrprotect(pte_t pte)
{
	return pte_wrprotect(pte);
}
#endif

static inline pte_t huge_pte_mkdirty(pte_t pte)
{
	return pte_mkdirty(pte);
}

static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
{
	return pte_modify(pte, newprot);
}

#ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP
/* Sets the uffd-wp marker and additionally write-protects the entry. */
static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
{
	return huge_pte_wrprotect(pte_mkuffd_wp(pte));
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP
static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
{
	return pte_clear_uffd_wp(pte);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP
static inline int huge_pte_uffd_wp(pte_t pte)
{
	return pte_uffd_wp(pte);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR
/* The generic version ignores @sz. */
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, unsigned long sz)
{
	pte_clear(mm, addr, ptep);
}
#endif

#ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
		unsigned long addr, unsigned long end,
		unsigned long floor, unsigned long ceiling)
{
	free_pgd_range(tlb, addr, end, floor, ceiling);
}
#endif

#ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
/* The generic version ignores @sz. */
static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		pte_t *ptep, pte_t pte, unsigned long sz)
{
	set_pte_at(mm, addr, ptep, pte);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
/* The generic version ignores @sz. */
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
		unsigned long addr, pte_t *ptep, unsigned long sz)
{
	return ptep_get_and_clear(mm, addr, ptep);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
		unsigned long addr, pte_t *ptep)
{
	return ptep_clear_flush(vma, addr, ptep);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTE_NONE
static inline int huge_pte_none(pte_t pte)
{
	return pte_none(pte);
}
#endif

/* Please refer to comments above pte_none_mostly() for the usage */
#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY
/* True for an empty entry and also for a pte marker. */
static inline int huge_pte_none_mostly(pte_t pte)
{
	return huge_pte_none(pte) || is_pte_marker(pte);
}
#endif

#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
/* The generic version performs no checks and always returns 0. */
static inline int prepare_hugepage_range(struct file *file,
		unsigned long addr, unsigned long len)
{
	return 0;
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
		unsigned long addr, pte_t *ptep)
{
	ptep_set_wrprotect(mm, addr, ptep);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
		unsigned long addr, pte_t *ptep,
		pte_t pte, int dirty)
{
	return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_GET
/* The generic version ignores @mm and @addr and just reads *@ptep. */
static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	return ptep_get(ptep);
}
#endif

#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
/* Compile-time answer: true iff CONFIG_ARCH_HAS_GIGANTIC_PAGE is enabled. */
static inline bool gigantic_page_runtime_supported(void)
{
	return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE);
}
#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */

#endif /* _ASM_GENERIC_HUGETLB_H */
1987 1986 1390 1987 1986 1986 23216 23210 19162 11588 11582 21047 21030 18654 6220 2422 4746 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0-or-later #define pr_fmt(fmt) "ref_tracker: " fmt #include <linux/export.h> #include <linux/list_sort.h> #include <linux/ref_tracker.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 struct ref_tracker { struct list_head head; /* anchor into dir->list or dir->quarantine */ bool dead; depot_stack_handle_t alloc_stack_handle; depot_stack_handle_t free_stack_handle; }; struct ref_tracker_dir_stats { int total; int count; struct { depot_stack_handle_t stack_handle; unsigned int count; } stacks[]; }; static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), GFP_NOWAIT | __GFP_NOWARN); if (!stats) return 
ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { char *buf; int size, used; }; #define pr_ostream(stream, fmt, args...) \ ({ \ struct ostream *_s = (stream); \ \ if (!_s->buf) { \ pr_err(fmt, ##args); \ } else { \ int ret, len = _s->size - _s->used; \ ret = snprintf(_s->buf + _s->used, len, pr_fmt(fmt), ##args); \ _s->used += min(ret, len); \ } \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n", dir->name, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n", dir->name, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = {}; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void 
ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); 
tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT | __GFP_NOWARN); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free);
8786 669 944 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 /* * include/linux/topology.h * * Written by: Matthew Dobson, IBM Corporation * * Copyright (C) 2002, IBM Corp. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Send feedback to <colpatch@us.ibm.com> */ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H #include <linux/arch_topology.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/bitops.h> #include <linux/mmzone.h> #include <linux/smp.h> #include <linux/percpu.h> #include <asm/topology.h> #ifndef nr_cpus_node #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 #define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif /* * The following tunable allows platforms to override the default node * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are * sufficiently fast that the default value actually hurts * performance. * * AMD EPYC machines use this because even though the 2-hop distance * is 32 (3.2x slower than a local memory access) performance actually * *improves* if allowed to reclaim memory and load balance tasks * between NUMA nodes 2-hops apart. */ extern int __read_mostly node_reclaim_distance; #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); #ifndef numa_node_id /* Returns the number of the current Node. 
*/ static inline int numa_node_id(void) { return raw_cpu_read(numa_node); } #endif #ifndef cpu_to_node static inline int cpu_to_node(int cpu) { return per_cpu(numa_node, cpu); } #endif #ifndef set_numa_node static inline void set_numa_node(int node) { this_cpu_write(numa_node, node); } #endif #ifndef set_cpu_numa_node static inline void set_cpu_numa_node(int cpu, int node) { per_cpu(numa_node, cpu) = node; } #endif #else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */ /* Returns the number of the current Node. */ #ifndef numa_node_id static inline int numa_node_id(void) { return cpu_to_node(raw_smp_processor_id()); } #endif #endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); } #endif #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return raw_cpu_read(_numa_mem_); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return per_cpu(_numa_mem_, cpu); } #endif #ifndef set_cpu_numa_mem static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; } #endif #else /* !CONFIG_HAVE_MEMORYLESS_NODES */ #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return numa_node_id(); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return cpu_to_node(cpu); } #endif #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ #if defined(topology_die_id) && defined(topology_die_cpumask) #define TOPOLOGY_DIE_SYSFS #endif #if defined(topology_cluster_id) && defined(topology_cluster_cpumask) #define TOPOLOGY_CLUSTER_SYSFS #endif #if 
defined(topology_book_id) && defined(topology_book_cpumask) #define TOPOLOGY_BOOK_SYSFS #endif #if defined(topology_drawer_id) && defined(topology_drawer_cpumask) #define TOPOLOGY_DRAWER_SYSFS #endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_cluster_id #define topology_cluster_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif #ifndef topology_book_id #define topology_book_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_drawer_id #define topology_drawer_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_ppin #define topology_ppin(cpu) ((void)(cpu), 0ull) #endif #ifndef topology_sibling_cpumask #define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_cluster_cpumask #define topology_cluster_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_book_cpumask #define topology_book_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_drawer_cpumask #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } #endif #ifndef topology_is_primary_thread static inline bool topology_is_primary_thread(unsigned int cpu) { /* * When disabling SMT, the primary thread of the SMT will remain * enabled/active. Architectures that have a special primary thread * (e.g. x86) need to override this function. Otherwise the first * thread in the SMT can be made the primary thread. 
* * The sibling cpumask of an offline CPU always contains the CPU * itself on architectures using the implementation of * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology. * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for * building their topology have to check whether to use this default * implementation or to override it. */ return cpu == cpumask_first(topology_sibling_cpumask(cpu)); } #define topology_is_primary_thread topology_is_primary_thread #endif static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } #ifdef CONFIG_NUMA int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node); extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops); #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * sched_numa_hop_mask(unsigned int node, unsigned int hops) { return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NUMA */ /** * for_each_node_numadist() - iterate over nodes in increasing distance * order, starting from a given node * @node: the iteration variable and the starting node. * @unvisited: a nodemask to keep track of the unvisited nodes. * * This macro iterates over NUMA node IDs in increasing distance from the * starting @node and yields MAX_NUMNODES when all the nodes have been * visited. * * Note that by the time the loop completes, the @unvisited nodemask will * be fully cleared, unless the loop exits early. * * The difference between for_each_node() and for_each_node_numadist() is * that the former allows to iterate over nodes in numerical order, whereas * the latter iterates over nodes in increasing order of distance. * * This complexity of this iterator is O(N^2), where N represents the * number of nodes, as each iteration involves scanning all nodes to * find the one with the shortest distance. 
* * Requires rcu_lock to be held. */ #define for_each_node_numadist(node, unvisited) \ for (int __start = (node), \ (node) = nearest_node_nodemask((__start), &(unvisited)); \ (node) < MAX_NUMNODES; \ node_clear((node), (unvisited)), \ (node) = nearest_node_nodemask((__start), &(unvisited))) /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. * @node: the NUMA node to start the search from. * * Requires rcu_lock to be held. * * Yields cpu_online_mask for @node == NUMA_NO_NODE. */ #define for_each_numa_hop_mask(mask, node) \ for (unsigned int __hops = 0; \ mask = (node != NUMA_NO_NODE || __hops) ? \ sched_numa_hop_mask(node, __hops) : \ cpu_online_mask, \ !IS_ERR_OR_NULL(mask); \ __hops++) DECLARE_PER_CPU(unsigned long, cpu_scale); static inline unsigned long topology_get_cpu_scale(int cpu) { return per_cpu(cpu_scale, cpu); } void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); #endif /* _LINUX_TOPOLOGY_H */
4375 98 4435 5 3602 3793 805 4372 113 1 1 1 805 611 180 4375 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H #include <linux/blk-mq.h> #include "blk-stat.h" struct blk_mq_tag_set; struct blk_mq_ctxs { struct kobject kobj; struct blk_mq_ctx __percpu *queue_ctx; }; 
/** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; struct list_head rq_lists[HCTX_MAX_TYPES]; } ____cacheline_aligned_in_smp; unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; #define BLK_MQ_CPU_WORK_BATCH (8) typedef unsigned int __bitwise blk_insert_t; #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); /* * CPU -> queue mappings */ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); /* * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue * @q: request queue * @type: the hctx type index * @cpu: CPU */ static inline struct blk_mq_hw_ctx 
*blk_mq_map_queue_type(struct request_queue *q, enum hctx_type type, unsigned int cpu) { return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); int blk_mq_sysfs_register(struct gendisk *disk); void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { return per_cpu_ptr(q->queue_ctx, cpu); } /* * This assumes per-cpu software queueing queues. They could be per-node * as well, for instance. For now this is hardcoded as-is. Note that we don't * care about preemption, since we know the ctx's are persistent. This does * mean that we can't rely on ctx always matching the currently running CPU. 
*/ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; blk_opf_t cmd_flags; req_flags_t rq_flags; /* allocate multiple requests/tags in one go */ unsigned int nr_tags; struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset); void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & 
BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->rq_flags & RQF_SCHED_TAGS) return data->hctx->sched_tags; return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { /* Fast path: hardware queue is not stopped most of the time. */ if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return false; /* * This barrier is used to order adding of dispatch list before and * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier * in blk_mq_start_stopped_hw_queue() so that dispatch code could * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not * empty to avoid missing dispatching requests. */ smp_mb(); return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; } void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) { if (q->mq_ops->put_budget) q->mq_ops->put_budget(q, budget_token); } static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q); return 0; } static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) { if (token < 0) return; if (rq->q->mq_ops->set_rq_budget_token) rq->q->mq_ops->set_rq_budget_token(rq, token); } static inline int blk_mq_get_rq_budget_token(struct request *rq) { if (rq->q->mq_ops->get_rq_budget_token) return rq->q->mq_ops->get_rq_budget_token(rq); return -1; } static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if 
(blk_mq_is_shared_tags(hctx->flags)) atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_add(val, &hctx->nr_active); } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_sub(val, &hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_sub_active_requests(hctx, 1); } static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_add_active_requests(hctx, val); } static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_inc_active_requests(hctx); } static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_sub_active_requests(hctx, val); } static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_dec_active_requests(hctx); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; } static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) return; __blk_mq_put_driver_tag(rq->mq_hctx, rq); } bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { if 
(rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) qmap->mq_map[cpu] = 0; } /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { while (!list_empty(list)) { struct request *rq = list_entry_rq(list->next); list_del_init(&rq->queuelist); blk_mq_free_request(rq); } } /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { unsigned int depth, users; if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return true; /* * Don't try dividing an ant */ if (bt->sb.depth == 1) return true; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; } users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; /* * Allow at least some tags */ depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth; } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock(__tag_set->srcu); \ (dispatch_ops); \ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ } else { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ } \ } while (0) #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ static inline bool blk_mq_can_poll(struct request_queue *q) { return (q->limits.features & BLK_FEAT_POLL) && 
q->tag_set->map[HCTX_TYPE_POLL].nr_queues; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 /* SPDX-License-Identifier: GPL-2.0 */ /* Interface for implementing AF_XDP zero-copy support in drivers. * Copyright(c) 2020 Intel Corporation. 
*/ #ifndef _LINUX_XDP_SOCK_DRV_H #define _LINUX_XDP_SOCK_DRV_H #include <net/xdp_sock.h> #include <net/xsk_buff_pool.h> #define XDP_UMEM_MIN_CHUNK_SHIFT 11 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) struct xsk_cb_desc { void *src; u8 off; u8 bytes; }; #ifdef CONFIG_XDP_SOCKETS void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool); bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool); static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return XDP_PACKET_HEADROOM + pool->headroom; } static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return pool->chunk_size; } static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { xp_set_rxq_info(pool, rxq); } static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) { xp_fill_cb(pool, desc); } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { xp_dma_unmap(pool, attrs); } static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs) { struct xdp_umem *umem = pool->umem; return xp_dma_map(pool, dev, attrs, umem->pgs, umem->npgs); } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, 
xdp); return xp_get_dma(xskb); } static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); return xp_get_frame_dma(xskb); } static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return xp_alloc(pool); } static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return !xp_mb_desc(desc); } /* Returns as many entries as possible up to max. 0 <= N <= max. */ static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return xp_alloc_batch(pool, xdp, max); } static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return xp_can_alloc(pool, count); } static inline void xsk_buff_free(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); struct list_head *xskb_list = &xskb->pool->xskb_list; struct xdp_buff_xsk *pos, *tmp; if (likely(!xdp_buff_has_frags(xdp))) goto out; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { list_del(&pos->list_node); xp_free(pos); } xdp_get_shared_info_from_buff(xdp)->nr_frags = 0; out: xp_free(xskb); } static inline bool xsk_buff_add_frag(struct xdp_buff *head, struct xdp_buff *xdp) { const void *data = xdp->data; struct xdp_buff_xsk *frag; if (!__xdp_buff_add_frag(head, virt_to_netmem(data), offset_in_page(data), xdp->data_end - data, xdp->frame_sz, false)) return false; frag = container_of(xdp, struct xdp_buff_xsk, xdp); list_add_tail(&frag->list_node, &frag->pool->xskb_list); return true; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff *ret = NULL; struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, struct xdp_buff_xsk, list_node); if (frag) { list_del(&frag->list_node); ret = &frag->xdp; } return ret; } static inline void xsk_buff_del_tail(struct xdp_buff 
*tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); list_del(&xskb->list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, list_node); return &frag->xdp; } static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; xdp->data_meta = xdp->data; xdp->data_end = xdp->data + size; xdp->flags = 0; } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_dma(pool, addr); } static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_data(pool, addr); } /** * xsk_buff_raw_get_ctx - get &xdp_desc context * @pool: XSk buff pool desc address belongs to * @addr: desc address (from userspace) * * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for * details. * * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata * pointer, if it is present and valid (initialized to %NULL otherwise). 
*/ static inline struct xdp_desc_ctx xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) { return xp_raw_get_ctx(pool, addr); } #define XDP_TXMD_FLAGS_VALID ( \ XDP_TXMD_FLAGS_TIMESTAMP | \ XDP_TXMD_FLAGS_CHECKSUM | \ XDP_TXMD_FLAGS_LAUNCH_TIME | \ 0) static inline bool xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) { return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } static inline struct xsk_tx_metadata * __xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) { struct xsk_tx_metadata *meta; if (!pool->tx_metadata_len) return NULL; meta = data - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) return NULL; /* no way to signal the error to the user */ return meta; } static inline struct xsk_tx_metadata * xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr)); } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); xp_dma_sync_for_cpu(xskb); } static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { xp_dma_sync_for_device(pool, dma, size); } #else static inline void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { } static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { return false; } static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max) { return 0; } static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } static inline struct xsk_buff_pool * xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } static inline void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { } static inline void xsk_clear_tx_need_wakeup(struct xsk_buff_pool 
*pool) { } static inline bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return false; } static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return 0; } static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return 0; } static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return 0; } static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { } static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc) { } static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { } static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs) { return 0; } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) { return 0; } static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) { return 0; } static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return NULL; } static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return false; } static inline u32 xsk_buff_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) { return 0; } static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; } static inline void xsk_buff_free(struct xdp_buff *xdp) { } static inline bool xsk_buff_add_frag(struct xdp_buff *head, struct xdp_buff *xdp) { return false; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { return NULL; } static inline void xsk_buff_del_tail(struct xdp_buff *tail) { } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { return NULL; } static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { } static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) { return 0; } static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return NULL; } static inline 
struct xdp_desc_ctx xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) { return (struct xdp_desc_ctx){ }; } static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) { return false; } static inline struct xsk_tx_metadata * __xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) { return NULL; } static inline struct xsk_tx_metadata * xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return NULL; } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { } static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { } #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_DRV_H */
23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match FRAG parameters. */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_frag.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 fragment match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); /* Returns 1 if the id is matched by the range, 0 otherwise */ static inline bool id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { bool r; pr_debug("id_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, id, max); r = (id >= min && id <= max) ^ invert; pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); return r; } static bool frag_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct frag_hdr _frag; const struct frag_hdr *fh; const struct ip6t_frag *fraginfo = par->matchinfo; unsigned int ptr = 0; int err; err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL, NULL); if (err < 0) { if (err != -ENOENT) par->hotdrop = true; return false; } fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); if (fh == NULL) { par->hotdrop = true; return false; } pr_debug("INFO %04X ", fh->frag_off); pr_debug("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7); pr_debug("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6); pr_debug("MF %04X ", fh->frag_off & htons(IP6_MF)); pr_debug("ID %u %08X\n", ntohl(fh->identification), ntohl(fh->identification)); pr_debug("IPv6 FRAG id %02X ", id_match(fraginfo->ids[0], fraginfo->ids[1], ntohl(fh->identification), !!(fraginfo->invflags & IP6T_FRAG_INV_IDS))); pr_debug("res %02X %02X%04X %02X ", fraginfo->flags & IP6T_FRAG_RES, fh->reserved, ntohs(fh->frag_off) & 0x6, !((fraginfo->flags & IP6T_FRAG_RES) && (fh->reserved || (ntohs(fh->frag_off) & 0x06)))); pr_debug("first %02X %02X %02X ", fraginfo->flags & IP6T_FRAG_FST, ntohs(fh->frag_off) & ~0x7, !((fraginfo->flags & IP6T_FRAG_FST) && (ntohs(fh->frag_off) & ~0x7))); pr_debug("mf %02X %02X %02X ", fraginfo->flags & IP6T_FRAG_MF, ntohs(fh->frag_off) & IP6_MF, !((fraginfo->flags & IP6T_FRAG_MF) && !((ntohs(fh->frag_off) & IP6_MF)))); pr_debug("last %02X %02X %02X\n", fraginfo->flags & IP6T_FRAG_NMF, ntohs(fh->frag_off) & IP6_MF, !((fraginfo->flags & IP6T_FRAG_NMF) && (ntohs(fh->frag_off) & IP6_MF))); return id_match(fraginfo->ids[0], fraginfo->ids[1], ntohl(fh->identification), !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) && !((fraginfo->flags & IP6T_FRAG_RES) && (fh->reserved || (ntohs(fh->frag_off) & 0x6))) && !((fraginfo->flags & IP6T_FRAG_FST) && (ntohs(fh->frag_off) & ~0x7)) && !((fraginfo->flags & IP6T_FRAG_MF) && !(ntohs(fh->frag_off) & IP6_MF)) && 
!((fraginfo->flags & IP6T_FRAG_NMF) && (ntohs(fh->frag_off) & IP6_MF)); } static int frag_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_frag *fraginfo = par->matchinfo; if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) { pr_debug("unknown flags %X\n", fraginfo->invflags); return -EINVAL; } return 0; } static struct xt_match frag_mt6_reg __read_mostly = { .name = "frag", .family = NFPROTO_IPV6, .match = frag_mt6, .matchsize = sizeof(struct ip6t_frag), .checkentry = frag_mt6_check, .me = THIS_MODULE, }; static int __init frag_mt6_init(void) { return xt_register_match(&frag_mt6_reg); } static void __exit frag_mt6_exit(void) { xt_unregister_match(&frag_mt6_reg); } module_init(frag_mt6_init); module_exit(frag_mt6_exit);
3 22 1 18 3 21 1 10 11 10 10 9 8 8 1 7 1 1 6 9 21 15 19 12 6 19 1 8 17 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 // SPDX-License-Identifier: GPL-2.0-or-later /* * udp_diag.c Module for monitoring UDP transport protocols sockets. 
* * Authors: Pavel Emelyanov, <xemul@parallels.com> */ #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/udp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/sock_diag.h> static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; int err; struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); rcu_read_lock(); if (req->sdiag_family == AF_INET) /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); if (err) goto out; err = -ENOMEM; rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) sock_put(sk); out_nosk: return err; } static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct 
netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct nlattr *bc; cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { struct udp_hslot *hslot = &table->hash[slot]; struct sock *sk; num = 0; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } next: num++; } spin_unlock_bh(&hslot->lock); } done: cb->args[0] = slot; cb->args[1] = num; } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = udp_rqueue_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int __udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req, struct udp_table *tbl) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], 
req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); } #endif else { rcu_read_unlock(); return -EINVAL; } if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); if (!sk) return -ENOENT; if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { sock_put(sk); return -ENOENT; } err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } static int udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); } static int udplite_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, &udplite_table); } #endif static const struct inet_diag_handler udp_diag_handler = { .owner = THIS_MODULE, .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udp_diag_destroy, #endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { .owner = THIS_MODULE, .dump = udplite_diag_dump, .dump_one = 
udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udplite_diag_destroy, #endif }; static int __init udp_diag_init(void) { int err; err = inet_diag_register(&udp_diag_handler); if (err) goto out; err = inet_diag_register(&udplite_diag_handler); if (err) goto out_lite; out: return err; out_lite: inet_diag_unregister(&udp_diag_handler); goto out; } static void __exit udp_diag_exit(void) { inet_diag_unregister(&udplite_diag_handler); inet_diag_unregister(&udp_diag_handler); } module_init(udp_diag_init); module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
15 35 17 54 40 9 13 47 47 23 23 17 6 5 14 14 14 8 6 14 8 4017 3951 216 121 20 5 5 26 1 25 25 5 14 14 12 20 5 25 29 3 1 24 1 1 4 20 10 11 2 20 4 5 24 2 3 1 2 83 1 1 1 13 13 21 14 9 42 4 11 15 17 30 32 13 1 4 1 2 1 2 3 1 2 1 2 1 2 1 1 1 2 1 1 1 1 2 11 1 1 1 2 1 1 1 1 1 2 2 5 1 1 9 7 4 10 25 1 1 1 19 3 2 22 20 1 10 10 10 10 12 18 1 4 9 8 14 1 14 3 11 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 
440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 
940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* raw.c - Raw sockets for protocol family CAN * * Copyright (c) 2002-2007 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. 
* * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/dev.h> /* for can_is_canxl_dev_mtu() */ #include <linux/can/skb.h> #include <linux/can/raw.h> #include <net/sock.h> #include <net/net_namespace.h> MODULE_DESCRIPTION("PF_CAN raw protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>"); MODULE_ALIAS("can-proto-1"); #define RAW_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex) #define MASK_ALL 0 /* A raw socket has a list of can_filters attached to it, each receiving * the CAN frames matching that filter. If the filter list is empty, * no CAN frames will be received by the socket. The default after * opening the socket, is to have one filter which receives all frames. 
* The filter list is allocated dynamically with the exception of the * list containing only one item. This common case is optimized by * storing the single filter in dfilter, to avoid using dynamic memory. */ struct uniqframe { int skbcnt; const struct sk_buff *skb; unsigned int join_rx_count; }; struct raw_sock { struct sock sk; int bound; int ifindex; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head notifier; int loopback; int recv_own_msgs; int fd_frames; int xl_frames; struct can_raw_vcid_options raw_vcid_opts; canid_t tx_vcid_shifted; canid_t rx_vcid_shifted; canid_t rx_vcid_mask_shifted; int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ can_err_mask_t err_mask; struct uniqframe __percpu *uniq; }; static LIST_HEAD(raw_notifier_list); static DEFINE_SPINLOCK(raw_notifier_lock); static struct raw_sock *raw_busy_notifier; /* Return pointer to store the extra msg flags for raw_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' * in skb->cb. 
*/ static inline unsigned int *raw_flags(struct sk_buff *skb) { sock_skb_cb_check_size(sizeof(struct sockaddr_can) + sizeof(unsigned int)); /* return pointer after struct sockaddr_can */ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); } static inline struct raw_sock *raw_sk(const struct sock *sk) { return (struct raw_sock *)sk; } static void raw_rcv(struct sk_buff *oskb, void *data) { struct sock *sk = (struct sock *)data; struct raw_sock *ro = raw_sk(sk); struct sockaddr_can *addr; struct sk_buff *skb; unsigned int *pflags; /* check the received tx sock reference */ if (!ro->recv_own_msgs && oskb->sk == sk) return; /* make sure to not pass oversized frames to the socket */ if (!ro->fd_frames && can_is_canfd_skb(oskb)) return; if (can_is_canxl_skb(oskb)) { struct canxl_frame *cxl = (struct canxl_frame *)oskb->data; /* make sure to not pass oversized frames to the socket */ if (!ro->xl_frames) return; /* filter CAN XL VCID content */ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_RX_FILTER) { /* apply VCID filter if user enabled the filter */ if ((cxl->prio & ro->rx_vcid_mask_shifted) != (ro->rx_vcid_shifted & ro->rx_vcid_mask_shifted)) return; } else { /* no filter => do not forward VCID tagged frames */ if (cxl->prio & CANXL_VCID_MASK) return; } } /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) { if (!ro->join_filters) return; this_cpu_inc(ro->uniq->join_rx_count); /* drop frame until all enabled filters matched */ if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count) return; } else { this_cpu_ptr(ro->uniq)->skb = oskb; this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt; this_cpu_ptr(ro->uniq)->join_rx_count = 1; /* drop first frame to check all enabled filters? 
*/ if (ro->join_filters && ro->count > 1) return; } /* clone the given skb to be able to enqueue it into the rcv queue */ skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) return; /* Put the datagram to the queue so that raw_recvmsg() can get * it from there. We need to pass the interface index to * raw_recvmsg(). We pass a whole struct sockaddr_can in * skb->cb containing the interface index. */ sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; /* add CAN specific message flags for raw_recvmsg() */ pflags = raw_flags(skb); *pflags = 0; if (oskb->sk) *pflags |= MSG_DONTROUTE; if (oskb->sk == sk) *pflags |= MSG_CONFIRM; if (sock_queue_rcv_skb(sk, skb) < 0) kfree_skb(skb); } static int raw_enable_filters(struct net *net, struct net_device *dev, struct sock *sk, struct can_filter *filter, int count) { int err = 0; int i; for (i = 0; i < count; i++) { err = can_rx_register(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk, "raw", sk); if (err) { /* clean up successfully registered filters */ while (--i >= 0) can_rx_unregister(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk); break; } } return err; } static int raw_enable_errfilter(struct net *net, struct net_device *dev, struct sock *sk, can_err_mask_t err_mask) { int err = 0; if (err_mask) err = can_rx_register(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk, "raw", sk); return err; } static void raw_disable_filters(struct net *net, struct net_device *dev, struct sock *sk, struct can_filter *filter, int count) { int i; for (i = 0; i < count; i++) can_rx_unregister(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk); } static inline void raw_disable_errfilter(struct net *net, struct net_device *dev, struct sock *sk, can_err_mask_t err_mask) { if (err_mask) can_rx_unregister(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk); } static inline void 
raw_disable_allfilters(struct net *net, struct net_device *dev, struct sock *sk) { struct raw_sock *ro = raw_sk(sk); raw_disable_filters(net, dev, sk, ro->filter, ro->count); raw_disable_errfilter(net, dev, sk, ro->err_mask); } static int raw_enable_allfilters(struct net *net, struct net_device *dev, struct sock *sk) { struct raw_sock *ro = raw_sk(sk); int err; err = raw_enable_filters(net, dev, sk, ro->filter, ro->count); if (!err) { err = raw_enable_errfilter(net, dev, sk, ro->err_mask); if (err) raw_disable_filters(net, dev, sk, ro->filter, ro->count); } return err; } static void raw_notify(struct raw_sock *ro, unsigned long msg, struct net_device *dev) { struct sock *sk = &ro->sk; if (!net_eq(dev_net(dev), sock_net(sk))) return; if (ro->dev != dev) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ if (ro->bound) { raw_disable_allfilters(dev_net(dev), dev, sk); netdev_put(dev, &ro->dev_tracker); } if (ro->count > 1) kfree(ro->filter); ro->ifindex = 0; ro->bound = 0; ro->dev = NULL; ro->count = 0; release_sock(sk); sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; } } static int raw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(raw_busy_notifier)) /* Check for reentrant bug. 
*/ return NOTIFY_DONE; spin_lock(&raw_notifier_lock); list_for_each_entry(raw_busy_notifier, &raw_notifier_list, notifier) { spin_unlock(&raw_notifier_lock); raw_notify(raw_busy_notifier, msg, dev); spin_lock(&raw_notifier_lock); } raw_busy_notifier = NULL; spin_unlock(&raw_notifier_lock); return NOTIFY_DONE; } static int raw_init(struct sock *sk) { struct raw_sock *ro = raw_sk(sk); ro->bound = 0; ro->ifindex = 0; ro->dev = NULL; /* set default filter to single entry dfilter */ ro->dfilter.can_id = 0; ro->dfilter.can_mask = MASK_ALL; ro->filter = &ro->dfilter; ro->count = 1; /* set default loopback behaviour */ ro->loopback = 1; ro->recv_own_msgs = 0; ro->fd_frames = 0; ro->xl_frames = 0; ro->join_filters = 0; /* alloc_percpu provides zero'ed memory */ ro->uniq = alloc_percpu(struct uniqframe); if (unlikely(!ro->uniq)) return -ENOMEM; /* set notifier */ spin_lock(&raw_notifier_lock); list_add_tail(&ro->notifier, &raw_notifier_list); spin_unlock(&raw_notifier_lock); return 0; } static int raw_release(struct socket *sock) { struct sock *sk = sock->sk; struct raw_sock *ro; struct net *net; if (!sk) return 0; ro = raw_sk(sk); net = sock_net(sk); spin_lock(&raw_notifier_lock); while (raw_busy_notifier == ro) { spin_unlock(&raw_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&raw_notifier_lock); } list_del(&ro->notifier); spin_unlock(&raw_notifier_lock); rtnl_lock(); lock_sock(sk); /* remove current filters & unregister */ if (ro->bound) { if (ro->dev) { raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); netdev_put(ro->dev, &ro->dev_tracker); } else { raw_disable_allfilters(net, NULL, sk); } } if (ro->count > 1) kfree(ro->filter); ro->ifindex = 0; ro->bound = 0; ro->dev = NULL; ro->count = 0; free_percpu(ro->uniq); sock_orphan(sk); sock->sk = NULL; release_sock(sk); rtnl_unlock(); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_put(sk); return 0; } static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = 
(struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); struct net_device *dev = NULL; int ifindex; int err = 0; int notify_enetdown = 0; if (len < RAW_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; rtnl_lock(); lock_sock(sk); if (ro->bound && addr->can_ifindex == ro->ifindex) goto out; if (addr->can_ifindex) { dev = dev_get_by_index(sock_net(sk), addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_CAN) { err = -ENODEV; goto out_put_dev; } if (!(dev->flags & IFF_UP)) notify_enetdown = 1; ifindex = dev->ifindex; /* filters set by default/setsockopt */ err = raw_enable_allfilters(sock_net(sk), dev, sk); if (err) goto out_put_dev; } else { ifindex = 0; /* filters set by default/setsockopt */ err = raw_enable_allfilters(sock_net(sk), NULL, sk); } if (!err) { if (ro->bound) { /* unregister old filters */ if (ro->dev) { raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); /* drop reference to old ro->dev */ netdev_put(ro->dev, &ro->dev_tracker); } else { raw_disable_allfilters(sock_net(sk), NULL, sk); } } ro->ifindex = ifindex; ro->bound = 1; /* bind() ok -> hold a reference for new ro->dev */ ro->dev = dev; if (ro->dev) netdev_hold(ro->dev, &ro->dev_tracker, GFP_KERNEL); } out_put_dev: /* remove potential reference from dev_get_by_index() */ dev_put(dev); out: release_sock(sk); rtnl_unlock(); if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } return err; } static int raw_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); if (peer) return -EOPNOTSUPP; memset(addr, 0, RAW_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = ro->ifindex; return RAW_MIN_NAMELEN; } static int raw_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk 
= sock->sk; struct raw_sock *ro = raw_sk(sk); struct can_filter *filter = NULL; /* dyn. alloc'ed filters */ struct can_filter sfilter; /* single filter */ struct net_device *dev = NULL; can_err_mask_t err_mask = 0; int fd_frames; int count = 0; int err = 0; if (level != SOL_CAN_RAW) return -EINVAL; switch (optname) { case CAN_RAW_FILTER: if (optlen % sizeof(struct can_filter) != 0) return -EINVAL; if (optlen > CAN_RAW_FILTER_MAX * sizeof(struct can_filter)) return -EINVAL; count = optlen / sizeof(struct can_filter); if (count > 1) { /* filter does not fit into dfilter => alloc space */ filter = memdup_sockptr(optval, optlen); if (IS_ERR(filter)) return PTR_ERR(filter); } else if (count == 1) { if (copy_from_sockptr(&sfilter, optval, sizeof(sfilter))) return -EFAULT; } rtnl_lock(); lock_sock(sk); dev = ro->dev; if (ro->bound && dev) { if (dev->reg_state != NETREG_REGISTERED) { if (count > 1) kfree(filter); err = -ENODEV; goto out_fil; } } if (ro->bound) { /* (try to) register the new filters */ if (count == 1) err = raw_enable_filters(sock_net(sk), dev, sk, &sfilter, 1); else err = raw_enable_filters(sock_net(sk), dev, sk, filter, count); if (err) { if (count > 1) kfree(filter); goto out_fil; } /* remove old filter registrations */ raw_disable_filters(sock_net(sk), dev, sk, ro->filter, ro->count); } /* remove old filter space */ if (ro->count > 1) kfree(ro->filter); /* link new filters to the socket */ if (count == 1) { /* copy filter data for single filter */ ro->dfilter = sfilter; filter = &ro->dfilter; } ro->filter = filter; ro->count = count; out_fil: release_sock(sk); rtnl_unlock(); break; case CAN_RAW_ERR_FILTER: if (optlen != sizeof(err_mask)) return -EINVAL; if (copy_from_sockptr(&err_mask, optval, optlen)) return -EFAULT; err_mask &= CAN_ERR_MASK; rtnl_lock(); lock_sock(sk); dev = ro->dev; if (ro->bound && dev) { if (dev->reg_state != NETREG_REGISTERED) { err = -ENODEV; goto out_err; } } /* remove current error mask */ if (ro->bound) { /* (try to) register 
the new err_mask */ err = raw_enable_errfilter(sock_net(sk), dev, sk, err_mask); if (err) goto out_err; /* remove old err_mask registration */ raw_disable_errfilter(sock_net(sk), dev, sk, ro->err_mask); } /* link new err_mask to the socket */ ro->err_mask = err_mask; out_err: release_sock(sk); rtnl_unlock(); break; case CAN_RAW_LOOPBACK: if (optlen != sizeof(ro->loopback)) return -EINVAL; if (copy_from_sockptr(&ro->loopback, optval, optlen)) return -EFAULT; break; case CAN_RAW_RECV_OWN_MSGS: if (optlen != sizeof(ro->recv_own_msgs)) return -EINVAL; if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen)) return -EFAULT; break; case CAN_RAW_FD_FRAMES: if (optlen != sizeof(fd_frames)) return -EINVAL; if (copy_from_sockptr(&fd_frames, optval, optlen)) return -EFAULT; /* Enabling CAN XL includes CAN FD */ if (ro->xl_frames && !fd_frames) return -EINVAL; ro->fd_frames = fd_frames; break; case CAN_RAW_XL_FRAMES: if (optlen != sizeof(ro->xl_frames)) return -EINVAL; if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) return -EFAULT; /* Enabling CAN XL includes CAN FD */ if (ro->xl_frames) ro->fd_frames = ro->xl_frames; break; case CAN_RAW_XL_VCID_OPTS: if (optlen != sizeof(ro->raw_vcid_opts)) return -EINVAL; if (copy_from_sockptr(&ro->raw_vcid_opts, optval, optlen)) return -EFAULT; /* prepare 32 bit values for handling in hot path */ ro->tx_vcid_shifted = ro->raw_vcid_opts.tx_vcid << CANXL_VCID_OFFSET; ro->rx_vcid_shifted = ro->raw_vcid_opts.rx_vcid << CANXL_VCID_OFFSET; ro->rx_vcid_mask_shifted = ro->raw_vcid_opts.rx_vcid_mask << CANXL_VCID_OFFSET; break; case CAN_RAW_JOIN_FILTERS: if (optlen != sizeof(ro->join_filters)) return -EINVAL; if (copy_from_sockptr(&ro->join_filters, optval, optlen)) return -EFAULT; break; default: return -ENOPROTOOPT; } return err; } static int raw_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); int len; void *val; if (level != 
SOL_CAN_RAW) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case CAN_RAW_FILTER: { int err = 0; lock_sock(sk); if (ro->count > 0) { int fsize = ro->count * sizeof(struct can_filter); /* user space buffer to small for filter list? */ if (len < fsize) { /* return -ERANGE and needed space in optlen */ err = -ERANGE; if (put_user(fsize, optlen)) err = -EFAULT; } else { if (len > fsize) len = fsize; if (copy_to_user(optval, ro->filter, len)) err = -EFAULT; } } else { len = 0; } release_sock(sk); if (!err) err = put_user(len, optlen); return err; } case CAN_RAW_ERR_FILTER: if (len > sizeof(can_err_mask_t)) len = sizeof(can_err_mask_t); val = &ro->err_mask; break; case CAN_RAW_LOOPBACK: if (len > sizeof(int)) len = sizeof(int); val = &ro->loopback; break; case CAN_RAW_RECV_OWN_MSGS: if (len > sizeof(int)) len = sizeof(int); val = &ro->recv_own_msgs; break; case CAN_RAW_FD_FRAMES: if (len > sizeof(int)) len = sizeof(int); val = &ro->fd_frames; break; case CAN_RAW_XL_FRAMES: if (len > sizeof(int)) len = sizeof(int); val = &ro->xl_frames; break; case CAN_RAW_XL_VCID_OPTS: { int err = 0; /* user space buffer to small for VCID opts? 
*/ if (len < sizeof(ro->raw_vcid_opts)) { /* return -ERANGE and needed space in optlen */ err = -ERANGE; if (put_user(sizeof(ro->raw_vcid_opts), optlen)) err = -EFAULT; } else { if (len > sizeof(ro->raw_vcid_opts)) len = sizeof(ro->raw_vcid_opts); if (copy_to_user(optval, &ro->raw_vcid_opts, len)) err = -EFAULT; } if (!err) err = put_user(len, optlen); return err; } case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); val = &ro->join_filters; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, val, len)) return -EFAULT; return 0; } static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb) { struct canxl_frame *cxl = (struct canxl_frame *)skb->data; /* sanitize non CAN XL bits */ cxl->prio &= (CANXL_PRIO_MASK | CANXL_VCID_MASK); /* clear VCID in CAN XL frame if pass through is disabled */ if (!(ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_PASS)) cxl->prio &= CANXL_PRIO_MASK; /* set VCID in CAN XL frame if enabled */ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_SET) { cxl->prio &= CANXL_PRIO_MASK; cxl->prio |= ro->tx_vcid_shifted; } } static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) { /* Classical CAN -> no checks for flags and device capabilities */ if (can_is_can_skb(skb)) return CAN_MTU; /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ if (ro->fd_frames && can_is_canfd_skb(skb) && (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) return CANFD_MTU; /* CAN XL -> needs to be enabled and a CAN XL device */ if (ro->xl_frames && can_is_canxl_skb(skb) && can_is_canxl_dev_mtu(mtu)) return CANXL_MTU; return 0; } static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); struct sockcm_cookie sockc; struct sk_buff *skb; struct net_device *dev; unsigned int txmtu; int ifindex; int err = -EINVAL; /* check for valid CAN frame sizes */ if (size < 
CANXL_HDR_SIZE + CANXL_MIN_DLEN || size > CANXL_MTU) return -EINVAL; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < RAW_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; ifindex = addr->can_ifindex; } else { ifindex = ro->ifindex; } dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENXIO; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto put_dev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; /* fill the skb before testing for valid CAN frames */ err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto free_skb; err = -EINVAL; /* check for valid CAN (CC/FD/XL) frame content */ txmtu = raw_check_txframe(ro, skb, dev->mtu); if (!txmtu) goto free_skb; /* only CANXL: clear/forward/set VCID value */ if (txmtu == CANXL_MTU) raw_put_canxl_vcid(ro, skb); sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto free_skb; } skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, &sockc); err = can_send(skb, ro->loopback); dev_put(dev); if (err) goto send_failed; return size; free_skb: kfree_skb(skb); put_dev: dev_put(dev); send_failed: return err; } static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int err = 0; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sk, msg, size, SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { skb_free_datagram(sk, skb); return err; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { 
__sockaddr_check_size(RAW_MIN_NAMELEN); msg->msg_namelen = RAW_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* assign the flags that have been recorded in raw_rcv() */ msg->msg_flags |= *(raw_flags(skb)); skb_free_datagram(sk, skb); return size; } static int raw_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops raw_ops = { .family = PF_CAN, .release = raw_release, .bind = raw_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = raw_getname, .poll = datagram_poll, .ioctl = raw_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .mmap = sock_no_mmap, }; static struct proto raw_proto __read_mostly = { .name = "CAN_RAW", .owner = THIS_MODULE, .obj_size = sizeof(struct raw_sock), .init = raw_init, }; static const struct can_proto raw_can_proto = { .type = SOCK_RAW, .protocol = CAN_RAW, .ops = &raw_ops, .prot = &raw_proto, }; static struct notifier_block canraw_notifier = { .notifier_call = raw_notifier }; static __init int raw_module_init(void) { int err; pr_info("can: raw protocol\n"); err = register_netdevice_notifier(&canraw_notifier); if (err) return err; err = can_proto_register(&raw_can_proto); if (err < 0) { pr_err("can: registration of raw protocol failed\n"); goto register_proto_failed; } return 0; register_proto_failed: unregister_netdevice_notifier(&canraw_notifier); return err; } static __exit void raw_module_exit(void) { can_proto_unregister(&raw_can_proto); unregister_netdevice_notifier(&canraw_notifier); } module_init(raw_module_init); module_exit(raw_module_exit);
114 1003 84 1003 245 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Skb ref helpers. * */ #ifndef _LINUX_SKBUFF_REF_H #define _LINUX_SKBUFF_REF_H #include <linux/skbuff.h> /** * __skb_frag_ref - take an addition reference on a paged fragment. * @frag: the paged fragment * * Takes an additional reference on the paged fragment @frag. */ static inline void __skb_frag_ref(skb_frag_t *frag) { get_netmem(skb_frag_netmem(frag)); } /** * skb_frag_ref - take an addition reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset. * * Takes an additional reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_ref(struct sk_buff *skb, int f) { __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } bool napi_pp_put_page(netmem_ref netmem); static inline void skb_page_unref(netmem_ref netmem, bool recycle) { #ifdef CONFIG_PAGE_POOL if (recycle && napi_pp_put_page(netmem)) return; #endif put_netmem(netmem); } /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment * @recycle: recycle the page if allocated via page_pool * * Releases a reference on the paged fragment @frag * or recycles the page via the page_pool API. */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { skb_page_unref(skb_frag_netmem(frag), recycle); } /** * skb_frag_unref - release a reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset * * Releases a reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (!skb_zcopy_managed(skb)) __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); } #endif /* _LINUX_SKBUFF_REF_H */
1 54 52 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 #ifndef __LINUX_ERSPAN_H #define __LINUX_ERSPAN_H /* * GRE header for ERSPAN type I encapsulation (4 octets [34:37]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |0|0|0|0|0|00000|000000000|00000| Protocol Type for ERSPAN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * The Type I ERSPAN frame format is based on the barebones IP + GRE * encapsulation (as described above) on top of the raw mirrored frame. * There is no extra ERSPAN header. 
* * * GRE header for ERSPAN type II and II encapsulation (8 octets [34:41]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |0|0|0|1|0|00000|000000000|00000| Protocol Type for ERSPAN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sequence Number (increments per packet per session) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Note that in the above GRE header [RFC1701] out of the C, R, K, S, * s, Recur, Flags, Version fields only S (bit 03) is set to 1. The * other fields are set to zero, so only a sequence number follows. * * ERSPAN Version 1 (Type II) header (8 octets [42:49]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ver | VLAN | COS | En|T| Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Reserved | Index | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * ERSPAN Version 2 (Type III) header (12 octets [42:49]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ver | VLAN | COS |BSO|T| Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Timestamp | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | SGT |P| FT | Hw ID |D|Gra|O| * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Platform Specific SubHeader (8 octets, optional) * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Platf ID | Platform Specific Info | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Platform Specific Info | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * GRE proto ERSPAN type I/II = 0x88BE, type III = 0x22EB */ #include <linux/ip.h> 
#include <linux/ipv6.h> #include <linux/skbuff.h> #include <uapi/linux/erspan.h> #define ERSPAN_VERSION 0x1 /* ERSPAN type II */ #define VER_MASK 0xf000 #define VLAN_MASK 0x0fff #define COS_MASK 0xe000 #define EN_MASK 0x1800 #define T_MASK 0x0400 #define ID_MASK 0x03ff #define INDEX_MASK 0xfffff #define ERSPAN_VERSION2 0x2 /* ERSPAN type III*/ #define BSO_MASK EN_MASK #define SGT_MASK 0xffff0000 #define P_MASK 0x8000 #define FT_MASK 0x7c00 #define HWID_MASK 0x03f0 #define DIR_MASK 0x0008 #define GRA_MASK 0x0006 #define O_MASK 0x0001 #define HWID_OFFSET 4 #define DIR_OFFSET 3 enum erspan_encap_type { ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ ERSPAN_ENCAP_8021Q = 0x2, /* originally 802.1Q encapsulated */ ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag preserved in frame */ }; #define ERSPAN_V1_MDSIZE 4 #define ERSPAN_V2_MDSIZE 8 struct erspan_base_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 vlan_upper:4, ver:4; __u8 vlan:8; __u8 session_id_upper:2, t:1, en:2, cos:3; __u8 session_id:8; #elif defined(__BIG_ENDIAN_BITFIELD) __u8 ver: 4, vlan_upper:4; __u8 vlan:8; __u8 cos:3, en:2, t:1, session_id_upper:2; __u8 session_id:8; #else #error "Please fix <asm/byteorder.h>" #endif }; static inline void set_session_id(struct erspan_base_hdr *ershdr, u16 id) { ershdr->session_id = id & 0xff; ershdr->session_id_upper = (id >> 8) & 0x3; } static inline u16 get_session_id(const struct erspan_base_hdr *ershdr) { return (ershdr->session_id_upper << 8) + ershdr->session_id; } static inline void set_vlan(struct erspan_base_hdr *ershdr, u16 vlan) { ershdr->vlan = vlan & 0xff; ershdr->vlan_upper = (vlan >> 8) & 0xf; } static inline u16 get_vlan(const struct erspan_base_hdr *ershdr) { return (ershdr->vlan_upper << 8) + ershdr->vlan; } static inline void set_hwid(struct erspan_md2 *md2, u8 hwid) { md2->hwid = hwid & 0xf; md2->hwid_upper = (hwid >> 4) & 0x3; } static inline u8 get_hwid(const struct erspan_md2 *md2) { 
return (md2->hwid_upper << 4) + md2->hwid; } static inline int erspan_hdr_len(int version) { if (version == 0) return 0; return sizeof(struct erspan_base_hdr) + (version == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); } static inline u8 tos_to_cos(u8 tos) { u8 dscp, cos; dscp = tos >> 2; cos = dscp >> 3; return cos; } static inline void erspan_build_header(struct sk_buff *skb, u32 id, u32 index, bool truncate, bool is_ipv4) { struct ethhdr *eth = (struct ethhdr *)skb->data; enum erspan_encap_type enc_type; struct erspan_base_hdr *ershdr; struct qtag_prefix { __be16 eth_type; __be16 tci; } *qp; u16 vlan_tci = 0; u8 tos; __be32 *idx; tos = is_ipv4 ? ip_hdr(skb)->tos : (ipv6_hdr(skb)->priority << 4) + (ipv6_hdr(skb)->flow_lbl[0] >> 4); enc_type = ERSPAN_ENCAP_NOVLAN; /* If mirrored packet has vlan tag, extract tci and * preserve vlan header in the mirrored frame. */ if (eth->h_proto == htons(ETH_P_8021Q)) { qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); vlan_tci = ntohs(qp->tci); enc_type = ERSPAN_ENCAP_INFRAME; } skb_push(skb, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); ershdr = (struct erspan_base_hdr *)skb->data; memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); /* Build base header */ ershdr->ver = ERSPAN_VERSION; ershdr->cos = tos_to_cos(tos); ershdr->en = enc_type; ershdr->t = truncate; set_vlan(ershdr, vlan_tci); set_session_id(ershdr, id); /* Build metadata */ idx = (__be32 *)(ershdr + 1); *idx = htonl(index & INDEX_MASK); } /* ERSPAN GRA: timestamp granularity * 00b --> granularity = 100 microseconds * 01b --> granularity = 100 nanoseconds * 10b --> granularity = IEEE 1588 * Here we only support 100 microseconds. */ static inline __be32 erspan_get_timestamp(void) { u64 h_usecs; ktime_t kt; kt = ktime_get_real(); h_usecs = ktime_divns(kt, 100 * NSEC_PER_USEC); /* ERSPAN base header only has 32-bit, * so it wraps around 4 days. 
*/ return htonl((u32)h_usecs); } /* ERSPAN BSO (Bad/Short/Oversized), see RFC1757 * 00b --> Good frame with no error, or unknown integrity * 01b --> Payload is a Short Frame * 10b --> Payload is an Oversized Frame * 11b --> Payload is a Bad Frame with CRC or Alignment Error */ enum erspan_bso { BSO_NOERROR = 0x0, BSO_SHORT = 0x1, BSO_OVERSIZED = 0x2, BSO_BAD = 0x3, }; static inline u8 erspan_detect_bso(struct sk_buff *skb) { /* BSO_BAD is not handled because the frame CRC * or alignment error information is in FCS. */ if (skb->len < ETH_ZLEN) return BSO_SHORT; if (skb->len > ETH_FRAME_LEN) return BSO_OVERSIZED; return BSO_NOERROR; } static inline void erspan_build_header_v2(struct sk_buff *skb, u32 id, u8 direction, u16 hwid, bool truncate, bool is_ipv4) { struct ethhdr *eth = (struct ethhdr *)skb->data; struct erspan_base_hdr *ershdr; struct erspan_md2 *md2; struct qtag_prefix { __be16 eth_type; __be16 tci; } *qp; u16 vlan_tci = 0; u8 gra = 0; /* 100 usec */ u8 bso = 0; /* Bad/Short/Oversized */ u8 sgt = 0; u8 tos; tos = is_ipv4 ? ip_hdr(skb)->tos : (ipv6_hdr(skb)->priority << 4) + (ipv6_hdr(skb)->flow_lbl[0] >> 4); /* Unlike v1, v2 does not have En field, * so only extract vlan tci field. */ if (eth->h_proto == htons(ETH_P_8021Q)) { qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); vlan_tci = ntohs(qp->tci); } bso = erspan_detect_bso(skb); skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); ershdr = (struct erspan_base_hdr *)skb->data; memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); /* Build base header */ ershdr->ver = ERSPAN_VERSION2; ershdr->cos = tos_to_cos(tos); ershdr->en = bso; ershdr->t = truncate; set_vlan(ershdr, vlan_tci); set_session_id(ershdr, id); /* Build metadata */ md2 = (struct erspan_md2 *)(ershdr + 1); md2->timestamp = erspan_get_timestamp(); md2->sgt = htons(sgt); md2->p = 1; md2->ft = 0; md2->dir = direction; md2->gra = gra; md2->o = 0; set_hwid(md2, hwid); } #endif
7 1 6 2 2 1 1 1 1 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2013 Eric Leblond <eric@regit.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_reject.h> #include <linux/icmp.h> #include <linux/icmpv6.h> const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; EXPORT_SYMBOL_GPL(nft_reject_policy); int nft_reject_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_PRE_ROUTING)); } EXPORT_SYMBOL_GPL(nft_reject_validate); int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (tb[NFTA_REJECT_ICMP_CODE] == NULL) return -EINVAL; icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); if (priv->type == NFT_REJECT_ICMPX_UNREACH && icmp_code > 
NFT_REJECT_ICMPX_MAX) return -EINVAL; priv->icmp_code = icmp_code; break; case NFT_REJECT_TCP_RST: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL_GPL(nft_reject_init); int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_reject *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) goto nla_put_failure; switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nft_reject_dump); static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED, }; int nft_reject_icmp_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMP_NET_UNREACH; return icmp_code_v4[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmp_code); static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED, }; int nft_reject_icmpv6_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMPV6_NOROUTE; return icmp_code_v6[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Netfilter x_tables over nftables module");
63 3 57 57 31 16 31 9 3 57 57 9 31 49 40 9 49 9 7 2 4 3 1 1 1 4 1 3 1 51 21 2 19 1 80 3 4 72 70 1 48 55 2 66 5 47 19 11 11 2 9 49 57 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 
493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 /* net/tipc/udp_media.c: IP bearer support for TIPC * * Copyright (c) 2015, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/

#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/udp_tunnel.h>
#include <net/ipv6_stubs.h>
#include <linux/tipc_netlink.h>
#include "core.h"
#include "addr.h"
#include "net.h"
#include "bearer.h"
#include "netlink.h"
#include "msg.h"
#include "udp_media.h"

/* IANA assigned UDP port */
#define UDP_PORT_DEFAULT	6118

/* Minimum skb headroom tipc_udp_send_msg() guarantees before transmit */
#define UDP_MIN_HEADROOM        48

/**
 * struct udp_media_addr - IP/UDP addressing information
 *
 * This is the bearer level originating address used in neighbor discovery
 * messages, and all fields should be in network byte order
 *
 * @proto:	Ethernet protocol in use
 * @port:	port being used
 * @ipv4:	IPv4 address of neighbor
 * @ipv6:	IPv6 address of neighbor
 */
struct udp_media_addr {
	__be16	proto;
	__be16	port;
	union {
		struct in_addr ipv4;
		struct in6_addr ipv6;
	};
};

/* struct udp_replicast - container for UDP remote addresses
 *
 * One entry per replicast peer, linked on udp_bearer.rcast.list; each entry
 * carries its own dst cache and is freed through kfree_rcu() using @rcu.
 */
struct udp_replicast {
	struct udp_media_addr addr;
	struct dst_cache dst_cache;
	struct rcu_head rcu;
	struct list_head list;
};

/**
 * struct udp_bearer - ip/udp bearer data structure
 * @bearer:	associated generic tipc bearer
 * @ubsock:	bearer associated socket
 * @ifindex:	local address scope
 * @work:	used to schedule deferred work on a bearer
 * @rcast:	associated udp_replicast container
 */
struct udp_bearer {
	struct tipc_bearer __rcu *bearer;
	struct socket *ubsock;
	u32 ifindex;
	struct work_struct work;
	struct udp_replicast rcast;
};

/* tipc_udp_is_mcast_addr - test whether a udp media address is multicast
 *
 * For ETH_P_IP the IPv4 address is tested.  Any other protocol is tested
 * as an IPv6 address when CONFIG_IPV6 is enabled, and yields 0 otherwise.
 */
static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr)
{
	if (ntohs(addr->proto) == ETH_P_IP)
		return ipv4_is_multicast(addr->ipv4.s_addr);
#if IS_ENABLED(CONFIG_IPV6)
	else
		return ipv6_addr_is_multicast(&addr->ipv6);
#endif
	return 0;
}

/* tipc_udp_media_addr_set - convert a ip/udp address to a TIPC media address
 *
 * Zeroes @addr, tags it as UDP media, copies @ua into the value field and
 * marks it broadcast-capable when @ua is a multicast address.
 */
static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
				    struct udp_media_addr *ua)
{
	memset(addr, 0, sizeof(struct tipc_media_addr));
	addr->media_id = TIPC_MEDIA_TYPE_UDP;
	memcpy(addr->value, ua, sizeof(struct udp_media_addr));

	if (tipc_udp_is_mcast_addr(ua))
		addr->broadcast = TIPC_BROADCAST_SUPPORT;
}

/* tipc_udp_addr2str - convert ip/udp address to string
 *
 * Writes "<ip>:<port>" into @buf.  Returns 0 on success and 1 when the
 * stored protocol is neither ETH_P_IP nor ETH_P_IPV6.
 */
static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size)
{
	struct udp_media_addr *ua = (struct udp_media_addr *)&a->value;

	if (ntohs(ua->proto) == ETH_P_IP)
		snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->port));
	else if (ntohs(ua->proto) == ETH_P_IPV6)
		snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->port));
	else {
		pr_err("Invalid UDP media address\n");
		return 1;
	}

	return 0;
}

/* tipc_udp_msg2addr - extract an ip/udp address from a TIPC ndisc message
 *
 * Returns -EINVAL when the media type byte in @msg is not UDP, 0 otherwise.
 */
static int tipc_udp_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *a,
			     char *msg)
{
	struct udp_media_addr *ua;

	ua = (struct udp_media_addr *) (msg + TIPC_MEDIA_ADDR_OFFSET);
	if (msg[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_UDP)
		return -EINVAL;
	tipc_udp_media_addr_set(a, ua);
	return 0;
}

/* tipc_udp_addr2msg - write an ip/udp address to a TIPC ndisc message
 *
 * Clears TIPC_MEDIA_INFO_SIZE bytes of @msg, then stores the media type and
 * the raw udp_media_addr.  Always returns 0.
 */
static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a)
{
	memset(msg, 0, TIPC_MEDIA_INFO_SIZE);
	msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_UDP;
	memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, a->value,
	       sizeof(struct udp_media_addr));
	return 0;
}

/* tipc_udp_xmit - transmit one skb to one destination over the bearer socket
 *
 * Runs with bottom halves disabled.  The route is taken from @cache; on a
 * cache miss it is looked up and stored back into @cache.  When the route
 * lookup fails the skb is freed here and the error is returned.  On the
 * IPv4 path the return value of the tunnel transmit is not captured, so err
 * stays 0; on the IPv6 path the result of udp_tunnel6_xmit_skb() is
 * returned.
 */
static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
			 struct udp_bearer *ub, struct udp_media_addr *src,
			 struct udp_media_addr *dst, struct dst_cache *cache)
{
	struct dst_entry *ndst;
	int ttl, err = 0;

	local_bh_disable();
	ndst = dst_cache_get(cache);
	if (dst->proto == htons(ETH_P_IP)) {
		struct rtable *rt = dst_rtable(ndst);

		if (!rt) {
			struct flowi4 fl = {
				.daddr = dst->ipv4.s_addr,
				.saddr = src->ipv4.s_addr,
				.flowi4_mark = skb->mark,
				.flowi4_proto = IPPROTO_UDP
			};
			rt = ip_route_output_key(net, &fl);
			if (IS_ERR(rt)) {
				err = PTR_ERR(rt);
				goto tx_error;
			}
			dst_cache_set_ip4(cache, &rt->dst, fl.saddr);
		}

		ttl = ip4_dst_hoplimit(&rt->dst);
		udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr,
				    dst->ipv4.s_addr, 0, ttl, 0, src->port,
				    dst->port, false, true);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		if (!ndst) {
			struct flowi6 fl6 = {
				.flowi6_oif = ub->ifindex,
				.daddr = dst->ipv6,
				.saddr = src->ipv6,
				.flowi6_proto = IPPROTO_UDP
			};
			ndst = ipv6_stub->ipv6_dst_lookup_flow(net,
							       ub->ubsock->sk,
							       &fl6, NULL);
			if (IS_ERR(ndst)) {
				err = PTR_ERR(ndst);
				goto tx_error;
			}
			dst_cache_set_ip6(cache, ndst, &fl6.saddr);
		}
		ttl = ip6_dst_hoplimit(ndst);
		err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL,
					   &src->ipv6, &dst->ipv6, 0, ttl, 0,
					   src->port, dst->port, false);
#endif
	}
	local_bh_enable();
	return err;

tx_error:
	local_bh_enable();
	kfree_skb(skb);
	return err;
}

/* tipc_udp_send_msg - send_msg callback of the UDP media
 *
 * Ensures UDP_MIN_HEADROOM bytes of headroom, then either hands @skb to
 * tipc_udp_xmit() for a single destination, or (replicast) sends a
 * pskb_copy() to every entry on the replicast list and frees the original.
 * Returns -ENODEV when the bearer has no udp_bearer attached, -ENOMEM when
 * a copy cannot be allocated, or the first transmit error.
 */
static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
			     struct tipc_bearer *b,
			     struct tipc_media_addr *addr)
{
	struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
	struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value;
	struct udp_replicast *rcast;
	struct udp_bearer *ub;
	int err = 0;

	if (skb_headroom(skb) < UDP_MIN_HEADROOM) {
		err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
		if (err)
			goto out;
	}

	skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
	ub = rcu_dereference(b->media_ptr);
	if (!ub) {
		err = -ENODEV;
		goto out;
	}

	/* Single destination: the skb is passed on, not freed here */
	if (addr->broadcast != TIPC_REPLICAST_SUPPORT)
		return tipc_udp_xmit(net, skb, ub, src, dst,
				     &ub->rcast.dst_cache);

	/* Replicast, send an skb to each configured IP address */
	list_for_each_entry_rcu(rcast, &ub->rcast.list, list) {
		struct sk_buff *_skb;

		_skb = pskb_copy(skb, GFP_ATOMIC);
		if (!_skb) {
			err = -ENOMEM;
			goto out;
		}

		err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr,
				    &rcast->dst_cache);
		if (err)
			goto out;
	}
	err = 0;
out:
	kfree_skb(skb);
	return err;
}

/* tipc_udp_is_known_peer - check whether @addr is already a replicast peer
 *
 * Compares @addr bytewise against every entry on the bearer's replicast
 * list.  Returns false when the bearer has no udp_bearer attached.
 */
static bool tipc_udp_is_known_peer(struct tipc_bearer *b,
				   struct udp_media_addr *addr)
{
	struct udp_replicast *rcast, *tmp;
	struct udp_bearer *ub;

	ub = rcu_dereference_rtnl(b->media_ptr);
	if (!ub) {
		pr_err_ratelimited("UDP bearer instance not found\n");
		return false;
	}

	list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
		if (!memcmp(&rcast->addr, addr, sizeof(struct udp_media_addr)))
			return true;
	}

	return false;
}

/* tipc_udp_rcast_add - add a new replicast peer to the bearer
 *
 * Allocates the entry with GFP_ATOMIC, initialises its dst cache, switches
 * the bearer's broadcast mode to replicast and links the entry onto the
 * replicast list.  No duplicate check is made here; the callers in this
 * file test tipc_udp_is_known_peer() first.
 * Returns 0, -ENODEV (no udp_bearer) or -ENOMEM.
 */
static int tipc_udp_rcast_add(struct tipc_bearer *b,
			      struct udp_media_addr *addr)
{
	struct udp_replicast *rcast;
	struct udp_bearer *ub;

	ub = rcu_dereference_rtnl(b->media_ptr);
	if (!ub)
		return -ENODEV;

	rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC);
	if (!rcast)
		return -ENOMEM;

	if (dst_cache_init(&rcast->dst_cache, GFP_ATOMIC)) {
		kfree(rcast);
		return -ENOMEM;
	}

	memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr));

	if (ntohs(addr->proto) == ETH_P_IP)
		pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4);
#if IS_ENABLED(CONFIG_IPV6)
	else if (ntohs(addr->proto) == ETH_P_IPV6)
		pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
#endif
	b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT;
	list_add_rcu(&rcast->list, &ub->rcast.list);
	return 0;
}

/* tipc_udp_rcast_disc - learn a replicast peer from a received packet
 *
 * Does nothing (returns 0) when the bearer's broadcast address is
 * multicast, when the packet was sent to a multicast destination, when the
 * IP version is neither 4 nor 6, or when the sender is already known.
 * Otherwise the sender's address and UDP source port are added as a new
 * replicast peer and the result of tipc_udp_rcast_add() is returned.
 */
static int tipc_udp_rcast_disc(struct tipc_bearer *b, struct sk_buff *skb)
{
	struct udp_media_addr src = {0};
	struct udp_media_addr *dst;

	dst = (struct udp_media_addr *)&b->bcast_addr.value;
	if (tipc_udp_is_mcast_addr(dst))
		return 0;

	src.port = udp_hdr(skb)->source;

	if (ip_hdr(skb)->version == 4) {
		struct iphdr *iphdr = ip_hdr(skb);

		src.proto = htons(ETH_P_IP);
		src.ipv4.s_addr = iphdr->saddr;
		if (ipv4_is_multicast(iphdr->daddr))
			return 0;
#if IS_ENABLED(CONFIG_IPV6)
	} else if (ip_hdr(skb)->version == 6) {
		struct ipv6hdr *iphdr = ipv6_hdr(skb);

		src.proto = htons(ETH_P_IPV6);
		src.ipv6 = iphdr->saddr;
		if (ipv6_addr_is_multicast(&iphdr->daddr))
			return 0;
#endif
	} else {
		return 0;
	}

	if (likely(tipc_udp_is_known_peer(b, &src)))
		return 0;

	return tipc_udp_rcast_add(b, &src);
}

/* tipc_udp_recv - read data from bearer socket
 *
 * encap_rcv callback installed by tipc_udp_enable().  Strips the UDP
 * header; when the bearer is up (bit 0 of b->up) the skb is handed to
 * tipc_rcv() and not freed here.  On every other path, including the
 * LINK_CONFIG peer-discovery path, the skb is freed.  Always returns 0.
 */
static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
{
	struct udp_bearer *ub;
	struct tipc_bearer *b;
	struct tipc_msg *hdr;
	int err;

	ub = rcu_dereference_sk_user_data(sk);
	if (!ub) {
		pr_err_ratelimited("Failed to get UDP bearer reference");
		goto out;
	}
	skb_pull(skb, sizeof(struct udphdr));
	hdr = buf_msg(skb);

	b = rcu_dereference(ub->bearer);
	if (!b)
		goto out;

	if (b && test_bit(0, &b->up)) {
		TIPC_SKB_CB(skb)->flags = 0;
		tipc_rcv(sock_net(sk), skb, b);
		return 0;
	}

	if (unlikely(msg_user(hdr) == LINK_CONFIG)) {
		err = tipc_udp_rcast_disc(b, skb);
		if (err)
			goto out;
	}

out:
	kfree_skb(skb);
	return 0;
}

/* enable_mcast - join the multicast group given by @remote on the bearer
 * socket
 *
 * IPv4 joins through ip_mc_join_group(); any other protocol joins through
 * the ipv6 stub under the socket lock when CONFIG_IPV6 is enabled.
 * Returns the join result (0 when neither branch is compiled in/taken).
 */
static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote)
{
	int err = 0;
	struct ip_mreqn mreqn;
	struct sock *sk = ub->ubsock->sk;

	if (ntohs(remote->proto) == ETH_P_IP) {
		mreqn.imr_multiaddr = remote->ipv4;
		mreqn.imr_ifindex = ub->ifindex;
		err = ip_mc_join_group(sk, &mreqn);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		lock_sock(sk);
		err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex,
						   &remote->ipv6);
		release_sock(sk);
#endif
	}
	return err;
}

/* __tipc_nl_add_udp_addr - put @addr into @skb as a sockaddr attribute
 *
 * The address is emitted as a zero-initialised sockaddr_in or sockaddr_in6
 * under attribute type @nla_t.  An address with any other protocol is
 * silently skipped.  Returns 0 or -EMSGSIZE.
 */
static int __tipc_nl_add_udp_addr(struct sk_buff *skb,
				  struct udp_media_addr *addr, int nla_t)
{
	if (ntohs(addr->proto) == ETH_P_IP) {
		struct sockaddr_in ip4;

		memset(&ip4, 0, sizeof(ip4));
		ip4.sin_family = AF_INET;
		ip4.sin_port = addr->port;
		ip4.sin_addr.s_addr = addr->ipv4.s_addr;
		if (nla_put(skb, nla_t, sizeof(ip4), &ip4))
			return -EMSGSIZE;

#if IS_ENABLED(CONFIG_IPV6)
	} else if (ntohs(addr->proto) == ETH_P_IPV6) {
		struct sockaddr_in6 ip6;

		memset(&ip6, 0, sizeof(ip6));
		ip6.sin6_family = AF_INET6;
		ip6.sin6_port  = addr->port;
		memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr));
		if (nla_put(skb, nla_t, sizeof(ip6), &ip6))
			return -EMSGSIZE;
#endif
	}

	return 0;
}

/* tipc_udp_nl_dump_remoteip - netlink dump of a bearer's replicast peers
 *
 * Resumable dump: cb->args[0] holds the bearer identity and cb->args[1]
 * the number of list entries already emitted.  On the first call (both
 * zero) the bearer is looked up by the name in the request attributes;
 * later calls index the bearer list directly.  The replicast list is
 * walked under rtnl_lock.  Returns skb->len, or -EINVAL / a parse error
 * when the request or bearer is invalid.
 */
int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb)
{
	u32 bid = cb->args[0];
	u32 skip_cnt = cb->args[1];
	u32 portid = NETLINK_CB(cb->skb).portid;
	struct udp_replicast *rcast, *tmp;
	struct tipc_bearer *b;
	struct udp_bearer *ub;
	void *hdr;
	int err;
	int i;

	if (!bid && !skip_cnt) {
		struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs;
		struct net *net = sock_net(skb->sk);
		struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1];
		char *bname;

		if (!attrs[TIPC_NLA_BEARER])
			return -EINVAL;

		err = nla_parse_nested_deprecated(battrs, TIPC_NLA_BEARER_MAX,
						  attrs[TIPC_NLA_BEARER],
						  tipc_nl_bearer_policy, NULL);
		if (err)
			return err;

		if (!battrs[TIPC_NLA_BEARER_NAME])
			return -EINVAL;

		bname = nla_data(battrs[TIPC_NLA_BEARER_NAME]);

		rtnl_lock();
		b = tipc_bearer_find(net, bname);
		if (!b) {
			rtnl_unlock();
			return -EINVAL;
		}
		bid = b->identity;
	} else {
		struct net *net = sock_net(skb->sk);
		struct tipc_net *tn = net_generic(net, tipc_net_id);

		rtnl_lock();
		b = rtnl_dereference(tn->bearer_list[bid]);
		if (!b) {
			rtnl_unlock();
			return -EINVAL;
		}
	}

	ub = rtnl_dereference(b->media_ptr);
	if (!ub) {
		rtnl_unlock();
		return -EINVAL;
	}

	i = 0;
	list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
		/* Entries before skip_cnt were sent by an earlier call */
		if (i < skip_cnt)
			goto count;

		hdr = genlmsg_put(skb, portid, cb->nlh->nlmsg_seq,
				  &tipc_genl_family, NLM_F_MULTI,
				  TIPC_NL_BEARER_GET);
		if (!hdr)
			goto done;

		err = __tipc_nl_add_udp_addr(skb, &rcast->addr,
					     TIPC_NLA_UDP_REMOTE);
		if (err) {
			genlmsg_cancel(skb, hdr);
			goto done;
		}
		genlmsg_end(skb, hdr);
count:
		i++;
	}
done:
	rtnl_unlock();
	/* Save the resume position for the next dump call */
	cb->args[0] = bid;
	cb->args[1] = i;

	return skb->len;
}

/* tipc_udp_nl_add_bearer_data - add the UDP options nest to a bearer message
 *
 * Emits the local address, the broadcast (remote) address and, when the
 * replicast list is non-empty, the TIPC_NLA_UDP_MULTI_REMOTEIP flag.
 * Returns 0, -ENODEV (no udp_bearer) or -EMSGSIZE; on -EMSGSIZE the nest is
 * cancelled.
 */
int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b)
{
	struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
	struct udp_media_addr *dst;
	struct udp_bearer *ub;
	struct nlattr *nest;

	ub = rtnl_dereference(b->media_ptr);
	if (!ub)
		return -ENODEV;

	nest = nla_nest_start_noflag(msg->skb, TIPC_NLA_BEARER_UDP_OPTS);
	if (!nest)
		goto msg_full;

	if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL))
		goto msg_full;

	dst = (struct udp_media_addr *)&b->bcast_addr.value;
	if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE))
		goto msg_full;

	if (!list_empty(&ub->rcast.list)) {
		if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP))
			goto msg_full;
	}

	nla_nest_end(msg->skb, nest);
	return 0;
msg_full:
	nla_nest_cancel(msg->skb, nest);
	return -EMSGSIZE;
}

/**
 * tipc_parse_udp_addr - build udp media address from netlink data
 * @nla:	netlink attribute containing sockaddr storage aligned address
 * @addr:	tipc media address to fill with address, port and protocol type
 * @scope_id:	IPv6 scope id pointer, not NULL indicates it's required
 *
 * Return: 0 on success, -EINVAL when an IPv6 address needs a scope id but
 * none was supplied, -EADDRNOTAVAIL for an unsupported address family.
 */
static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr,
			       u32 *scope_id)
{
	struct sockaddr_storage sa;

	nla_memcpy(&sa, nla, sizeof(sa));
	if (sa.ss_family == AF_INET) {
		struct sockaddr_in *ip4 = (struct sockaddr_in *)&sa;

		addr->proto = htons(ETH_P_IP);
		addr->port = ip4->sin_port;
		addr->ipv4.s_addr = ip4->sin_addr.s_addr;
		return 0;

#if IS_ENABLED(CONFIG_IPV6)
	} else if (sa.ss_family == AF_INET6) {
		struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)&sa;

		addr->proto = htons(ETH_P_IPV6);
		addr->port = ip6->sin6_port;
		memcpy(&addr->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));

		/* Scope ID is only interesting for local addresses */
		if (scope_id) {
			int atype;

			atype = ipv6_addr_type(&ip6->sin6_addr);
			if (__ipv6_addr_needs_scope_id(atype) &&
			    !ip6->sin6_scope_id) {
				return -EINVAL;
			}

			*scope_id = ip6->sin6_scope_id ? : 0;
		}

		return 0;
#endif
	}
	return -EADDRNOTAVAIL;
}

/* tipc_udp_nl_bearer_add - add a remote IP to a replicast bearer via netlink
 *
 * Parses TIPC_NLA_UDP_REMOTE from @attr.  Rejects the request with -EINVAL
 * when the attribute is missing/unparsable or when the bearer's broadcast
 * address is multicast.  An already known peer is accepted silently.
 */
int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr)
{
	int err;
	struct udp_media_addr addr = {0};
	struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
	struct udp_media_addr *dst;

	if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy, NULL))
		return -EINVAL;

	if (!opts[TIPC_NLA_UDP_REMOTE])
		return -EINVAL;

	err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL);
	if (err)
		return err;

	dst = (struct udp_media_addr *)&b->bcast_addr.value;
	if (tipc_udp_is_mcast_addr(dst)) {
		pr_err("Can't add remote ip to TIPC UDP multicast bearer\n");
		return -EINVAL;
	}

	if (tipc_udp_is_known_peer(b, &addr))
		return 0;

	return tipc_udp_rcast_add(b, &addr);
}

/**
 * tipc_udp_enable - callback to create a new udp bearer instance
 * @net:	network namespace
 * @b:		pointer to generic tipc_bearer
 * @attrs:	netlink bearer configuration
 *
 * validate the bearer parameters and initialize the udp bearer
 * rtnl_lock should be held
 *
 * Return: 0 on success or a negative errno.  On failure the udp_bearer is
 * freed; once the socket has been created it is released as well.
 */
static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
			   struct nlattr *attrs[])
{
	int err = -EINVAL;
	struct udp_bearer *ub;
	struct udp_media_addr remote = {0};
	struct udp_media_addr local = {0};
	struct udp_port_cfg udp_conf = {0};
	struct udp_tunnel_sock_cfg tuncfg = {NULL};
	struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
	u8 node_id[NODE_ID_LEN] = {0,};
	struct net_device *dev;
	int rmcast = 0;

	ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
	if (!ub)
		return -ENOMEM;

	INIT_LIST_HEAD(&ub->rcast.list);

	if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
		goto err;

	if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attrs[TIPC_NLA_BEARER_UDP_OPTS], tipc_nl_udp_policy, NULL))
		goto err;

	if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) {
		pr_err("Invalid UDP bearer configuration");
		err = -EINVAL;
		goto err;
	}

	/* The local address may carry an IPv6 scope id, kept in ub->ifindex */
	err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local,
				  &ub->ifindex);
	if (err)
		goto err;

	err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL);
	if (err)
		goto err;

	if (remote.proto != local.proto) {
		err = -EINVAL;
		goto err;
	}

	/* Checking remote ip address */
	rmcast = tipc_udp_is_mcast_addr(&remote);

	/* Autoconfigure own node identity if needed */
	if (!tipc_own_id(net)) {
		memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16);
		tipc_net_init(net, node_id, 0);
	}
	if (!tipc_own_id(net)) {
		pr_warn("Failed to set node id, please configure manually\n");
		err = -EINVAL;
		goto err;
	}

	b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
	b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
	rcu_assign_pointer(b->media_ptr, ub);
	rcu_assign_pointer(ub->bearer, b);
	tipc_udp_media_addr_set(&b->addr, &local);
	if (local.proto == htons(ETH_P_IP)) {
		dev = __ip_dev_find(net, local.ipv4.s_addr, false);
		if (!dev) {
			err = -ENODEV;
			goto err;
		}
		udp_conf.family = AF_INET;

		/* Switch to use ANY to receive packets from group */
		if (rmcast)
			udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
		else
			udp_conf.local_ip.s_addr = local.ipv4.s_addr;
		udp_conf.use_udp_checksums = false;
		ub->ifindex = dev->ifindex;
		b->encap_hlen = sizeof(struct iphdr) + sizeof(struct udphdr);
		b->mtu = b->media->mtu;
#if IS_ENABLED(CONFIG_IPV6)
	} else if (local.proto == htons(ETH_P_IPV6)) {
		dev = ub->ifindex ? __dev_get_by_index(net, ub->ifindex) : NULL;
		dev = ipv6_dev_find(net, &local.ipv6, dev);
		if (!dev) {
			err = -ENODEV;
			goto err;
		}
		udp_conf.family = AF_INET6;
		udp_conf.use_udp6_tx_checksums = true;
		udp_conf.use_udp6_rx_checksums = true;
		if (rmcast)
			udp_conf.local_ip6 = in6addr_any;
		else
			udp_conf.local_ip6 = local.ipv6;
		ub->ifindex = dev->ifindex;
		b->encap_hlen = sizeof(struct ipv6hdr) + sizeof(struct udphdr);
		b->mtu = 1280;
#endif
	} else {
		err = -EAFNOSUPPORT;
		goto err;
	}
	udp_conf.local_udp_port = local.port;
	err = udp_sock_create(net, &udp_conf, &ub->ubsock);
	if (err)
		goto err;
	tuncfg.sk_user_data = ub;
	tuncfg.encap_type = 1;
	tuncfg.encap_rcv = tipc_udp_recv;
	tuncfg.encap_destroy = NULL;
	setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg);

	err = dst_cache_init(&ub->rcast.dst_cache, GFP_ATOMIC);
	if (err)
		goto free;

	/*
	 * The bcast media address port is used for all peers and the ip
	 * is used if it's a multicast address.
	 */
	memcpy(&b->bcast_addr.value, &remote, sizeof(remote));
	if (rmcast)
		err = enable_mcast(ub, &remote);
	else
		err = tipc_udp_rcast_add(b, &remote);
	if (err)
		goto free;

	return 0;

free:
	dst_cache_destroy(&ub->rcast.dst_cache);
	udp_tunnel_sock_release(ub->ubsock);
err:
	kfree(ub);
	return err;
}

/* cleanup_bearer - break the socket/bearer association
 *
 * Work item scheduled by tipc_udp_disable().  Frees every replicast entry,
 * destroys the bearer's dst cache, releases the socket and, after
 * synchronize_net(), drops the wq_count reference and frees the udp_bearer.
 * The tipc_net pointer is fetched before the socket is released.
 */
static void cleanup_bearer(struct work_struct *work)
{
	struct udp_bearer *ub = container_of(work, struct udp_bearer, work);
	struct udp_replicast *rcast, *tmp;
	struct tipc_net *tn;

	list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
		dst_cache_destroy(&rcast->dst_cache);
		list_del_rcu(&rcast->list);
		kfree_rcu(rcast, rcu);
	}

	tn = tipc_net(sock_net(ub->ubsock->sk));

	dst_cache_destroy(&ub->rcast.dst_cache);
	udp_tunnel_sock_release(ub->ubsock);

	/* Note: could use a call_rcu() to avoid another synchronize_net() */
	synchronize_net();
	atomic_dec(&tn->wq_count);
	kfree(ub);
}

/* tipc_udp_disable - detach bearer from socket
 *
 * Marks the socket dead, clears the back-pointer to the generic bearer and
 * defers the actual teardown to cleanup_bearer() on a workqueue; wq_count
 * is raised here and dropped again by the work item.
 */
static void tipc_udp_disable(struct tipc_bearer *b)
{
	struct udp_bearer *ub;

	ub = rtnl_dereference(b->media_ptr);
	if (!ub) {
		pr_err("UDP bearer instance not found\n");
		return;
	}
	sock_set_flag(ub->ubsock->sk, SOCK_DEAD);
	RCU_INIT_POINTER(ub->bearer, NULL);

	/* sock_release need to be done outside of rtnl lock */
	atomic_inc(&tipc_net(sock_net(ub->ubsock->sk))->wq_count);
	INIT_WORK(&ub->work, cleanup_bearer);
	schedule_work(&ub->work);
}

/* Media descriptor that plugs the UDP callbacks above into TIPC */
struct tipc_media udp_media_info = {
	.send_msg	= tipc_udp_send_msg,
	.enable_media	= tipc_udp_enable,
	.disable_media	= tipc_udp_disable,
	.addr2str	= tipc_udp_addr2str,
	.addr2msg	= tipc_udp_addr2msg,
	.msg2addr	= tipc_udp_msg2addr,
	.priority	= TIPC_DEF_LINK_PRI,
	.tolerance	= TIPC_DEF_LINK_TOL,
	.min_win	= TIPC_DEF_LINK_WIN,
	.max_win	= TIPC_DEF_LINK_WIN,
	.mtu		= TIPC_DEF_LINK_UDP_MTU,
	.type_id	= TIPC_MEDIA_TYPE_UDP,
	.hwaddr_len	= 0,
	.name		= "udp"
};
8 1 3 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/ts_fsm.c A naive finite state machine text search approach * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * A finite state machine consists of n states (struct ts_fsm_token) * representing the pattern as a finite automaton. The data is read * sequentially on an octet basis. Every state token specifies the number * of recurrences and the type of value accepted which can be either a * specific character or ctype based set of characters. The available * type of recurrences include 1, (0|1), [0 n], and [1 n]. 
*
 *   The algorithm differs between strict/non-strict mode specifying
 *   whether the pattern has to start at the first octet. Strict mode
 *   is enabled by default and can be disabled by inserting
 *   TS_FSM_HEAD_IGNORE as the first token in the chain.
 *
 *   The runtime performance of the algorithm should be around O(n),
 *   however while in strict mode the average runtime can be better.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/textsearch.h>
#include <linux/textsearch_fsm.h>

/* Private configuration data: the compiled pattern as a token array */
struct ts_fsm
{
	unsigned int		ntokens;
	struct ts_fsm_token	tokens[];
};

/* other values derived from ctype.h */
#define _A		0x100 /* ascii */
#define _W		0x200 /* wildcard */

/* Map to _ctype flags and some magic numbers
 *
 * fsm_init() replaces each token's TS_FSM_* type with the mask found here;
 * match_token() then ANDs that mask with token_lookup_tbl[octet].  A mask
 * of 0 (TS_FSM_SPECIFIC) makes match_token() compare against the literal
 * value instead.
 */
static const u16 token_map[TS_FSM_TYPE_MAX+1] = {
	[TS_FSM_SPECIFIC] = 0,
	[TS_FSM_WILDCARD] = _W,
	[TS_FSM_CNTRL]	  = _C,
	[TS_FSM_LOWER]	  = _L,
	[TS_FSM_UPPER]	  = _U,
	[TS_FSM_PUNCT]	  = _P,
	[TS_FSM_SPACE]	  = _S,
	[TS_FSM_DIGIT]	  = _D,
	[TS_FSM_XDIGIT]	  = _D | _X,
	[TS_FSM_ALPHA]	  = _U | _L,
	[TS_FSM_ALNUM]	  = _U | _L | _D,
	[TS_FSM_PRINT]	  = _P | _U | _L | _D | _SP,
	[TS_FSM_GRAPH]	  = _P | _U | _L | _D,
	[TS_FSM_ASCII]	  = _A,
};

/* Per-octet class flags.  Every entry carries _W, so a wildcard token
 * matches any octet; only entries 0-127 carry _A.
 */
static const u16 token_lookup_tbl[256] = {
_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   0-  3 */
_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   4-  7 */
_W|_A|_C,      _W|_A|_C|_S,  _W|_A|_C|_S,  _W|_A|_C|_S,		/*   8- 11 */
_W|_A|_C|_S,   _W|_A|_C|_S,  _W|_A|_C,     _W|_A|_C,		/*  12- 15 */
_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  16- 19 */
_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  20- 23 */
_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  24- 27 */
_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  28- 31 */
_W|_A|_S|_SP,  _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  32- 35 */
_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  36- 39 */
_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  40- 43 */
_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  44- 47 */
_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  48- 51 */
_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  52- 55 */
_W|_A|_D,      _W|_A|_D,     _W|_A|_P,     _W|_A|_P,		/*  56- 59 */
_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  60- 63 */
_W|_A|_P,      _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U|_X,		/*  64- 67 */
_W|_A|_U|_X,   _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U,		/*  68- 71 */
_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  72- 75 */
_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  76- 79 */
_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  80- 83 */
_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  84- 87 */
_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_P,		/*  88- 91 */
_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  92- 95 */
_W|_A|_P,      _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L|_X,		/*  96- 99 */
_W|_A|_L|_X,   _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L,		/* 100-103 */
_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 104-107 */
_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 108-111 */
_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 112-115 */
_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 116-119 */
_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_P,		/* 120-123 */
_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_C,		/* 124-127 */
_W,            _W,           _W,           _W,			/* 128-131 */
_W,            _W,           _W,           _W,			/* 132-135 */
_W,            _W,           _W,           _W,			/* 136-139 */
_W,            _W,           _W,           _W,			/* 140-143 */
_W,            _W,           _W,           _W,			/* 144-147 */
_W,            _W,           _W,           _W,			/* 148-151 */
_W,            _W,           _W,           _W,			/* 152-155 */
_W,            _W,           _W,           _W,			/* 156-159 */
_W|_S|_SP,     _W|_P,        _W|_P,        _W|_P,		/* 160-163 */
_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 164-167 */
_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 168-171 */
_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 172-175 */
_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 176-179 */
_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 180-183 */
_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 184-187 */
_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 188-191 */
_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 192-195 */
_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 196-199 */
_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 200-203 */
_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 204-207 */
_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 208-211 */
_W|_U,         _W|_U,        _W|_U,        _W|_P,		/* 212-215 */
_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 216-219 */
_W|_U,         _W|_U,        _W|_U,        _W|_L,		/* 220-223 */
_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 224-227 */
_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 228-231 */
_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 232-235 */
_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 236-239 */
_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 240-243 */
_W|_L,         _W|_L,        _W|_L,        _W|_P,		/* 244-247 */
_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 248-251 */
_W|_L,         _W|_L,        _W|_L,        _W|_L};		/* 252-255 */

/* Test one octet against one token.
 *
 * A non-zero type is a class mask checked against token_lookup_tbl; a zero
 * type means "specific value" and compares the octet with t->value.
 */
static inline int match_token(struct ts_fsm_token *t, u8 d)
{
	if (t->type)
		return (token_lookup_tbl[d] & t->type) != 0;
	else
		return t->value == d;
}

/* fsm_find - run the token chain over the text supplied by the caller
 *
 * Text is pulled block by block through conf->get_next_block(), starting
 * at state->offset.  'consumed' counts octets of all previous blocks and
 * 'block_idx' is the position inside the current block.
 *
 * Returns the offset where the match starts and stores the offset just
 * past the match in state->offset, or returns UINT_MAX when there is no
 * match.
 */
static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
{
	struct ts_fsm *fsm = ts_config_priv(conf);
	struct ts_fsm_token *cur = NULL, *next;
	unsigned int match_start, block_idx = 0, tok_idx;
	unsigned block_len = 0, strict, consumed = state->offset;
	const u8 *data;

/* Account for the finished block and fetch the next; evaluates to its
 * length.
 */
#define GET_NEXT_BLOCK()		\
({	consumed += block_idx;		\
	block_idx = 0;			\
	block_len = conf->get_next_block(consumed, &data, conf, state); })

/* Strict mode gives up on a mismatch; otherwise retry one octet further */
#define TOKEN_MISMATCH()		\
	do {				\
		if (strict)		\
			goto no_match;	\
		block_idx++;		\
		goto startover;		\
	} while(0)

/* True once the current block is exhausted and no further block exists.
 * Has the side effect of loading the next block when one is available.
 */
#define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())

	if (end_of_data())
		goto no_match;

	strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;

startover:
	match_start = consumed + block_idx;

	for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
		cur = &fsm->tokens[tok_idx];

		if (likely(tok_idx < (fsm->ntokens - 1)))
			next = &fsm->tokens[tok_idx + 1];
		else
			next = NULL;

		switch (cur->recur) {
		case TS_FSM_SINGLE:
			if (end_of_data())
				goto no_match;

			if (!match_token(cur, data[block_idx]))
				TOKEN_MISMATCH();
			break;

		case TS_FSM_PERHAPS:
			/* Optional token: skip it without consuming input */
			if (end_of_data() ||
			    !match_token(cur, data[block_idx]))
				continue;
			break;

		case TS_FSM_MULTI:
			/* One mandatory occurrence, then behave like ANY */
			if (end_of_data())
				goto no_match;

			if (!match_token(cur, data[block_idx]))
				TOKEN_MISMATCH();

			block_idx++;
			fallthrough;

		case TS_FSM_ANY:
			if (next == NULL)
				goto found_match;

			if (end_of_data())
				continue;

			/* Consume octets matching cur until next matches */
			while (!match_token(next, data[block_idx])) {
				if (!match_token(cur, data[block_idx]))
					TOKEN_MISMATCH();
				block_idx++;
				if (end_of_data())
					goto no_match;
			}
			continue;

		/*
		 * Optimization: Prefer small local loop over jumping
		 * back and forth until garbage at head is munched.
		 */
		case TS_FSM_HEAD_IGNORE:
			if (end_of_data())
				continue;

			while (!match_token(next, data[block_idx])) {
				/*
				 * Special case, don't start over upon
				 * a mismatch, give the user the
				 * chance to specify the type of data
				 * allowed to be ignored.
				 */
				if (!match_token(cur, data[block_idx]))
					goto no_match;

				block_idx++;
				if (end_of_data())
					goto no_match;
			}

			match_start = consumed + block_idx;
			continue;
		}

		block_idx++;
	}

	if (end_of_data())
		goto found_match;

no_match:
	return UINT_MAX;

found_match:
	state->offset = consumed + block_idx;
	return match_start;
}

/* fsm_init - validate a token array and build the search configuration
 *
 * @pattern is an array of struct ts_fsm_token, @len its size in bytes.
 * Returns ERR_PTR(-EINVAL) when @len is not a non-zero multiple of the
 * token size, when TS_IGNORECASE is requested, when a token has an
 * out-of-range type or recurrence, or when TS_FSM_HEAD_IGNORE appears
 * anywhere but first or is the only/last token.  Otherwise the tokens are
 * copied into the private area and their types replaced by the masks from
 * token_map.
 */
static struct ts_config *fsm_init(const void *pattern, unsigned int len,
				    gfp_t gfp_mask, int flags)
{
	int i, err = -EINVAL;
	struct ts_config *conf;
	struct ts_fsm *fsm;
	struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern;
	unsigned int ntokens = len / sizeof(*tokens);
	size_t priv_size = sizeof(*fsm) + len;

	if (len  % sizeof(struct ts_fsm_token) || ntokens < 1)
		goto errout;

	if (flags & TS_IGNORECASE)
		goto errout;

	for (i = 0; i < ntokens; i++) {
		struct ts_fsm_token *t = &tokens[i];

		if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
			goto errout;

		/* HEAD_IGNORE needs a following token and must come first */
		if (t->recur == TS_FSM_HEAD_IGNORE &&
		    (i != 0 || i == (ntokens - 1)))
			goto errout;
	}

	conf = alloc_ts_config(priv_size, gfp_mask);
	if (IS_ERR(conf))
		return conf;

	conf->flags = flags;
	fsm = ts_config_priv(conf);
	fsm->ntokens = ntokens;
	memcpy(fsm->tokens, pattern, len);

	/* Translate the types of the private copy only */
	for (i = 0; i < fsm->ntokens; i++) {
		struct ts_fsm_token *t = &fsm->tokens[i];
		t->type = token_map[t->type];
	}

	return conf;

errout:
	return ERR_PTR(err);
}

/* Return the stored token array (types already mapped by fsm_init()) */
static void *fsm_get_pattern(struct ts_config *conf)
{
	struct ts_fsm *fsm = ts_config_priv(conf);
	return fsm->tokens;
}

/* Return the size of the stored token array in bytes */
static unsigned int fsm_get_pattern_len(struct ts_config *conf)
{
	struct ts_fsm *fsm = ts_config_priv(conf);
	return fsm->ntokens * sizeof(struct ts_fsm_token);
}

/* Operations registered with the textsearch core under the name "fsm" */
static struct ts_ops fsm_ops = {
	.name		  = "fsm",
	.find		  = fsm_find,
	.init		  = fsm_init,
	.get_pattern	  = fsm_get_pattern,
	.get_pattern_len  = fsm_get_pattern_len,
	.owner		  = THIS_MODULE,
	.list		  = LIST_HEAD_INIT(fsm_ops.list)
};

/* Module load: register the algorithm */
static int __init init_fsm(void)
{
	return textsearch_register(&fsm_ops);
}

/* Module unload: unregister the algorithm */
static void __exit exit_fsm(void)
{
	textsearch_unregister(&fsm_ops);
}

MODULE_DESCRIPTION("naive finite state machine text search");
MODULE_LICENSE("GPL");

module_init(init_fsm);
module_exit(exit_fsm);
553 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET Generic infrastructure for Network protocols. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h */ #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/tcp.h> #include <linux/vmalloc.h> #include <net/request_sock.h> /* * Maximum number of SYN_RECV sockets in queue per LISTEN socket. * One SYN_RECV socket costs about 80bytes on a 32bit machine. * It would be better to replace it with a global counter for all sockets * but then some measure against one socket starving all other sockets * would be needed. * * The minimum value of it is 128. Experiments with real servers show that * it is absolutely not enough even at 100conn/sec. 256 cures most * of problems. * This value is adjusted to 128 for low memory machines, * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ void reqsk_queue_alloc(struct request_sock_queue *queue) { queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; queue->rskq_accept_head = NULL; } /* * This function is called to set a Fast Open socket's "fastopen_rsk" field * to NULL when a TFO socket no longer needs to access the request_sock. * This happens only after 3WHS has been either completed or aborted (e.g., * RST is received). 
* * Before TFO, a child socket is created only after 3WHS is completed, * hence it never needs to access the request_sock. things get a lot more * complex with TFO. A child socket, accepted or not, has to access its * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, * until 3WHS is either completed or aborted. Afterwards the req will stay * until either the child socket is accepted, or in the rare case when the * listener is closed before the child is accepted. * * In short, a request socket is only freed after BOTH 3WHS has completed * (or aborted) and the child socket has been accepted (or listener closed). * When a child socket is accepted, its corresponding req->sk is set to * NULL since it's no longer needed. More importantly, "req->sk == NULL" * will be used by the code below to determine if a child socket has been * accepted or not, and the check is protected by the fastopenq->lock * described below. * * Note that fastopen_rsk is only accessed from the child socket's context * with its socket lock held. But a request_sock (req) can be accessed by * both its child socket through fastopen_rsk, and a listener socket through * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. * only in the rare case when both the listener and the child locks are held, * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. * The lock also protects other fields such as fastopenq->qlen, which is * decremented by this function when fastopen_rsk is no longer needed. * * Note that another solution was to simply use the existing socket lock * from the listener. But first socket lock is difficult to use. It is not * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. 
E.g., inet_csk_listen_stop() may try to * acquire a child's lock while holding listener's socket lock. A corner * case might also exist in tcp_v4_hnd_req() that will trigger this locking * order. * * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. */ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; if (!reset || lsk->sk_state != TCP_LISTEN) { /* If the listener has been closed don't bother with the * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); reqsk_put(req); return; } /* Wait for 60secs before removing a req that has triggered RST. * This is a simple defense against TFO spoofing attack - by * counting the req against fastopen.max_qlen, and disabling * TFO when the qlen exceeds max_qlen. * * For more details see CoNext'11 "TCP Fast Open" paper. */ req->rsk_timer.expires = jiffies + 60*HZ; if (fastopenq->rskq_rst_head == NULL) fastopenq->rskq_rst_head = req; else fastopenq->rskq_rst_tail->dl_next = req; req->dl_next = NULL; fastopenq->rskq_rst_tail = req; fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); }
268 80 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Symmetric key ciphers. * * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_SKCIPHER_H #define _CRYPTO_INTERNAL_SKCIPHER_H #include <crypto/algapi.h> #include <crypto/internal/cipher.h> #include <crypto/scatterwalk.h> #include <crypto/skcipher.h> #include <linux/types.h> /* * Set this if your algorithm is sync but needs a reqsize larger * than MAX_SYNC_SKCIPHER_REQSIZE. * * Reuse bit that is specific to hash algorithms. 
*/ #define CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE CRYPTO_ALG_OPTIONAL_KEY struct aead_request; struct rtattr; struct skcipher_instance { void (*free)(struct skcipher_instance *inst); union { struct { char head[offsetof(struct skcipher_alg, base)]; struct crypto_instance base; } s; struct skcipher_alg alg; }; }; struct lskcipher_instance { void (*free)(struct lskcipher_instance *inst); union { struct { char head[offsetof(struct lskcipher_alg, co.base)]; struct crypto_instance base; } s; struct lskcipher_alg alg; }; }; struct crypto_skcipher_spawn { struct crypto_spawn base; }; struct crypto_lskcipher_spawn { struct crypto_spawn base; }; static inline struct crypto_instance *skcipher_crypto_instance( struct skcipher_instance *inst) { return &inst->s.base; } static inline struct crypto_instance *lskcipher_crypto_instance( struct lskcipher_instance *inst) { return &inst->s.base; } static inline struct skcipher_instance *skcipher_alg_instance( struct crypto_skcipher *skcipher) { return container_of(crypto_skcipher_alg(skcipher), struct skcipher_instance, alg); } static inline struct lskcipher_instance *lskcipher_alg_instance( struct crypto_lskcipher *lskcipher) { return container_of(crypto_lskcipher_alg(lskcipher), struct lskcipher_instance, alg); } static inline void *skcipher_instance_ctx(struct skcipher_instance *inst) { return crypto_instance_ctx(skcipher_crypto_instance(inst)); } static inline void *lskcipher_instance_ctx(struct lskcipher_instance *inst) { return crypto_instance_ctx(lskcipher_crypto_instance(inst)); } static inline void skcipher_request_complete(struct skcipher_request *req, int err) { crypto_request_complete(&req->base, err); } int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); int crypto_grab_lskcipher(struct crypto_lskcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_skcipher(struct 
crypto_skcipher_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline void crypto_drop_lskcipher(struct crypto_lskcipher_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct lskcipher_alg *crypto_lskcipher_spawn_alg( struct crypto_lskcipher_spawn *spawn) { return container_of(spawn->base.alg, struct lskcipher_alg, co.base); } static inline struct skcipher_alg_common *crypto_spawn_skcipher_alg_common( struct crypto_skcipher_spawn *spawn) { return container_of(spawn->base.alg, struct skcipher_alg_common, base); } static inline struct lskcipher_alg *crypto_spawn_lskcipher_alg( struct crypto_lskcipher_spawn *spawn) { return crypto_lskcipher_spawn_alg(spawn); } static inline struct crypto_skcipher *crypto_spawn_skcipher( struct crypto_skcipher_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline struct crypto_lskcipher *crypto_spawn_lskcipher( struct crypto_lskcipher_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline void crypto_skcipher_set_reqsize( struct crypto_skcipher *skcipher, unsigned int reqsize) { skcipher->reqsize = reqsize; } static inline void crypto_skcipher_set_reqsize_dma( struct crypto_skcipher *skcipher, unsigned int reqsize) { reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); skcipher->reqsize = reqsize; } int crypto_register_skcipher(struct skcipher_alg *alg); void crypto_unregister_skcipher(struct skcipher_alg *alg); int crypto_register_skciphers(struct skcipher_alg *algs, int count); void crypto_unregister_skciphers(struct skcipher_alg *algs, int count); int skcipher_register_instance(struct crypto_template *tmpl, struct skcipher_instance *inst); int crypto_register_lskcipher(struct lskcipher_alg *alg); void crypto_unregister_lskcipher(struct lskcipher_alg *alg); int crypto_register_lskciphers(struct lskcipher_alg *algs, int count); void crypto_unregister_lskciphers(struct lskcipher_alg *algs, int count); int lskcipher_register_instance(struct crypto_template 
*tmpl, struct lskcipher_instance *inst); int skcipher_walk_virt(struct skcipher_walk *__restrict walk, struct skcipher_request *__restrict req, bool atomic); int skcipher_walk_aead_encrypt(struct skcipher_walk *__restrict walk, struct aead_request *__restrict req, bool atomic); int skcipher_walk_aead_decrypt(struct skcipher_walk *__restrict walk, struct aead_request *__restrict req, bool atomic); static inline void *crypto_skcipher_ctx(struct crypto_skcipher *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline void *crypto_lskcipher_ctx(struct crypto_lskcipher *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline void *crypto_skcipher_ctx_dma(struct crypto_skcipher *tfm) { return crypto_tfm_ctx_dma(&tfm->base); } static inline void *skcipher_request_ctx(struct skcipher_request *req) { return req->__ctx; } static inline void *skcipher_request_ctx_dma(struct skcipher_request *req) { unsigned int align = crypto_dma_align(); if (align <= crypto_tfm_ctx_alignment()) align = 1; return PTR_ALIGN(skcipher_request_ctx(req), align); } static inline u32 skcipher_request_flags(struct skcipher_request *req) { return req->base.flags; } /* Helpers for simple block cipher modes of operation */ struct skcipher_ctx_simple { struct crypto_cipher *cipher; /* underlying block cipher */ }; static inline struct crypto_cipher * skcipher_cipher_simple(struct crypto_skcipher *tfm) { struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); return ctx->cipher; } struct skcipher_instance *skcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb); static inline struct crypto_alg *skcipher_ialg_simple( struct skcipher_instance *inst) { struct crypto_cipher_spawn *spawn = skcipher_instance_ctx(inst); return crypto_spawn_cipher_alg(spawn); } static inline struct crypto_lskcipher *lskcipher_cipher_simple( struct crypto_lskcipher *tfm) { struct crypto_lskcipher **ctx = crypto_lskcipher_ctx(tfm); return *ctx; } struct lskcipher_instance 
*lskcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb); static inline struct lskcipher_alg *lskcipher_ialg_simple( struct lskcipher_instance *inst) { struct crypto_lskcipher_spawn *spawn = lskcipher_instance_ctx(inst); return crypto_lskcipher_spawn_alg(spawn); } #endif /* _CRYPTO_INTERNAL_SKCIPHER_H */
1467 1468 1467 1468 3007 3008 2446 2444 2447 2446 2445 2447 3015 3016 2447 3014 1468 1466 1469 1469 3002 3003 2439 2437 2441 2270 611 3015 3014 3014 2442 3016 1469 1390 87 1398 1398 41 41 41 1151 3084 3083 1386 1388 3115 2430 1398 1398 1398 2431 2431 2432 17 17 498 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 
461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 
961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 
1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 
1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 
2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 // SPDX-License-Identifier: GPL-2.0-or-later /* * net-sysfs.c - network device class and attributes * * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/sched/isolation.h> #include <linux/nsproxy.h> #include <net/sock.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/vmalloc.h> #include <linux/export.h> #include <linux/jiffies.h> #include <linux/pm_runtime.h> #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/rps.h> #include "dev.h" #include "net-sysfs.h" #ifdef CONFIG_SYSFS static const char fmt_hex[] = 
"%#x\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_uint[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; /* Caller holds RTNL, netdev->lock or RCU */ static inline int dev_isalive(const struct net_device *dev) { return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } /* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, * when unregistering a net device and accessing associated sysfs files. The * potential deadlock is as follow: * * CPU 0 CPU 1 * * rtnl_lock vfs_read * unregister_netdevice_many kernfs_seq_start * device_del / kobject_put kernfs_get_active (kn->active++) * kernfs_drain sysfs_kf_seq_show * wait_event( rtnl_lock * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release * -> waits on CPU 1 to decrease kn->active the rtnl lock. * * The historical fix was to use rtnl_trylock with restart_syscall to bail out * of sysfs operations when the lock couldn't be taken. This fixed the above * issue as it allowed CPU 1 to bail out of the ABBA situation. * * But it came with performances issues, as syscalls are being restarted in * loops when there was contention on the rtnl lock, with huge slow downs in * specific scenarios (e.g. lots of virtual interfaces created and userspace * daemons querying their attributes). * * The idea below is to bail out of the active kernfs_node protection * (kn->active) while trying to take the rtnl lock. * * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The * net device is guaranteed to be alive if this returns successfully. */ static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, struct net_device *ndev) { struct kernfs_node *kn; int ret = 0; /* First, we hold a reference to the net device as the unregistration * path might run in parallel. This will ensure the net device and the * associated sysfs objects won't be freed while we try to take the rtnl * lock. 
*/ dev_hold(ndev); /* sysfs_break_active_protection was introduced to allow self-removal of * devices and their associated sysfs files by bailing out of the * sysfs/kernfs protection. We do this here to allow the unregistration * path to complete in parallel. The following takes a reference on the * kobject and the kernfs_node being accessed. * * This works because we hold a reference onto the net device and the * unregistration path will wait for us eventually in netdev_run_todo * (outside an rtnl lock section). */ kn = sysfs_break_active_protection(kobj, attr); /* We can now try to take the rtnl lock. This can't deadlock us as the * unregistration path is able to drain sysfs files (kernfs_node) thanks * to the above dance. */ if (rtnl_lock_interruptible()) { ret = -ERESTARTSYS; goto unbreak; } /* Check dismantle on the device hasn't started, otherwise deny the * operation. */ if (!dev_isalive(ndev)) { rtnl_unlock(); ret = -ENODEV; goto unbreak; } /* We are now sure the device dismantle hasn't started nor that it can * start before we exit the locking section as we hold the rtnl lock. * There's no need to keep unbreaking the sysfs protection nor to hold * a net device reference from that point; that was only needed to take * the rtnl lock. 
*/ unbreak: sysfs_unbreak_active_protection(kn); dev_put(ndev); return ret; } /* use same locking rules as GIF* ioctl's */ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, ssize_t (*format)(const struct net_device *, char *)) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; rcu_read_lock(); if (dev_isalive(ndev)) ret = (*format)(ndev, buf); rcu_read_unlock(); return ret; } /* generate a show function for simple field */ #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ return netdev_show(dev, attr, buf, format_##field); \ } \ #define NETDEVICE_SHOW_RO(field, format_string) \ NETDEVICE_SHOW(field, format_string); \ static DEVICE_ATTR_RO(field) #define NETDEVICE_SHOW_RW(field, format_string) \ NETDEVICE_SHOW(field, format_string); \ static DEVICE_ATTR_RW(field) /* use same locking and permission rules as SIF* ioctl's */ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len, int (*set)(struct net_device *, unsigned long)) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); unsigned long new; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = kstrtoul(buf, 0, &new); if (ret) goto err; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) goto err; ret = (*set)(netdev, new); if (ret == 0) ret = len; rtnl_unlock(); err: return ret; } /* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */ static ssize_t netdev_lock_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len, int (*set)(struct net_device *, unsigned long)) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); unsigned 
long new; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = kstrtoul(buf, 0, &new); if (ret) return ret; netdev_lock(netdev); if (dev_isalive(netdev)) { ret = (*set)(netdev, new); if (ret == 0) ret = len; } netdev_unlock(netdev); return ret; } NETDEVICE_SHOW_RO(dev_id, fmt_hex); NETDEVICE_SHOW_RO(dev_port, fmt_dec); NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); NETDEVICE_SHOW_RO(addr_len, fmt_dec); NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev)); } static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); } static ssize_t name_assign_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; } static DEVICE_ATTR_RO(name_assign_type); /* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; down_read(&dev_addr_sem); rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); rcu_read_unlock(); up_read(&dev_addr_sem); return ret; } static DEVICE_ATTR_RO(address); static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); int ret = -EINVAL; rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); rcu_read_unlock(); return ret; } static 
DEVICE_ATTR_RO(broadcast);

/* netdev_store() callback for "carrier": rejects a device that is not
 * running with -EINVAL, otherwise forwards the value (as bool) to
 * dev_change_carrier().
 */
static int change_carrier(struct net_device *dev, unsigned long new_carrier)
{
	if (!netif_running(dev))
		return -EINVAL;
	return dev_change_carrier(dev, (bool)new_carrier);
}

/* "carrier" write handler: -EOPNOTSUPP when the driver has no
 * ndo_change_carrier, otherwise the generic netdev_store() path.
 */
static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
			     const char *buf, size_t len)
{
	struct net_device *netdev = to_net_dev(dev);

	/* The check is also done in change_carrier; this helps returning early
	 * without hitting the locking section in netdev_store.
	 */
	if (!netdev->netdev_ops->ndo_change_carrier)
		return -EOPNOTSUPP;

	return netdev_store(dev, attr, buf, len, change_carrier);
}

/* "carrier" read handler: between sysfs_rtnl_lock() and rtnl_unlock(),
 * emits 0/1 from netif_carrier_ok() for a running device; -EINVAL when
 * the device is not running.
 */
static ssize_t carrier_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	int ret;

	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
	if (ret)
		return ret;

	ret = -EINVAL;
	if (netif_running(netdev)) {
		/* Synchronize carrier state with link watch,
		 * see also rtnl_getlink().
		 */
		linkwatch_sync_dev(netdev);

		ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
	}

	rtnl_unlock();
	return ret;
}
static DEVICE_ATTR_RW(carrier);

/* "speed": emits cmd.base.speed from __ethtool_get_link_ksettings() for a
 * running device. -EINVAL when the ethtool op is missing, the device is
 * not running, or the query fails.
 */
static ssize_t speed_show(struct device *dev,
			  struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	int ret = -EINVAL;

	/* The check is also done in __ethtool_get_link_ksettings; this helps
	 * returning early without hitting the locking section below.
	 */
	if (!netdev->ethtool_ops->get_link_ksettings)
		return ret;

	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
	if (ret)
		return ret;

	/* sysfs_rtnl_lock() left 0 in ret; restore the default error */
	ret = -EINVAL;
	if (netif_running(netdev)) {
		struct ethtool_link_ksettings cmd;

		if (!__ethtool_get_link_ksettings(netdev, &cmd))
			ret = sysfs_emit(buf, fmt_dec, cmd.base.speed);
	}
	rtnl_unlock();
	return ret;
}
static DEVICE_ATTR_RO(speed);

/* "duplex": same flow as speed_show(), but emits "half", "full" or
 * "unknown" according to cmd.base.duplex.
 */
static ssize_t duplex_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	int ret = -EINVAL;

	/* The check is also done in __ethtool_get_link_ksettings; this helps
	 * returning early without hitting the locking section below.
	 */
	if (!netdev->ethtool_ops->get_link_ksettings)
		return ret;

	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
	if (ret)
		return ret;

	/* sysfs_rtnl_lock() left 0 in ret; restore the default error */
	ret = -EINVAL;
	if (netif_running(netdev)) {
		struct ethtool_link_ksettings cmd;

		if (!__ethtool_get_link_ksettings(netdev, &cmd)) {
			const char *duplex;

			switch (cmd.base.duplex) {
			case DUPLEX_HALF:
				duplex = "half";
				break;
			case DUPLEX_FULL:
				duplex = "full";
				break;
			default:
				duplex = "unknown";
				break;
			}
			ret = sysfs_emit(buf, "%s\n", duplex);
		}
	}
	rtnl_unlock();
	return ret;
}
static DEVICE_ATTR_RO(duplex);

/* "testing": 0/1 from netif_testing() for a running device, else -EINVAL */
static ssize_t testing_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);

	if (netif_running(netdev))
		return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev));

	return -EINVAL;
}
static DEVICE_ATTR_RO(testing);

/* "dormant": 0/1 from netif_dormant() for a running device, else -EINVAL */
static ssize_t dormant_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);

	if (netif_running(netdev))
		return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev));

	return -EINVAL;
}
static DEVICE_ATTR_RO(dormant);

/* Strings for "operstate", indexed by the operstate value */
static const char *const operstates[] = {
	"unknown",
	"notpresent", /* currently unused */
	"down",
	"lowerlayerdown",
	"testing",
	"dormant",
	"up"
};

/* "operstate": emits the operstates[] string for the device's operstate;
 * a device that is not running is reported as IF_OPER_DOWN.
 */
static ssize_t operstate_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	const struct
net_device *netdev = to_net_dev(dev);
	unsigned char operstate;

	operstate = READ_ONCE(netdev->operstate);
	/* A device that is not running is always reported as down */
	if (!netif_running(netdev))
		operstate = IF_OPER_DOWN;

	if (operstate >= ARRAY_SIZE(operstates))
		return -EINVAL; /* should not happen */

	return sysfs_emit(buf, "%s\n", operstates[operstate]);
}
static DEVICE_ATTR_RO(operstate);

/* "carrier_changes": sum of the carrier up and carrier down counters */
static ssize_t carrier_changes_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	struct net_device *netdev = to_net_dev(dev);

	return sysfs_emit(buf, fmt_dec,
			  atomic_read(&netdev->carrier_up_count) +
			  atomic_read(&netdev->carrier_down_count));
}
static DEVICE_ATTR_RO(carrier_changes);

/* "carrier_up_count": current value of the carrier_up_count counter */
static ssize_t carrier_up_count_show(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	struct net_device *netdev = to_net_dev(dev);

	return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
}
static DEVICE_ATTR_RO(carrier_up_count);

/* "carrier_down_count": current value of the carrier_down_count counter */
static ssize_t carrier_down_count_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	struct net_device *netdev = to_net_dev(dev);

	return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
}
static DEVICE_ATTR_RO(carrier_down_count);

/* read-write attributes */

/* netdev_store() callback for "mtu"; the value is narrowed to int */
static int change_mtu(struct net_device *dev, unsigned long new_mtu)
{
	return dev_set_mtu(dev, (int)new_mtu);
}

static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
			 const char *buf, size_t len)
{
	return netdev_store(dev, attr, buf, len, change_mtu);
}
NETDEVICE_SHOW_RW(mtu, fmt_dec);

/* netdev_store() callback for "flags"; the value is narrowed to
 * unsigned int and no extack is passed.
 */
static int change_flags(struct net_device *dev, unsigned long new_flags)
{
	return dev_change_flags(dev, (unsigned int)new_flags, NULL);
}

static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t len)
{
	return netdev_store(dev, attr, buf, len, change_flags);
}
NETDEVICE_SHOW_RW(flags, fmt_hex);

/* "tx_queue_len" write handler: checks capable(CAP_NET_ADMIN) here, in
 * addition to the namespace-scoped check done inside netdev_store().
 */
static ssize_t tx_queue_len_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t len)
{
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len);
}
NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);

/* netdev_lock_store() callback for "gro_flush_timeout"; always succeeds */
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
	netdev_set_gro_flush_timeout(dev, val);
	return 0;
}

static ssize_t gro_flush_timeout_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t len)
{
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout);
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);

/* netdev_lock_store() callback for "napi_defer_hard_irqs": values above
 * S32_MAX are rejected with -ERANGE.
 */
static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
{
	if (val > S32_MAX)
		return -ERANGE;

	netdev_set_defer_hard_irqs(dev, (u32)val);
	return 0;
}

static ssize_t napi_defer_hard_irqs_store(struct device *dev,
					  struct device_attribute *attr,
					  const char *buf, size_t len)
{
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	return netdev_lock_store(dev, attr, buf, len,
				 change_napi_defer_hard_irqs);
}
NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint);

/* "ifalias" write handler: requires CAP_NET_ADMIN in the device's user
 * namespace, strips one trailing newline, and sets the alias between
 * sysfs_rtnl_lock() and rtnl_unlock(). Returns @len on success (the full
 * write length, including any stripped newline).
 */
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
			     const char *buf, size_t len)
{
	struct net_device *netdev = to_net_dev(dev);
	struct net *net = dev_net(netdev);
	size_t count = len;
	ssize_t ret;

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* ignore trailing newline */
	if (len > 0 && buf[len - 1] == '\n')
		--count;

	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
	if (ret)
		return ret;

	ret = dev_set_alias(netdev, buf, count);
	if (ret < 0)
		goto err;
	ret = len;
	netdev_state_change(netdev);
err:
	rtnl_unlock();

	return ret;
}

/* "ifalias" read handler: copies the alias into a local buffer via
 * dev_get_alias() and emits it followed by a newline when the returned
 * length is positive; otherwise the dev_get_alias() result is returned
 * as is.
 */
static ssize_t ifalias_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	const struct net_device *netdev = to_net_dev(dev);
	char tmp[IFALIASZ];
	ssize_t ret;

	ret = dev_get_alias(netdev, tmp, sizeof(tmp));
	if (ret > 0)
		ret = sysfs_emit(buf, "%s\n", tmp);
	return ret;
}
static DEVICE_ATTR_RW(ifalias);

/* netdev_store() callback for "netdev_group"; always succeeds */
static int change_group(struct net_device *dev, unsigned long new_group)
{
	dev_set_group(dev,
(int)new_group);
	return 0;
}

static ssize_t group_store(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t len)
{
	return netdev_store(dev, attr, buf, len, change_group);
}
/* The field is called "group" but the sysfs file is "netdev_group", so the
 * attribute is declared by hand instead of via NETDEVICE_SHOW_RW().
 */
NETDEVICE_SHOW(group, fmt_dec);
static DEVICE_ATTR(netdev_group, 0644, group_show, group_store);

/* netdev_store() callback for "proto_down"; the value is used as a bool */
static int change_proto_down(struct net_device *dev, unsigned long proto_down)
{
	return dev_change_proto_down(dev, (bool)proto_down);
}

static ssize_t proto_down_store(struct device *dev,
				struct device_attribute *attr,
				const char *buf, size_t len)
{
	return netdev_store(dev, attr, buf, len, change_proto_down);
}
NETDEVICE_SHOW_RW(proto_down, fmt_dec);

/* "phys_port_id": emits the id bytes as hex with no separator.
 * -EOPNOTSUPP when the driver lacks ndo_get_phys_port_id; otherwise any
 * error from locking or from dev_get_phys_port_id() is returned.
 */
static ssize_t phys_port_id_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	struct netdev_phys_item_id ppid;
	ssize_t ret;

	/* The check is also done in dev_get_phys_port_id; this helps returning
	 * early without hitting the locking section below.
	 */
	if (!netdev->netdev_ops->ndo_get_phys_port_id)
		return -EOPNOTSUPP;

	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
	if (ret)
		return ret;

	ret = dev_get_phys_port_id(netdev, &ppid);
	if (!ret)
		ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);

	rtnl_unlock();

	return ret;
}
static DEVICE_ATTR_RO(phys_port_id);

/* "phys_port_name": emits the name obtained from dev_get_phys_port_name().
 * -EOPNOTSUPP when neither ndo_get_phys_port_name nor a devlink port is
 * available.
 */
static ssize_t phys_port_name_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	char name[IFNAMSIZ];
	ssize_t ret;

	/* The checks are also done in dev_get_phys_port_name; this helps
	 * returning early without hitting the locking section below.
	 */
	if (!netdev->netdev_ops->ndo_get_phys_port_name &&
	    !netdev->devlink_port)
		return -EOPNOTSUPP;

	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
	if (ret)
		return ret;

	ret = dev_get_phys_port_name(netdev, name, sizeof(name));
	if (!ret)
		ret = sysfs_emit(buf, "%s\n", name);

	rtnl_unlock();

	return ret;
}
static DEVICE_ATTR_RO(phys_port_name);

/* "phys_switch_id": emits the parent id bytes as hex with no separator.
 * -EOPNOTSUPP when neither ndo_get_port_parent_id nor a devlink port is
 * available.
 */
static ssize_t phys_switch_id_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	struct netdev_phys_item_id ppid = { };
	ssize_t ret;

	/* The checks are also done in dev_get_port_parent_id; this helps
	 * returning early without hitting the locking section below. This works
	 * because recurse is false when calling dev_get_port_parent_id.
	 */
	if (!netdev->netdev_ops->ndo_get_port_parent_id &&
	    !netdev->devlink_port)
		return -EOPNOTSUPP;

	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
	if (ret)
		return ret;

	ret = dev_get_port_parent_id(netdev, &ppid, false);
	if (!ret)
		ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);

	rtnl_unlock();

	return ret;
}
static DEVICE_ATTR_RO(phys_switch_id);

/* "threaded" read handler: emits netdev->threaded under rcu_read_lock();
 * -EINVAL if the device is not alive.
 */
static ssize_t threaded_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	ssize_t ret = -EINVAL;

	rcu_read_lock();

	if (dev_isalive(netdev))
		ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded));

	rcu_read_unlock();

	return ret;
}

/* netdev_lock_store() callback for "threaded": -EOPNOTSUPP when the device
 * has no NAPI instances or when @val is neither 0 nor 1.
 */
static int modify_napi_threaded(struct net_device *dev, unsigned long val)
{
	int ret;

	if (list_empty(&dev->napi_list))
		return -EOPNOTSUPP;

	if (val != 0 && val != 1)
		return -EOPNOTSUPP;

	ret = dev_set_threaded(dev, val);

	return ret;
}

static ssize_t threaded_store(struct device *dev,
			      struct device_attribute *attr,
			      const char *buf, size_t len)
{
	return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded);
}
static DEVICE_ATTR_RW(threaded);

/* Attributes of the "net_class" group (see ATTRIBUTE_GROUPS(net_class)) */
static struct attribute *net_class_attrs[] __ro_after_init = {
	&dev_attr_netdev_group.attr,
	&dev_attr_type.attr,
	&dev_attr_dev_id.attr,
&dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, &dev_attr_name_assign_type.attr, &dev_attr_addr_assign_type.attr, &dev_attr_addr_len.attr, &dev_attr_link_mode.attr, &dev_attr_address.attr, &dev_attr_broadcast.attr, &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, &dev_attr_carrier.attr, &dev_attr_mtu.attr, &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, &dev_attr_threaded.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, struct device_attribute *attr, char *buf, unsigned long offset) { struct net_device *dev = to_net_dev(d); ssize_t ret = -EINVAL; WARN_ON(offset > sizeof(struct rtnl_link_stats64) || offset % sizeof(u64) != 0); rcu_read_lock(); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } rcu_read_unlock(); return ret; } /* generate a read-only statistics attribute */ #define NETSTAT_ENTRY(name) \ static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ offsetof(struct rtnl_link_stats64, name)); \ } \ static DEVICE_ATTR_RO(name) NETSTAT_ENTRY(rx_packets); NETSTAT_ENTRY(tx_packets); NETSTAT_ENTRY(rx_bytes); NETSTAT_ENTRY(tx_bytes); NETSTAT_ENTRY(rx_errors); NETSTAT_ENTRY(tx_errors); NETSTAT_ENTRY(rx_dropped); NETSTAT_ENTRY(tx_dropped); NETSTAT_ENTRY(multicast); NETSTAT_ENTRY(collisions); NETSTAT_ENTRY(rx_length_errors); NETSTAT_ENTRY(rx_over_errors); 
NETSTAT_ENTRY(rx_crc_errors);
NETSTAT_ENTRY(rx_frame_errors);
NETSTAT_ENTRY(rx_fifo_errors);
NETSTAT_ENTRY(rx_missed_errors);
NETSTAT_ENTRY(tx_aborted_errors);
NETSTAT_ENTRY(tx_carrier_errors);
NETSTAT_ENTRY(tx_fifo_errors);
NETSTAT_ENTRY(tx_heartbeat_errors);
NETSTAT_ENTRY(tx_window_errors);
NETSTAT_ENTRY(rx_compressed);
NETSTAT_ENTRY(tx_compressed);
NETSTAT_ENTRY(rx_nohandler);

/* One entry per NETSTAT_ENTRY() counter; members of the "statistics" group */
static struct attribute *netstat_attrs[] __ro_after_init = {
	&dev_attr_rx_packets.attr,
	&dev_attr_tx_packets.attr,
	&dev_attr_rx_bytes.attr,
	&dev_attr_tx_bytes.attr,
	&dev_attr_rx_errors.attr,
	&dev_attr_tx_errors.attr,
	&dev_attr_rx_dropped.attr,
	&dev_attr_tx_dropped.attr,
	&dev_attr_multicast.attr,
	&dev_attr_collisions.attr,
	&dev_attr_rx_length_errors.attr,
	&dev_attr_rx_over_errors.attr,
	&dev_attr_rx_crc_errors.attr,
	&dev_attr_rx_frame_errors.attr,
	&dev_attr_rx_fifo_errors.attr,
	&dev_attr_rx_missed_errors.attr,
	&dev_attr_tx_aborted_errors.attr,
	&dev_attr_tx_carrier_errors.attr,
	&dev_attr_tx_fifo_errors.attr,
	&dev_attr_tx_heartbeat_errors.attr,
	&dev_attr_tx_window_errors.attr,
	&dev_attr_rx_compressed.attr,
	&dev_attr_tx_compressed.attr,
	&dev_attr_rx_nohandler.attr,
	NULL
};

static const struct attribute_group netstat_group = {
	.name  = "statistics",
	.attrs  = netstat_attrs,
};

/* The "wireless" group carries no attributes of its own here */
static struct attribute *wireless_attrs[] = {
	NULL
};

static const struct attribute_group wireless_group = {
	.name = "wireless",
	.attrs = wireless_attrs,
};

/* True when @ndev has an ieee80211_ptr (CONFIG_CFG80211) or
 * wireless_handlers (CONFIG_WIRELESS_EXT); false otherwise, including when
 * neither option is enabled.
 */
static bool wireless_group_needed(struct net_device *ndev)
{
#if IS_ENABLED(CONFIG_CFG80211)
	if (ndev->ieee80211_ptr)
		return true;
#endif
#if IS_ENABLED(CONFIG_WIRELESS_EXT)
	if (ndev->wireless_handlers)
		return true;
#endif
	return false;
}

#else /* CONFIG_SYSFS */
#define net_class_groups NULL
#endif /* CONFIG_SYSFS */

#ifdef CONFIG_SYSFS
/* Map a generic attribute / kobject back to its rx-queue container */
#define to_rx_queue_attr(_attr) \
	container_of(_attr, struct rx_queue_attribute, attr)

#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)

/* sysfs_ops.show for rx queues: dispatches to the attribute's own show()
 * callback, or returns -EIO if there is none.
 */
static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute
*attr, char *buf)
{
	const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
	struct netdev_rx_queue *queue = to_rx_queue(kobj);

	if (!attribute->show)
		return -EIO;

	return attribute->show(queue, buf);
}

/* sysfs_ops.store for rx queues: dispatches to the attribute's own store()
 * callback, or returns -EIO if there is none.
 */
static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
				   const char *buf, size_t count)
{
	const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
	struct netdev_rx_queue *queue = to_rx_queue(kobj);

	if (!attribute->store)
		return -EIO;

	return attribute->store(queue, buf, count);
}

static const struct sysfs_ops rx_queue_sysfs_ops = {
	.show = rx_queue_attr_show,
	.store = rx_queue_attr_store,
};

#ifdef CONFIG_RPS
/* "rps_cpus" read handler: builds a temporary cpumask from the CPUs listed
 * in the queue's rps_map (read under rcu_read_lock()) and emits it as a
 * bitmap. An absent map yields an all-zero mask. Returns -ENOMEM if the
 * temporary mask cannot be allocated.
 */
static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
{
	struct rps_map *map;
	cpumask_var_t mask;
	int i, len;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	rcu_read_lock();
	map = rcu_dereference(queue->rps_map);
	if (map)
		for (i = 0; i < map->len; i++)
			cpumask_set_cpu(map->cpus[i], mask);

	len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask));
	rcu_read_unlock();
	free_cpumask_var(mask);

	return len < PAGE_SIZE ? len : -EINVAL;
}

/* Install a new rps_map on @queue built from the online CPUs in @mask.
 *
 * The map is allocated for cpumask_weight(mask) entries (at least
 * L1_CACHE_BYTES) and filled with the CPUs that are in both @mask and
 * cpu_online_mask. If none qualify, the queue's map is cleared instead.
 * The pointer swap and the rps_needed static-branch accounting happen
 * under a function-local mutex; the old map is released with kfree_rcu()
 * after the mutex is dropped. Returns 0, or -ENOMEM on allocation failure.
 */
static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue,
					cpumask_var_t mask)
{
	static DEFINE_MUTEX(rps_map_mutex);
	struct rps_map *old_map, *map;
	int cpu, i;

	map = kzalloc(max_t(unsigned int,
			    RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
		      GFP_KERNEL);
	if (!map)
		return -ENOMEM;

	i = 0;
	for_each_cpu_and(cpu, mask, cpu_online_mask)
		map->cpus[i++] = cpu;

	if (i) {
		map->len = i;
	} else {
		/* No usable CPU: publish NULL rather than an empty map */
		kfree(map);
		map = NULL;
	}

	mutex_lock(&rps_map_mutex);
	old_map = rcu_dereference_protected(queue->rps_map,
					    mutex_is_locked(&rps_map_mutex));
	rcu_assign_pointer(queue->rps_map, map);

	if (map)
		static_branch_inc(&rps_needed);
	if (old_map)
		static_branch_dec(&rps_needed);

	mutex_unlock(&rps_map_mutex);

	if (old_map)
		kfree_rcu(old_map, rcu);

	return 0;
}

/* Restrict a non-empty @mask to the HK_TYPE_DOMAIN and HK_TYPE_WQ
 * housekeeping CPUs (modifies @mask in place). Returns -EINVAL if that
 * leaves the mask empty; an initially empty mask is accepted unchanged.
 */
int rps_cpumask_housekeeping(struct cpumask *mask)
{
	if (!cpumask_empty(mask)) {
		cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
		cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
		if (cpumask_empty(mask))
			return -EINVAL;
	}
	return 0;
}

/* "rps_cpus" write handler: requires CAP_NET_ADMIN, parses @buf as a CPU
 * bitmap, filters it through rps_cpumask_housekeeping() and installs it
 * with netdev_rx_queue_set_rps_mask(). Returns @len on success, else the
 * first error.
 */
static ssize_t store_rps_map(struct netdev_rx_queue *queue,
			     const char *buf, size_t len)
{
	cpumask_var_t mask;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
	if (err)
		goto out;

	err = rps_cpumask_housekeeping(mask);
	if (err)
		goto out;

	err = netdev_rx_queue_set_rps_mask(queue, mask);

out:
	free_cpumask_var(mask);
	return err ?
: len;
}

/* "rps_flow_cnt" read handler: emits 1 << log for the queue's flow table
 * (read under rcu_read_lock()), or 0 when no table is installed.
 */
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
					   char *buf)
{
	struct rps_dev_flow_table *flow_table;
	unsigned long val = 0;

	rcu_read_lock();
	flow_table = rcu_dereference(queue->rps_flow_table);
	if (flow_table)
		val = 1UL << flow_table->log;
	rcu_read_unlock();

	return sysfs_emit(buf, "%lu\n", val);
}

/* RCU callback: frees a flow table that was allocated with vmalloc() */
static void rps_dev_flow_table_release(struct rcu_head *rcu)
{
	struct rps_dev_flow_table *table = container_of(rcu,
	    struct rps_dev_flow_table, rcu);
	vfree(table);
}

/* "rps_flow_cnt" write handler.
 *
 * Requires CAP_NET_ADMIN. A non-zero count is rounded up to a power of two
 * and a new table of that many flows is allocated, each flow initialised to
 * RPS_NO_CPU; a count of 0 removes the table. The new pointer is published
 * under a function-local spinlock and the previous table is freed through
 * call_rcu(). Returns @len on success, -EINVAL if the size is out of range,
 * -ENOMEM on allocation failure, or the kstrtoul() error.
 */
static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
					    const char *buf, size_t len)
{
	unsigned long mask, count;
	struct rps_dev_flow_table *table, *old_table;
	static DEFINE_SPINLOCK(rps_dev_flow_lock);
	int rc;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	rc = kstrtoul(buf, 0, &count);
	if (rc < 0)
		return rc;

	if (count) {
		mask = count - 1;
		/* mask = roundup_pow_of_two(count) - 1;
		 * without overflows...
		 */
		while ((mask | (mask >> 1)) != mask)
			mask |= (mask >> 1);
		/* On 64 bit arches, must check mask fits in table->mask (u32),
		 * and on 32bit arches, must check
		 * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
		 */
#if BITS_PER_LONG > 32
		if (mask > (unsigned long)(u32)mask)
			return -EINVAL;
#else
		if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
				/ sizeof(struct rps_dev_flow)) {
			/* Enforce a limit to prevent overflow */
			return -EINVAL;
		}
#endif
		table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
		if (!table)
			return -ENOMEM;

		table->log = ilog2(mask) + 1;
		/* count is reused as the loop index from here on */
		for (count = 0; count <= mask; count++)
			table->flows[count].cpu = RPS_NO_CPU;
	} else {
		table = NULL;
	}

	spin_lock(&rps_dev_flow_lock);
	old_table = rcu_dereference_protected(queue->rps_flow_table,
					      lockdep_is_held(&rps_dev_flow_lock));
	rcu_assign_pointer(queue->rps_flow_table, table);
	spin_unlock(&rps_dev_flow_lock);

	if (old_table)
		call_rcu(&old_table->rcu, rps_dev_flow_table_release);

	return len;
}

static struct rx_queue_attribute rps_cpus_attribute __ro_after_init
	= __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map);

static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init
	= __ATTR(rps_flow_cnt, 0644,
		 show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
#endif /* CONFIG_RPS */

/* Default per-rx-queue attributes; empty apart from the terminator when
 * CONFIG_RPS is off.
 */
static struct attribute *rx_queue_default_attrs[] __ro_after_init = {
#ifdef CONFIG_RPS
	&rps_cpus_attribute.attr,
	&rps_dev_flow_table_cnt_attribute.attr,
#endif
	NULL
};
ATTRIBUTE_GROUPS(rx_queue_default);

/* kobject release for an rx queue: detaches and RCU-frees the RPS map and
 * flow table (CONFIG_RPS), zeroes the kobject so the queue can be
 * registered again, and drops the device reference tracked in
 * queue->dev_tracker.
 */
static void rx_queue_release(struct kobject *kobj)
{
	struct netdev_rx_queue *queue = to_rx_queue(kobj);
#ifdef CONFIG_RPS
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;

	map = rcu_dereference_protected(queue->rps_map, 1);
	if (map) {
		RCU_INIT_POINTER(queue->rps_map, NULL);
		kfree_rcu(map, rcu);
	}

	flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);
	if (flow_table) {
		RCU_INIT_POINTER(queue->rps_flow_table, NULL);
		call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
	}
#endif

	memset(kobj, 0, sizeof(*kobj));
	netdev_put(queue->dev, &queue->dev_tracker);
}

/* kobj_type.namespace for rx queues: the namespace reported by the owning
 * device's class, or NULL if the class has no namespace callback.
 */
static const void *rx_queue_namespace(const struct kobject *kobj)
{
	struct netdev_rx_queue *queue = to_rx_queue(kobj);
	struct device *dev =
&queue->dev->dev;
	const void *ns = NULL;

	if (dev->class && dev->class->namespace)
		ns = dev->class->namespace(dev);

	return ns;
}

/* kobj_type.get_ownership for rx queues: ownership follows the network
 * namespace returned by rx_queue_namespace().
 */
static void rx_queue_get_ownership(const struct kobject *kobj,
				   kuid_t *uid, kgid_t *gid)
{
	const struct net *net = rx_queue_namespace(kobj);

	net_ns_get_ownership(net, uid, gid);
}

static const struct kobj_type rx_queue_ktype = {
	.sysfs_ops = &rx_queue_sysfs_ops,
	.release = rx_queue_release,
	.namespace = rx_queue_namespace,
	.get_ownership = rx_queue_get_ownership,
};

/* Apply the namespace's rps_default_mask to a new queue when both
 * CONFIG_RPS and CONFIG_SYSCTL are enabled and the mask is set and
 * non-empty; otherwise do nothing and return 0.
 */
static int rx_queue_default_mask(struct net_device *dev,
				 struct netdev_rx_queue *queue)
{
#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL)
	struct cpumask *rps_default_mask = READ_ONCE(dev_net(dev)->core.rps_default_mask);

	if (rps_default_mask && !cpumask_empty(rps_default_mask))
		return netdev_rx_queue_set_rps_mask(queue, rps_default_mask);
#endif
	return 0;
}

/* Register the sysfs kobject "rx-<index>" for rx queue @index of @dev,
 * create its default groups and the optional driver-supplied group, apply
 * the default RPS mask and send KOBJ_ADD. Returns 0 on success, -EAGAIN if
 * the previous incarnation of the kobject is still initialised, or the
 * error of the step that failed (the kobject is put in that case).
 */
static int rx_queue_add_kobject(struct net_device *dev, int index)
{
	struct netdev_rx_queue *queue = dev->_rx + index;
	struct kobject *kobj = &queue->kobj;
	int error = 0;

	/* Rx queues are cleared in rx_queue_release to allow later
	 * re-registration. This is triggered when their kobj refcount is
	 * dropped.
	 *
	 * If a queue is removed while both a read (or write) operation and
	 * the re-addition of the same queue are pending (waiting on rtnl_lock)
	 * it might happen that the re-addition will execute before the read,
	 * making the initial removal never happen (queue's kobj refcount
	 * won't drop enough because of the pending read). In such a rare case,
	 * return to allow the removal operation to complete.
	 */
	if (unlikely(kobj->state_initialized)) {
		netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed");
		return -EAGAIN;
	}

	/* Kobject_put later will trigger rx_queue_release call which
	 * decreases dev refcount: Take that reference here
	 */
	netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);

	kobj->kset = dev->queues_kset;
	error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
				     "rx-%u", index);
	if (error)
		goto err;

	queue->groups = rx_queue_default_groups;
	error = sysfs_create_groups(kobj, queue->groups);
	if (error)
		goto err;

	if (dev->sysfs_rx_queue_group) {
		error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
		if (error)
			goto err_default_groups;
	}

	error = rx_queue_default_mask(dev, queue);
	if (error)
		goto err_default_groups;

	kobject_uevent(kobj, KOBJ_ADD);

	return error;

err_default_groups:
	sysfs_remove_groups(kobj, queue->groups);
err:
	kobject_put(kobj);
	return error;
}

/* Change the owner of rx queue @index's kobject, and of the
 * driver-supplied group if the device has one, to @kuid/@kgid.
 */
static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid,
				 kgid_t kgid)
{
	struct netdev_rx_queue *queue = dev->_rx + index;
	struct kobject *kobj = &queue->kobj;
	int error;

	error = sysfs_change_owner(kobj, kuid, kgid);
	if (error)
		return error;

	if (dev->sysfs_rx_queue_group)
		error = sysfs_group_change_owner(
			kobj, dev->sysfs_rx_queue_group, kuid, kgid);

	return error;
}
#endif /* CONFIG_SYSFS */

/* Grow or shrink the set of rx-queue kobjects from @old_num to @new_num.
 *
 * Queues [old_num, new_num) are added first. If one addition fails,
 * new_num is reset to old_num so that the removal loop below tears down
 * the queues added so far. The removal loop then drops queues from the
 * top down to new_num. Returns 0 or the error from the failed addition.
 */
int
net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
{
#ifdef CONFIG_SYSFS
	int i;
	int error = 0;

#ifndef CONFIG_RPS
	/* Without RPS there is nothing to expose unless the driver
	 * supplies its own group.
	 */
	if (!dev->sysfs_rx_queue_group)
		return 0;
#endif
	for (i = old_num; i < new_num; i++) {
		error = rx_queue_add_kobject(dev, i);
		if (error) {
			new_num = old_num;
			break;
		}
	}

	while (--i >= new_num) {
		struct netdev_rx_queue *queue = &dev->_rx[i];
		struct kobject *kobj = &queue->kobj;

		/* Suppress uevents once the namespace refcount is zero */
		if (!refcount_read(&dev_net(dev)->ns.count))
			kobj->uevent_suppress = 1;
		if (dev->sysfs_rx_queue_group)
			sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
		sysfs_remove_groups(kobj, queue->groups);
		kobject_put(kobj);
	}

	return error;
#else
return 0;
#endif
}

/* Change the owner of the first @num rx-queue kobjects of @dev, stopping
 * at the first error. A no-op returning 0 without CONFIG_SYSFS, or without
 * CONFIG_RPS when the device has no sysfs_rx_queue_group.
 */
static int net_rx_queue_change_owner(struct net_device *dev, int num,
				     kuid_t kuid, kgid_t kgid)
{
#ifdef CONFIG_SYSFS
	int error = 0;
	int i;

#ifndef CONFIG_RPS
	if (!dev->sysfs_rx_queue_group)
		return 0;
#endif
	for (i = 0; i < num; i++) {
		error = rx_queue_change_owner(dev, i, kuid, kgid);
		if (error)
			break;
	}

	return error;
#else
	return 0;
#endif
}

#ifdef CONFIG_SYSFS
/*
 * netdev_queue sysfs structures and functions.
 */
/* A tx-queue attribute: show/store receive the kobject and attribute in
 * addition to the queue itself.
 */
struct netdev_queue_attribute {
	struct attribute attr;
	ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
			struct netdev_queue *queue, char *buf);
	ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
			 struct netdev_queue *queue, const char *buf,
			 size_t len);
};
/* Map a generic attribute / kobject back to its tx-queue container */
#define to_netdev_queue_attr(_attr) \
	container_of(_attr, struct netdev_queue_attribute, attr)

#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj)

/* sysfs_ops.show for tx queues: dispatches to the attribute's own show()
 * callback, or returns -EIO if there is none.
 */
static ssize_t netdev_queue_attr_show(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	const struct netdev_queue_attribute *attribute
		= to_netdev_queue_attr(attr);
	struct netdev_queue *queue = to_netdev_queue(kobj);

	if (!attribute->show)
		return -EIO;

	return attribute->show(kobj, attr, queue, buf);
}

/* sysfs_ops.store for tx queues: dispatches to the attribute's own store()
 * callback, or returns -EIO if there is none.
 */
static ssize_t netdev_queue_attr_store(struct kobject *kobj,
				       struct attribute *attr,
				       const char *buf, size_t count)
{
	const struct netdev_queue_attribute *attribute
		= to_netdev_queue_attr(attr);
	struct netdev_queue *queue = to_netdev_queue(kobj);

	if (!attribute->store)
		return -EIO;

	return attribute->store(kobj, attr, queue, buf, count);
}

static const struct sysfs_ops netdev_queue_sysfs_ops = {
	.show = netdev_queue_attr_show,
	.store = netdev_queue_attr_store,
};

/* "tx_timeout": emits the queue's trans_timeout counter */
static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr,
			       struct netdev_queue *queue, char *buf)
{
	unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout);

	return sysfs_emit(buf, fmt_ulong, trans_timeout);
}

/* Index of @queue within its device's _tx array; BUG()s if the result is
 * not below num_tx_queues.
 */
static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
	struct net_device
*dev = queue->dev;
	unsigned int i;

	/* Pointer difference yields the element index in dev->_tx */
	i = queue - dev->_tx;
	BUG_ON(i >= dev->num_tx_queues);

	return i;
}

/* "traffic_class": reports the traffic class the queue maps to.
 * -ENOENT for a device that is not multiqueue, -EINVAL when
 * netdev_txq_to_tc() returns a negative value, or the locking error.
 */
static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr,
				  struct netdev_queue *queue, char *buf)
{
	struct net_device *dev = queue->dev;
	int num_tc, tc, index, ret;

	if (!netif_is_multiqueue(dev))
		return -ENOENT;

	ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
	if (ret)
		return ret;

	index = get_netdev_queue_index(queue);

	/* If queue belongs to subordinate dev use its TC mapping */
	dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

	num_tc = dev->num_tc;
	tc = netdev_txq_to_tc(dev, index);

	rtnl_unlock();

	if (tc < 0)
		return -EINVAL;

	/* We can report the traffic class one of two ways:
	 * Subordinate device traffic classes are reported with the traffic
	 * class first, and then the subordinate class so for example TC0 on
	 * subordinate device 2 will be reported as "0-2". If the queue
	 * belongs to the root device it will be reported with just the
	 * traffic class, so just "0" for TC 0 for example.
	 */
	/* A negative num_tc supplies the '-' separator through "%d%d" */
	return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) :
			    sysfs_emit(buf, "%d\n", tc);
}

#ifdef CONFIG_XPS
/* "tx_maxrate" read handler: emits queue->tx_maxrate */
static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr,
			       struct netdev_queue *queue, char *buf)
{
	return sysfs_emit(buf, "%lu\n", queue->tx_maxrate);
}

/* "tx_maxrate" write handler: requires CAP_NET_ADMIN, parses a decimal u32
 * and passes it to the driver's ndo_set_tx_maxrate between
 * netdev_lock_ops() and netdev_unlock_ops(), inside the section opened by
 * sysfs_rtnl_lock(). On success the rate is recorded in queue->tx_maxrate
 * and @len is returned; -EOPNOTSUPP if the driver has no such op.
 */
static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr,
				struct netdev_queue *queue, const char *buf,
				size_t len)
{
	int err, index = get_netdev_queue_index(queue);
	struct net_device *dev = queue->dev;
	u32 rate = 0;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* The check is also done later; this helps returning early without
	 * hitting the locking section below.
	 */
	if (!dev->netdev_ops->ndo_set_tx_maxrate)
		return -EOPNOTSUPP;

	err = kstrtou32(buf, 10, &rate);
	if (err < 0)
		return err;

	err = sysfs_rtnl_lock(kobj, attr, dev);
	if (err)
		return err;

	err = -EOPNOTSUPP;
	netdev_lock_ops(dev);
	if (dev->netdev_ops->ndo_set_tx_maxrate)
		err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate);
	netdev_unlock_ops(dev);

	if (!err) {
		queue->tx_maxrate = rate;
		rtnl_unlock();
		return len;
	}

	rtnl_unlock();
	return err;
}

static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init
	= __ATTR_RW(tx_maxrate);
#endif

static struct netdev_queue_attribute queue_trans_timeout __ro_after_init
	= __ATTR_RO(tx_timeout);

static struct netdev_queue_attribute queue_traffic_class __ro_after_init
	= __ATTR_RO(traffic_class);

#ifdef CONFIG_BQL
/*
 * Byte queue limits sysfs structures and functions.
 */
/* Emit an unsigned BQL value followed by a newline */
static ssize_t bql_show(char *buf, unsigned int value)
{
	return sysfs_emit(buf, "%u\n", value);
}

/* Parse a BQL limit from @buf into *@pvalue. The literal "max" (with or
 * without a trailing newline) selects DQL_MAX_LIMIT; otherwise a decimal
 * value no greater than DQL_MAX_LIMIT is required. Returns @count on
 * success, -EINVAL for a value above the limit, or the kstrtouint() error.
 */
static ssize_t bql_set(const char *buf, const size_t count,
		       unsigned int *pvalue)
{
	unsigned int value;
	int err;

	if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) {
		value = DQL_MAX_LIMIT;
	} else {
		err = kstrtouint(buf, 10, &value);
		if (err < 0)
			return err;
		if (value > DQL_MAX_LIMIT)
			return -EINVAL;
	}

	*pvalue = value;

	return count;
}

/* "hold_time" read handler: slack_hold_time converted from jiffies to ms */
static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr,
				  struct netdev_queue *queue, char *buf)
{
	struct dql *dql = &queue->dql;

	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
}

/* "hold_time" write handler: parses a decimal value in ms and stores it
 * in slack_hold_time as jiffies.
 */
static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr,
				 struct netdev_queue *queue, const char *buf,
				 size_t len)
{
	struct dql *dql = &queue->dql;
	unsigned int value;
	int err;

	err = kstrtouint(buf, 10, &value);
	if (err < 0)
		return err;

	dql->slack_hold_time = msecs_to_jiffies(value);

	return len;
}

static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
	= __ATTR(hold_time, 0644,
		 bql_show_hold_time, bql_set_hold_time);

/* "stall_thrs" read handler: stall_thrs converted from jiffies to ms */
static ssize_t bql_show_stall_thrs(struct kobject
*kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); } static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { struct dql *dql = &queue->dql; unsigned int value; int err; err = kstrtouint(buf, 10, &value); if (err < 0) return err; value = msecs_to_jiffies(value); if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG)) return -ERANGE; if (!dql->stall_thrs && value) dql->last_reap = jiffies; /* Force last_reap to be live */ smp_wmb(); dql->stall_thrs = value; return len; } static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); } static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { WRITE_ONCE(queue->dql.stall_max, 0); return len; } static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%lu\n", dql->stall_cnt); } static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed); } static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 
0444, bql_show_inflight, NULL);

/* BQL_ATTR(NAME, FIELD) - generate a 0644 attribute called NAME whose show
 * and store handlers forward to bql_show()/bql_set() on queue->dql.FIELD.
 * It defines bql_show_NAME, bql_set_NAME and bql_NAME_attribute.
 */
#define BQL_ATTR(NAME, FIELD)						\
static ssize_t bql_show_ ## NAME(struct kobject *kobj,			\
				 struct attribute *attr,		\
				 struct netdev_queue *queue, char *buf)	\
{									\
	return bql_show(buf, queue->dql.FIELD);				\
}									\
									\
static ssize_t bql_set_ ## NAME(struct kobject *kobj,			\
				struct attribute *attr,			\
				struct netdev_queue *queue,		\
				const char *buf, size_t len)		\
{									\
	return bql_set(buf, len, &queue->dql.FIELD);			\
}									\
									\
static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \
	= __ATTR(NAME, 0644,						\
		 bql_show_ ## NAME, bql_set_ ## NAME)

BQL_ATTR(limit, limit);
BQL_ATTR(limit_max, max_limit);
BQL_ATTR(limit_min, min_limit);

/* Attributes that make up the "byte_queue_limits" group. */
static struct attribute *dql_attrs[] __ro_after_init = {
	&bql_limit_attribute.attr,
	&bql_limit_max_attribute.attr,
	&bql_limit_min_attribute.attr,
	&bql_hold_time_attribute.attr,
	&bql_inflight_attribute.attr,
	&bql_stall_thrs_attribute.attr,
	&bql_stall_cnt_attribute.attr,
	&bql_stall_max_attribute.attr,
	NULL
};

static const struct attribute_group dql_group = {
	.name  = "byte_queue_limits",
	.attrs  = dql_attrs,
};
#else
/* Fake declaration, all the code using it should be dead */
static const struct attribute_group dql_group = {};
#endif /* CONFIG_BQL */

#ifdef CONFIG_XPS
/* Print into @buf the bitmap of ids (CPUs or rx queues, selected by @type)
 * whose XPS map for traffic class @tc contains tx queue @index.
 *
 * Returns the number of bytes written, -ENOMEM if the temporary bitmap
 * cannot be allocated, or -EINVAL if the output does not fit below PAGE_SIZE.
 */
static ssize_t xps_queue_show(struct net_device *dev, unsigned int index,
			      int tc, char *buf, enum xps_map_type type)
{
	struct xps_dev_maps *dev_maps;
	unsigned long *mask;
	unsigned int nr_ids;
	int j, len;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps[type]);

	/* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0
	 * when dev_maps hasn't been allocated yet, to be backward compatible.
	 */
	nr_ids = dev_maps ? dev_maps->nr_ids :
		 (type == XPS_CPUS ?
nr_cpu_ids : dev->num_rx_queues);

	/* The bitmap is allocated with GFP_NOWAIT while the RCU read lock
	 * is held; on failure the lock is dropped before returning.
	 */
	mask = bitmap_zalloc(nr_ids, GFP_NOWAIT);
	if (!mask) {
		rcu_read_unlock();
		return -ENOMEM;
	}

	/* No maps, or @tc out of range: print the all-zero bitmap. */
	if (!dev_maps || tc >= dev_maps->num_tc)
		goto out_no_maps;

	for (j = 0; j < nr_ids; j++) {
		/* attr_map is indexed by id * num_tc + tc. */
		int i, tci = j * dev_maps->num_tc + tc;
		struct xps_map *map;

		map = rcu_dereference(dev_maps->attr_map[tci]);
		if (!map)
			continue;

		/* Set bit j as soon as @index is found in this map. */
		for (i = map->len; i--;) {
			if (map->queues[i] == index) {
				__set_bit(j, mask);
				break;
			}
		}
	}
out_no_maps:
	rcu_read_unlock();

	len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids);
	bitmap_free(mask);

	return len < PAGE_SIZE ? len : -EINVAL;
}

/* sysfs show handler for "xps_cpus".
 *
 * Returns -ENOENT for a device that is not multiqueue, the
 * sysfs_rtnl_lock() error if the lock cannot be taken, -EINVAL if the queue
 * has no valid traffic class, otherwise the result of xps_queue_show().
 */
static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr,
			     struct netdev_queue *queue, char *buf)
{
	struct net_device *dev = queue->dev;
	unsigned int index;
	int len, tc, ret;

	if (!netif_is_multiqueue(dev))
		return -ENOENT;

	index = get_netdev_queue_index(queue);

	ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
	if (ret)
		return ret;

	/* If queue belongs to subordinate dev use its map */
	dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

	tc = netdev_txq_to_tc(dev, index);
	if (tc < 0) {
		rtnl_unlock();
		return -EINVAL;
	}

	/* Increase the net device refcnt to make sure it won't be freed while
	 * xps_queue_show is running.
*/
	dev_hold(dev);
	rtnl_unlock();

	/* rtnl is no longer held here; only the reference taken above is. */
	len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);

	dev_put(dev);
	return len;
}

/* sysfs store handler for "xps_cpus": parse a CPU bitmap from @buf and
 * apply it to this tx queue with netif_set_xps_queue().
 *
 * Returns @len on success; -ENOENT for a non-multiqueue device, -EPERM
 * without CAP_NET_ADMIN, -ENOMEM if the cpumask cannot be allocated, or the
 * error from bitmap_parse(), sysfs_rtnl_lock() or netif_set_xps_queue().
 */
static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr,
			      struct netdev_queue *queue, const char *buf,
			      size_t len)
{
	struct net_device *dev = queue->dev;
	unsigned int index;
	cpumask_var_t mask;
	int err;

	if (!netif_is_multiqueue(dev))
		return -ENOENT;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	index = get_netdev_queue_index(queue);

	/* Parsing is done before rtnl is taken; every failure path below
	 * frees the cpumask.
	 */
	err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
	if (err) {
		free_cpumask_var(mask);
		return err;
	}

	err = sysfs_rtnl_lock(kobj, attr, dev);
	if (err) {
		free_cpumask_var(mask);
		return err;
	}

	err = netif_set_xps_queue(dev, mask, index);
	rtnl_unlock();

	free_cpumask_var(mask);

	return err ? : len;
}

static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
	= __ATTR_RW(xps_cpus);

/* sysfs show handler for "xps_rxqs".
 *
 * Returns the sysfs_rtnl_lock() error if the lock cannot be taken, -EINVAL
 * if the queue has no valid traffic class, otherwise the result of
 * xps_queue_show().
 */
static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr,
			     struct netdev_queue *queue, char *buf)
{
	struct net_device *dev = queue->dev;
	unsigned int index;
	int tc, ret;

	index = get_netdev_queue_index(queue);

	ret = sysfs_rtnl_lock(kobj, attr, dev);
	if (ret)
		return ret;

	tc = netdev_txq_to_tc(dev, index);

	/* Increase the net device refcnt to make sure it won't be freed while
	 * xps_queue_show is running.
	 */
	dev_hold(dev);
	rtnl_unlock();

	ret = tc >= 0 ?
xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL;
	dev_put(dev);
	return ret;
}

/* sysfs store handler for "xps_rxqs": parse an rx-queue bitmap from @buf
 * and apply it to this tx queue with __netif_set_xps_queue(..., XPS_RXQS).
 *
 * Returns @len on success; -EPERM without CAP_NET_ADMIN in the user
 * namespace of the device's netns, -ENOMEM if the bitmap cannot be
 * allocated, or the error from bitmap_parse(), sysfs_rtnl_lock() or
 * __netif_set_xps_queue().
 */
static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr,
			      struct netdev_queue *queue, const char *buf,
			      size_t len)
{
	struct net_device *dev = queue->dev;
	struct net *net = dev_net(dev);
	unsigned long *mask;
	unsigned int index;
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* The bitmap is sized by the device's num_rx_queues. */
	mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
	if (!mask)
		return -ENOMEM;

	index = get_netdev_queue_index(queue);

	err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
	if (err) {
		bitmap_free(mask);
		return err;
	}

	err = sysfs_rtnl_lock(kobj, attr, dev);
	if (err) {
		bitmap_free(mask);
		return err;
	}

	/* The update runs under both rtnl and the CPU hotplug read lock. */
	cpus_read_lock();
	err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS);
	cpus_read_unlock();

	rtnl_unlock();

	bitmap_free(mask);
	return err ? : len;
}

static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
	= __ATTR_RW(xps_rxqs);
#endif /* CONFIG_XPS */

/* Attributes present on every tx queue kobject; the XPS ones (and
 * tx_maxrate) are only listed when CONFIG_XPS is set.
 */
static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
	&queue_trans_timeout.attr,
	&queue_traffic_class.attr,
#ifdef CONFIG_XPS
	&xps_cpus_attribute.attr,
	&xps_rxqs_attribute.attr,
	&queue_tx_maxrate.attr,
#endif
	NULL
};
ATTRIBUTE_GROUPS(netdev_queue_default);

/* kobject release callback for a tx queue: zero the kobject and drop the
 * tracked reference on the owning device.
 */
static void netdev_queue_release(struct kobject *kobj)
{
	struct netdev_queue *queue = to_netdev_queue(kobj);

	memset(kobj, 0, sizeof(*kobj));
	netdev_put(queue->dev, &queue->dev_tracker);
}

/* Return the namespace tag of the owning device, obtained through its
 * class ->namespace() callback, or NULL when the class provides none.
 */
static const void *netdev_queue_namespace(const struct kobject *kobj)
{
	struct netdev_queue *queue = to_netdev_queue(kobj);
	struct device *dev = &queue->dev->dev;
	const void *ns = NULL;

	if (dev->class && dev->class->namespace)
		ns = dev->class->namespace(dev);

	return ns;
}

/* Fill @uid/@gid from the namespace returned by netdev_queue_namespace(). */
static void netdev_queue_get_ownership(const struct kobject *kobj,
				       kuid_t *uid, kgid_t *gid)
{
	const struct net *net = netdev_queue_namespace(kobj);

	net_ns_get_ownership(net, uid, gid);
}

static const struct kobj_type netdev_queue_ktype = {
	.sysfs_ops = &netdev_queue_sysfs_ops,
	.release
= netdev_queue_release,
	.namespace = netdev_queue_namespace,
	.get_ownership = netdev_queue_get_ownership,
};

/* Tell whether the byte_queue_limits group applies to @dev: never for lltx
 * or IFF_NO_QUEUE devices, otherwise only when CONFIG_BQL is enabled.
 */
static bool netdev_uses_bql(const struct net_device *dev)
{
	if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE))
		return false;

	return IS_ENABLED(CONFIG_BQL);
}

/* Create the "tx-<index>" kobject for tx queue @index of @dev, attach the
 * default attribute groups (plus the BQL group when netdev_uses_bql()) and
 * emit KOBJ_ADD.
 *
 * Returns 0 on success, -EAGAIN if the kobject is still initialized from a
 * previous registration, or the error of the failing kobject/sysfs call.
 */
static int netdev_queue_add_kobject(struct net_device *dev, int index)
{
	struct netdev_queue *queue = dev->_tx + index;
	struct kobject *kobj = &queue->kobj;
	int error = 0;

	/* Tx queues are cleared in netdev_queue_release to allow later
	 * re-registration. This is triggered when their kobj refcount is
	 * dropped.
	 *
	 * If a queue is removed while both a read (or write) operation and
	 * the re-addition of the same queue are pending (waiting on rtnl_lock)
	 * it might happen that the re-addition will execute before the read,
	 * making the initial removal to never happen (queue's kobj refcount
	 * won't drop enough because of the pending read). In such rare case,
	 * return to allow the removal operation to complete.
	 */
	if (unlikely(kobj->state_initialized)) {
		netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed");
		return -EAGAIN;
	}

	/* Kobject_put later will trigger netdev_queue_release call
	 * which decreases dev refcount: Take that reference here
	 */
	netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);

	kobj->kset = dev->queues_kset;
	error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
				     "tx-%u", index);
	if (error)
		goto err;

	queue->groups = netdev_queue_default_groups;
	error = sysfs_create_groups(kobj, queue->groups);
	if (error)
		goto err;

	if (netdev_uses_bql(dev)) {
		error = sysfs_create_group(kobj, &dql_group);
		if (error)
			goto err_default_groups;
	}

	kobject_uevent(kobj, KOBJ_ADD);
	return 0;

	/* Unwind: remove the default groups if they were created, then drop
	 * the kobject reference on every failure path.
	 */
err_default_groups:
	sysfs_remove_groups(kobj, queue->groups);
err:
	kobject_put(kobj);
	return error;
}

/* Change the sysfs owner of tx queue @index (and of its BQL group, when
 * the device uses BQL) to @kuid/@kgid.
 */
static int tx_queue_change_owner(struct net_device *ndev, int index,
				 kuid_t kuid, kgid_t kgid)
{
	struct netdev_queue *queue = ndev->_tx + index;
	struct kobject *kobj = &queue->kobj;
	int error;
error = sysfs_change_owner(kobj, kuid, kgid);
	if (error)
		return error;

	if (netdev_uses_bql(ndev))
		error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);

	return error;
}
#endif /* CONFIG_SYSFS */

/* Bring the number of registered tx queue kobjects of @dev from @old_num
 * to @new_num: indices [old_num, new_num) are added when growing, and the
 * indices from @old_num - 1 down to @new_num are removed when shrinking.
 *
 * Returns 0 on success or the error of the first failed addition. Without
 * CONFIG_SYSFS this is a no-op returning 0.
 */
int
netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
{
#ifdef CONFIG_SYSFS
	int i;
	int error = 0;

	/* Tx queue kobjects are allowed to be updated when a device is being
	 * unregistered, but solely to remove queues from qdiscs. Any path
	 * adding queues should be fixed.
	 */
	WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num,
	     "New queues can't be registered after device unregistration.");

	for (i = old_num; i < new_num; i++) {
		error = netdev_queue_add_kobject(dev, i);
		if (error) {
			/* Make the loop below remove the queues that were
			 * added before the failing index.
			 */
			new_num = old_num;
			break;
		}
	}

	while (--i >= new_num) {
		struct netdev_queue *queue = dev->_tx + i;

		/* Suppress uevents once the netns refcount reads as zero. */
		if (!refcount_read(&dev_net(dev)->ns.count))
			queue->kobj.uevent_suppress = 1;

		if (netdev_uses_bql(dev))
			sysfs_remove_group(&queue->kobj, &dql_group);

		sysfs_remove_groups(&queue->kobj, queue->groups);
		kobject_put(&queue->kobj);
	}

	return error;
#else
	return 0;
#endif /* CONFIG_SYSFS */
}

/* Change the owner of the first @num tx queue kobjects of @dev, stopping at
 * the first error. Without CONFIG_SYSFS this is a no-op returning 0.
 */
static int net_tx_queue_change_owner(struct net_device *dev, int num,
				     kuid_t kuid, kgid_t kgid)
{
#ifdef CONFIG_SYSFS
	int error = 0;
	int i;

	for (i = 0; i < num; i++) {
		error = tx_queue_change_owner(dev, i, kuid, kgid);
		if (error)
			break;
	}

	return error;
#else
	return 0;
#endif /* CONFIG_SYSFS */
}

/* Create the "queues" kset of @dev and the kobjects for its real rx and tx
 * queues. On failure, whatever was registered is torn down again.
 */
static int register_queue_kobjects(struct net_device *dev)
{
	int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;

#ifdef CONFIG_SYSFS
	dev->queues_kset = kset_create_and_add("queues",
					       NULL, &dev->dev.kobj);
	if (!dev->queues_kset)
		return -ENOMEM;
	real_rx = dev->real_num_rx_queues;
#endif
	real_tx = dev->real_num_tx_queues;

	error = net_rx_queue_update_kobjects(dev, 0, real_rx);
	if (error)
		goto error;
	/* rxq/txq record how many queues the error path must remove. */
	rxq = real_rx;

	error = netdev_queue_update_kobjects(dev, 0, real_tx);
	if (error)
		goto error;
	txq = real_tx;

	return 0;

error:
	netdev_queue_update_kobjects(dev, txq, 0);
net_rx_queue_update_kobjects(dev, rxq, 0);
#ifdef CONFIG_SYSFS
	kset_unregister(dev->queues_kset);
#endif
	return error;
}

/* Change the sysfs owner of the "queues" kset (when present) and of all
 * real rx and tx queue kobjects of @ndev. Stops at the first error.
 */
static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid)
{
	int error = 0, real_rx = 0, real_tx = 0;

#ifdef CONFIG_SYSFS
	if (ndev->queues_kset) {
		error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid);
		if (error)
			return error;
	}
	real_rx = ndev->real_num_rx_queues;
#endif
	real_tx = ndev->real_num_tx_queues;

	error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid);
	if (error)
		return error;

	error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid);
	if (error)
		return error;

	return 0;
}

/* Remove all rx/tx queue kobjects of @dev, reset both real queue counts to
 * zero under the ops lock, and unregister the "queues" kset.
 */
static void remove_queue_kobjects(struct net_device *dev)
{
	int real_rx = 0, real_tx = 0;

#ifdef CONFIG_SYSFS
	real_rx = dev->real_num_rx_queues;
#endif
	real_tx = dev->real_num_tx_queues;

	net_rx_queue_update_kobjects(dev, real_rx, 0);
	netdev_queue_update_kobjects(dev, real_tx, 0);

	netdev_lock_ops(dev);
	dev->real_num_rx_queues = 0;
	dev->real_num_tx_queues = 0;
	netdev_unlock_ops(dev);
#ifdef CONFIG_SYSFS
	kset_unregister(dev->queues_kset);
#endif
}

/* Mount permission check: the caller needs CAP_SYS_ADMIN in the user
 * namespace that owns its current network namespace.
 */
static bool net_current_may_mount(void)
{
	struct net *net = current->nsproxy->net_ns;

	return ns_capable(net->user_ns, CAP_SYS_ADMIN);
}

/* Return the current task's network namespace; with CONFIG_NET_NS its
 * ->passive refcount is incremented first.
 */
static void *net_grab_current_ns(void)
{
	struct net *ns = current->nsproxy->net_ns;
#ifdef CONFIG_NET_NS
	if (ns)
		refcount_inc(&ns->passive);
#endif
	return ns;
}

/* The initial namespace is always init_net. */
static const void *net_initial_ns(void)
{
	return &init_net;
}

/* Namespace of a netlink socket: the socket's own netns. */
static const void *net_netlink_ns(struct sock *sk)
{
	return sock_net(sk);
}

const struct kobj_ns_type_operations net_ns_type_operations = {
	.type = KOBJ_NS_TYPE_NET,
	.current_may_mount = net_current_may_mount,
	.grab_current_ns = net_grab_current_ns,
	.netlink_ns = net_netlink_ns,
	.initial_ns = net_initial_ns,
	.drop_ns = net_drop_ns,
};
EXPORT_SYMBOL_GPL(net_ns_type_operations);

/* Class uevent callback: add INTERFACE= and IFINDEX= to the environment. */
static int netdev_uevent(const struct device *d, struct kobj_uevent_env *env)
{
	const struct net_device *dev = to_net_dev(d);
	int retval;

	/* pass interface to
uevent. */
	retval = add_uevent_var(env, "INTERFACE=%s", dev->name);
	if (retval)
		goto exit;

	/* pass ifindex to uevent.
	 * ifindex is useful as it won't change (interface name may change)
	 * and is what RtNetlink uses natively.
	 */
	retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex);

exit:
	return retval;
}

/*
 *	netdev_release -- destroy and free a dead device.
 *	Called when last reference to device kobject is gone.
 */
static void netdev_release(struct device *d)
{
	struct net_device *dev = to_net_dev(d);

	/* Releasing a device in any other state is treated as a fatal bug. */
	BUG_ON(dev->reg_state != NETREG_RELEASED);

	/* no need to wait for rcu grace period:
	 * device is dead and about to be freed.
	 */
	kfree(rcu_access_pointer(dev->ifalias));
	kvfree(dev);
}

/* Class namespace callback: a net device is tagged with its netns. */
static const void *net_namespace(const struct device *d)
{
	const struct net_device *dev = to_net_dev(d);

	return dev_net(dev);
}

/* Class ownership callback: take uid/gid from the device's netns. */
static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid)
{
	const struct net_device *dev = to_net_dev(d);
	const struct net *net = dev_net(dev);

	net_ns_get_ownership(net, uid, gid);
}

/* The "net" device class, wiring up the callbacks defined in this file. */
static const struct class net_class = {
	.name = "net",
	.dev_release = netdev_release,
	.dev_groups = net_class_groups,
	.dev_uevent = netdev_uevent,
	.ns_type = &net_ns_type_operations,
	.namespace = net_namespace,
	.get_ownership = net_get_ownership,
};

#ifdef CONFIG_OF
/* Match callback: returns 1 if @dev or any of its ancestors has of_node
 * equal to @data, 0 otherwise.
 */
static int of_dev_node_match(struct device *dev, const void *data)
{
	for (; dev; dev = dev->parent) {
		if (dev->of_node == data)
			return 1;
	}

	return 0;
}

/*
 * of_find_net_device_by_node - lookup the net device for the device node
 * @np: OF device node
 *
 * Looks up the net_device structure corresponding with the device node.
 * If successful, returns a pointer to the net_device with the embedded
 * struct device refcount incremented by one, or NULL on failure. The
 * refcount must be dropped when done with the net_device.
*/
struct net_device *of_find_net_device_by_node(struct device_node *np)
{
	struct device *dev;

	/* Search the net class for a device matched by of_dev_node_match(). */
	dev = class_find_device(&net_class, NULL, np, of_dev_node_match);
	if (!dev)
		return NULL;

	return to_net_dev(dev);
}
EXPORT_SYMBOL(of_find_net_device_by_node);
#endif

/* Delete sysfs entries but hold kobject reference until after all
 * netdev references are gone.
 */
void netdev_unregister_kobject(struct net_device *ndev)
{
	struct device *dev = &ndev->dev;

	/* Suppress uevents once the netns refcount reads as zero. */
	if (!refcount_read(&dev_net(ndev)->ns.count))
		dev_set_uevent_suppress(dev, 1);

	/* Extra kobject reference, taken before the sysfs entries go away. */
	kobject_get(&dev->kobj);

	remove_queue_kobjects(ndev);

	pm_runtime_set_memalloc_noio(dev, false);

	device_del(dev);
}

/* Create sysfs entries for network device.
 *
 * Returns 0 on success, or the error from device_add() or
 * register_queue_kobjects(); in the latter case the device is deleted again
 * before returning.
 */
int netdev_register_kobject(struct net_device *ndev)
{
	struct device *dev = &ndev->dev;
	const struct attribute_group **groups = ndev->sysfs_groups;
	int error = 0;

	device_initialize(dev);
	dev->class = &net_class;
	dev->platform_data = ndev;
	/* dev->groups keeps the start of the array; the local pointer below
	 * is advanced while further groups are appended.
	 */
	dev->groups = groups;

	dev_set_name(dev, "%s", ndev->name);

#ifdef CONFIG_SYSFS
	/* Allow for a device specific group */
	if (*groups)
		groups++;

	*groups++ = &netstat_group;

	if (wireless_group_needed(ndev))
		*groups++ = &wireless_group;
#endif /* CONFIG_SYSFS */

	error = device_add(dev);
	if (error)
		return error;

	error = register_queue_kobjects(ndev);
	if (error) {
		device_del(dev);
		return error;
	}

	pm_runtime_set_memalloc_noio(dev, true);

	return error;
}

/* Change owner for sysfs entries when moving network devices across network
 * namespaces owned by different user namespaces.
*/ int netdev_change_owner(struct net_device *ndev, const struct net *net_old, const struct net *net_new) { kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID; kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID; struct device *dev = &ndev->dev; int error; net_ns_get_ownership(net_old, &old_uid, &old_gid); net_ns_get_ownership(net_new, &new_uid, &new_gid); /* The network namespace was changed but the owning user namespace is * identical so there's no need to change the owner of sysfs entries. */ if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid)) return 0; error = device_change_owner(dev, new_uid, new_gid); if (error) return error; error = queue_change_owner(ndev, new_uid, new_gid); if (error) return error; return 0; } int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns) { return class_create_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_create_file_ns); void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns) { class_remove_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_remove_file_ns); int __init netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); return class_register(&net_class); }
156 3698 2189 874 64 2356 1550 2523 2542 594 2361 2361 3 2184 3 2181 2180 2180 2179 2179 2180 1 2181 2179 2179 481 259 221 2834 260 2795 9 9 8 1 8 8 273 273 3 188 3 3 273 273 273 935 241 241 156 156 187 549 550 7 2244 2244 2243 1 2242 2243 2202 2204 2203 48 49 547 548 547 547 548 547 875 875 873 875 874 361 873 952 2435 2434 793 2363 1247 2175 4 2432 2432 2 2 2510 73 38 2445 1698 2447 2436 2184 792 546 537 942 2445 2204 2420 2447 4 2431 2437 772 459 1745 2429 5 1694 1 1740 455 2258 69 2194 2194 3 548 547 548 954 3 360 873 952 933 548 2 952 546 546 2418 15 177 30 1 2252 778 777 775 778 99 99 99 2 99 99 163 2 161 162 2531 2181 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 
369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 
869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 
1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2016 - 2020 Christoph Hellwig */ #include <linux/init.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/device_cgroup.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/backing-dev.h> #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/uio.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/part_stat.h> #include <linux/uaccess.h> #include <linux/stat.h> #include "../fs/internal.h" #include "blk.h" /* Should we allow writing to mounted block devices? 
*/ static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); struct bdev_inode { struct block_device bdev; struct inode vfs_inode; }; static inline struct bdev_inode *BDEV_I(struct inode *inode) { return container_of(inode, struct bdev_inode, vfs_inode); } static inline struct inode *BD_INODE(struct block_device *bdev) { return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode; } struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } EXPORT_SYMBOL(I_BDEV); struct block_device *file_bdev(struct file *bdev_file) { return I_BDEV(bdev_file->f_mapping->host); } EXPORT_SYMBOL(file_bdev); static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = BD_INODE(bdev); int ret; spin_lock(&inode->i_lock); while (inode->i_state & I_DIRTY) { spin_unlock(&inode->i_lock); ret = write_inode_now(inode, true); if (ret) pr_warn_ratelimited( "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n", bdev, ret); spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); } /* Kill _all_ buffers and pagecache , dirty or not.. */ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_mapping; if (mapping_empty(mapping)) return; invalidate_bh_lrus(); truncate_inode_pages(mapping, 0); } /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_mapping; if (mapping->nrpages) { invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } } EXPORT_SYMBOL(invalidate_bdev); /* * Drop all buffers & page cache for given bdev range. This function bails * with error if bdev has other exclusive owner (such as filesystem). 
*/ int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend) { /* * If we don't hold exclusive handle for the device, upgrade to it * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ if (!(mode & BLK_OPEN_EXCL)) { int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); if (err) goto invalidate; } truncate_inode_pages_range(bdev->bd_mapping, lstart, lend); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, truncate_bdev_range); return 0; invalidate: /* * Someone else has handle exclusively open. Try invalidating instead. * The 'end' argument is inclusive so the rounding is safe. */ return invalidate_inode_pages2_range(bdev->bd_mapping, lstart >> PAGE_SHIFT, lend >> PAGE_SHIFT); } static void set_init_blocksize(struct block_device *bdev) { unsigned int bsize = bdev_logical_block_size(bdev); loff_t size = i_size_read(BD_INODE(bdev)); while (bsize < PAGE_SIZE) { if (size & bsize) break; bsize <<= 1; } BD_INODE(bdev)->i_blkbits = blksize_bits(bsize); mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping, get_order(bsize)); } /** * bdev_validate_blocksize - check that this block size is acceptable * @bdev: blockdevice to check * @block_size: block size to check * * For block device users that do not use buffer heads or the block device * page cache, make sure that this block size can be used with the device. * * Return: On success zero is returned, negative error code on failure. 
*/
int bdev_validate_blocksize(struct block_device *bdev, int block_size)
{
	/* Reject sizes that blk_validate_block_size() does not accept. */
	if (blk_validate_block_size(block_size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (block_size < bdev_logical_block_size(bdev))
		return -EINVAL;

	return 0;
}
EXPORT_SYMBOL_GPL(bdev_validate_blocksize);

/* Set the block size of the block device behind @file to @size.
 *
 * Returns 0 on success (including when the size is unchanged), the error
 * from bdev_validate_blocksize(), or -EINVAL when @file has no
 * private_data.
 */
int set_blocksize(struct file *file, int size)
{
	struct inode *inode = file->f_mapping->host;
	struct block_device *bdev = I_BDEV(inode);
	int ret;

	ret = bdev_validate_blocksize(bdev, size);
	if (ret)
		return ret;

	if (!file->private_data)
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (inode->i_blkbits != blksize_bits(size)) {
		/*
		 * Flush and truncate the pagecache before we reconfigure the
		 * mapping geometry because folio sizes are variable now.  If a
		 * reader has already allocated a folio whose size is smaller
		 * than the new min_order but invokes readahead after the new
		 * min_order becomes visible, readahead will think there are
		 * "zero" blocks per folio and crash.  Take the inode and
		 * invalidation locks to avoid racing with
		 * read/write/fallocate.
*/
		inode_lock(inode);
		filemap_invalidate_lock(inode->i_mapping);

		/* Write back dirty data, then drop the whole page cache. */
		sync_blockdev(bdev);
		kill_bdev(bdev);

		inode->i_blkbits = blksize_bits(size);
		mapping_set_folio_min_order(inode->i_mapping, get_order(size));
		/* Drop the page cache again after the geometry change. */
		kill_bdev(bdev);
		filemap_invalidate_unlock(inode->i_mapping);
		inode_unlock(inode);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

/* Set the block size of @sb and of its backing block device.
 *
 * Returns the new block size, or 0 on failure: either @size is above
 * PAGE_SIZE on a filesystem type without FS_LBS, or set_blocksize() failed.
 */
int sb_set_blocksize(struct super_block *sb, int size)
{
	if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE)
		return 0;
	if (set_blocksize(sb->s_bdev_file, size))
		return 0;
	/* If we get here, we know size is validated */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

/* Like sb_set_blocksize(), but @size is first raised to the logical block
 * size of the backing device when it is smaller.
 */
int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);

/* Start writeback on the device's mapping with filemap_flush().
 * A NULL @bdev is accepted and returns 0.
 */
int sync_blockdev_nowait(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_flush(bdev->bd_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 * A NULL @bdev is accepted and returns 0.
 */
int sync_blockdev(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_write_and_wait(bdev->bd_mapping);
}
EXPORT_SYMBOL(sync_blockdev);

/* Write out and wait upon the dirty data in [@lstart, @lend] of the
 * device's mapping.  Unlike sync_blockdev(), @bdev is not checked for NULL.
 */
int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
	return filemap_write_and_wait_range(bdev->bd_mapping,
			lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);

/**
 * bdev_freeze - lock a filesystem and force it into a consistent state
 * @bdev:	blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can unfreeze the frozen filesystem actually when multiple
 * freeze requests arrive simultaneously.
It counts up in bdev_freeze() and
 * counts down in bdev_thaw(). When it becomes 0, bdev_thaw() actually
 * unfreezes.
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_freeze(struct block_device *bdev)
{
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	/* Only the first freeze request does the work; later ones just
	 * bump the counter.
	 */
	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return 0;
	}

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
		error = bdev->bd_holder_ops->freeze(bdev);
		/* The callback must have dropped bd_holder_lock by now. */
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		/* No holder freeze callback: just write the device back. */
		mutex_unlock(&bdev->bd_holder_lock);
		error = sync_blockdev(bdev);
	}

	/* Undo the increment above if freezing failed. */
	if (error)
		atomic_dec(&bdev->bd_fsfreeze_count);

	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_freeze);

/**
 * bdev_thaw - unlock filesystem
 * @bdev:	blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after bdev_freeze().
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_thaw(struct block_device *bdev)
{
	int error = -EINVAL, nr_freeze;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	/*
	 * If this returns < 0 it means that @bd_fsfreeze_count was
	 * already 0 and no decrement was performed.
*/
	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
	if (nr_freeze < 0)
		goto out;

	/* Other freeze requests are still outstanding: nothing more to do. */
	error = 0;
	if (nr_freeze > 0)
		goto out;

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
		error = bdev->bd_holder_ops->thaw(bdev);
		/* The callback must have dropped bd_holder_lock by now. */
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
	}

	/* Thaw failed: restore the count that was decremented above. */
	if (error)
		atomic_inc(&bdev->bd_fsfreeze_count);
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_thaw);

/*
 * pseudo-fs
 */

static  __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
static struct kmem_cache *bdev_cachep __ro_after_init;

/* ->alloc_inode: allocate a bdev_inode from bdev_cachep, zero its embedded
 * block_device and run the security allocation hook.  Returns the embedded
 * VFS inode, or NULL if the allocation or the security hook fails.
 */
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

	if (!ei)
		return NULL;
	memset(&ei->bdev, 0, sizeof(ei->bdev));

	if (security_bdev_alloc(&ei->bdev)) {
		kmem_cache_free(bdev_cachep, ei);
		return NULL;
	}
	return &ei->vfs_inode;
}

/* ->free_inode: release everything hanging off the block device, then the
 * bdev_inode itself.
 */
static void bdev_free_inode(struct inode *inode)
{
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
	kfree(bdev->bd_meta_info);
	security_bdev_free(bdev);

	/* Only a non-partition bdev puts the bdi and frees the gendisk. */
	if (!bdev_is_partition(bdev)) {
		if (bdev->bd_disk && bdev->bd_disk->bdi)
			bdi_put(bdev->bd_disk->bdi);
		kfree(bdev->bd_disk);
	}

	/* Give back the minor of a device on the extended major. */
	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(bdev->bd_dev));

	kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

/* Slab constructor: one-time initialisation of the embedded VFS inode. */
static void init_once(void *data)
{
	struct bdev_inode *ei = data;

	inode_init_once(&ei->vfs_inode);
}

/* ->evict_inode: drop the page cache and buffers, then clear the inode. */
static void bdev_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here?
*/ clear_inode(inode); } static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, .drop_inode = generic_delete_inode, .evict_inode = bdev_evict_inode, }; static int bd_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_CGROUPWB; ctx->ops = &bdev_sops; return 0; } static struct file_system_type bd_type = { .name = "bdev", .init_fs_context = bd_init_fs_context, .kill_sb = kill_anon_super, }; struct super_block *blockdev_superblock __ro_after_init; static struct vfsmount *blockdev_mnt __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); blockdev_mnt = kern_mount(&bd_type); if (IS_ERR(blockdev_mnt)) panic("Cannot create bdev pseudo-fs"); blockdev_superblock = blockdev_mnt->mnt_sb; /* For writeback */ } struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) { struct block_device *bdev; struct inode *inode; inode = new_inode(blockdev_superblock); if (!inode) return NULL; inode->i_mode = S_IFBLK; inode->i_rdev = 0; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); mutex_init(&bdev->bd_holder_lock); atomic_set(&bdev->__bd_flags, partno); bdev->bd_mapping = &inode->i_data; bdev->bd_queue = disk->queue; if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO)) bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO); bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); return NULL; } bdev->bd_disk = disk; return bdev; } void bdev_set_nr_sectors(struct 
block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT); bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } void bdev_add(struct block_device *bdev, dev_t dev) { struct inode *inode = BD_INODE(bdev); if (bdev_stable_writes(bdev)) mapping_set_stable_writes(bdev->bd_mapping); bdev->bd_dev = dev; inode->i_rdev = dev; inode->i_ino = dev; insert_inode_hash(inode); } void bdev_unhash(struct block_device *bdev) { remove_inode_hash(BD_INODE(bdev)); } void bdev_drop(struct block_device *bdev) { iput(BD_INODE(bdev)); } long nr_blockdev_pages(void) { struct inode *inode; long ret = 0; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) ret += inode->i_mapping->nrpages; spin_unlock(&blockdev_superblock->s_inode_list_lock); return ret; } /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest * @holder: holder trying to claim @bdev * @hops: holder ops * * Test whether @bdev can be claimed by @holder. * * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ static bool bd_may_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); lockdep_assert_held(&bdev_lock); if (bdev->bd_holder) { /* * The same holder can always re-claim. */ if (bdev->bd_holder == holder) { if (WARN_ON_ONCE(bdev->bd_holder_ops != hops)) return false; return true; } return false; } /* * If the whole devices holder is set to bd_may_claim, a partition on * the device is claimed, but not the whole device. */ if (whole != bdev && whole->bd_holder && whole->bd_holder != bd_may_claim) return false; return true; } /** * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @holder: holder trying to claim @bdev * @hops: holder ops. * * Claim @bdev. 
This function fails if @bdev is already claimed by another * holder and waits if another claiming is in progress. return, the caller * has ownership of bd_claiming and bd_holder[s]. * * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); if (WARN_ON_ONCE(!holder)) return -EINVAL; retry: mutex_lock(&bdev_lock); /* if someone else claimed, fail */ if (!bd_may_claim(bdev, holder, hops)) { mutex_unlock(&bdev_lock); return -EBUSY; } /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming); DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); goto retry; } /* yay, all mine */ whole->bd_claiming = holder; mutex_unlock(&bdev_lock); return 0; } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); /* tell others that we're done */ BUG_ON(whole->bd_claiming != holder); whole->bd_claiming = NULL; wake_up_var(&whole->bd_claiming); } /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest * @holder: holder that has claimed @bdev * @hops: block device holder operations * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. 
*/
static void bd_finish_claiming(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	mutex_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, holder, hops));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	/* The holder and its ops are published together under bd_holder_lock. */
	mutex_lock(&bdev->bd_holder_lock);
	bdev->bd_holder = holder;
	bdev->bd_holder_ops = hops;
	mutex_unlock(&bdev->bd_holder_lock);
	bd_clear_claiming(whole, holder);
	mutex_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed. This can
 * also be used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
 */
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
	mutex_lock(&bdev_lock);
	bd_clear_claiming(bdev_whole(bdev), holder);
	mutex_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);

/*
 * Drop one claim of @holder on @bdev.  The holder counts of both @bdev and
 * its whole device are decremented; the holder pointers are cleared when
 * the respective count reaches zero.
 */
static void bd_end_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);
	bool unblock = false;

	/*
	 * Release a claim on the device.  The holder fields are protected with
	 * bdev_lock.  open_mutex is used to synchronize disk_holder unlinking.
	 */
	mutex_lock(&bdev_lock);
	WARN_ON_ONCE(bdev->bd_holder != holder);
	WARN_ON_ONCE(--bdev->bd_holders < 0);
	WARN_ON_ONCE(--whole->bd_holders < 0);
	if (!bdev->bd_holders) {
		mutex_lock(&bdev->bd_holder_lock);
		bdev->bd_holder = NULL;
		bdev->bd_holder_ops = NULL;
		mutex_unlock(&bdev->bd_holder_lock);
		if (bdev_test_flag(bdev, BD_WRITE_HOLDER))
			unblock = true;
	}
	if (!whole->bd_holders)
		whole->bd_holder = NULL;
	mutex_unlock(&bdev_lock);

	/*
	 * If this was the last claim, remove holder link and unblock evpoll if
	 * it was a write holder.
	 */
	if (unblock) {
		disk_unblock_events(bdev->bd_disk);
		bdev_clear_flag(bdev, BD_WRITE_HOLDER);
	}
}

/*
 * Write back and then kill the page cache of @bdev, and write its inode.
 * Warns if the device still has holders.
 */
static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

/*
 * Drop one opener of a whole device.  The mapping is flushed when the
 * opener count reaches zero; the driver's ->release(), if any, is called
 * on every put.
 */
static void blkdev_put_whole(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_openers))
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk);
}

/*
 * Open a whole device: call the driver's ->open() if there is one, take an
 * opener reference and rescan partitions when GD_NEED_PART_SCAN is set.
 * Returns 0 or a negative errno; on error no opener reference is left
 * behind.
 */
static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (disk->fops->open) {
		ret = disk->fops->open(disk, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			     test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	/* First opener: set up the initial block size. */
	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	atomic_inc(&bdev->bd_openers);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state)) {
		/*
		 * Only return scanning errors if we are called from contexts
		 * that explicitly want them, e.g. the BLKRRPART ioctl.
		 */
		ret = bdev_disk_changed(disk, false);
		if (ret && (mode & BLK_OPEN_STRICT_SCAN)) {
			blkdev_put_whole(bdev);
			return ret;
		}
	}
	return 0;
}

/*
 * Open a partition: the whole device is opened first, then an opener
 * reference is taken on @part.  Returns -ENXIO (after putting the whole
 * device again) if the partition has no sectors.
 */
static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
	struct gendisk *disk = part->bd_disk;
	int ret;

	ret = blkdev_get_whole(bdev_whole(part), mode);
	if (ret)
		return ret;

	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;

	/* First opener of this partition. */
	if (!atomic_read(&part->bd_openers)) {
		disk->open_partitions++;
		set_init_blocksize(part);
	}
	atomic_inc(&part->bd_openers);
	return 0;

out_blkdev_put:
	blkdev_put_whole(bdev_whole(part));
	return ret;
}

/*
 * Check whether device @dev may be opened with @mode by @holder.  Returns
 * the result of the device cgroup check if that fails, -EINVAL for an
 * invalid @mode/@holder combination, and 0 otherwise.
 */
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		return ret;

	/* Blocking writes requires exclusive opener */
	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
		return -EINVAL;

	/*
	 * We're using error pointers to indicate to ->release() when we
	 * failed to open that block device. Also this doesn't make sense.
	 */
	if (WARN_ON_ONCE(IS_ERR(holder)))
		return -EINVAL;

	return 0;
}

/*
 * Drop one opener of a partition and of its whole device.  The partition
 * mapping is flushed when its opener count reaches zero.
 */
static void blkdev_put_part(struct block_device *part)
{
	struct block_device *whole = bdev_whole(part);

	if (atomic_dec_and_test(&part->bd_openers)) {
		blkdev_flush_mapping(part);
		whole->bd_disk->open_partitions--;
	}
	blkdev_put_whole(whole);
}

/*
 * Look up the block device for @dev without opening it.  With @autoload
 * set and CONFIG_BLOCK_LEGACY_AUTOLOAD enabled, a failed lookup is retried
 * after blk_request_module().
 *
 * Returns the block_device with a reference on its bd_device, or NULL if
 * there is no such device or its kobject refcount was already zero.
 */
struct block_device *blkdev_get_no_open(dev_t dev, bool autoload)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (inode)
			pr_warn_ratelimited(
"block device autoloading is deprecated and will be removed.\n");
	}
	if (!inode)
		return NULL;

	/* switch from the inode reference to a device mode one: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}

/* Put the bd_device reference of @bdev. */
void blkdev_put_no_open(struct block_device *bdev)
{
	put_device(&bdev->bd_device);
}

/*
 * bd_writers bookkeeping, as used by the helpers below: each plain writer
 * increments the counter, each write-restricting opener decrements it, and
 * a negative value means that writes are blocked.
 */
static bool bdev_writes_blocked(struct block_device *bdev)
{
	return bdev->bd_writers < 0;
}

static void bdev_block_writes(struct block_device *bdev)
{
	bdev->bd_writers--;
}

static void bdev_unblock_writes(struct block_device *bdev)
{
	bdev->bd_writers++;
}

/*
 * Check @mode against the current write state of @bdev.  Everything is
 * allowed when bdev_allow_write_mounted is set.
 */
static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return true;
	/* Writes blocked? */
	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
		return false;
	/* Write restriction is refused while there are writers. */
	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
		return false;
	return true;
}

/* Account the write access implied by @mode in bd_writers. */
static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return;

	/* Claim exclusive or shared write access. */
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_block_writes(bdev);
	else if (mode & BLK_OPEN_WRITE)
		bdev->bd_writers++;
}

/*
 * True if private_data of @bdev_file points at the bdev_inode of the
 * file's own mapping host, i.e. the sentinel value bdev_fput() stores
 * once it has yielded the claim.
 */
static inline bool bdev_unclaimed(const struct file *bdev_file)
{
	return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
}

/*
 * Undo bdev_claim_write_access() for @bdev_file, based on the file's
 * f_mode.  Does nothing if the file is already marked unclaimed.
 */
static void bdev_yield_write_access(struct file *bdev_file)
{
	struct block_device *bdev;

	if (bdev_allow_write_mounted)
		return;

	if (bdev_unclaimed(bdev_file))
		return;

	bdev = file_bdev(bdev_file);

	if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
		bdev_unblock_writes(bdev);
	else if (bdev_file->f_mode & FMODE_WRITE)
		bdev->bd_writers--;
}

/**
 * bdev_open - open a block device
 * @bdev: block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 * @bdev_file: file for the block device
 *
 * Open the block device. If @holder is not %NULL, the block device is opened
 * with exclusive access.  Exclusive opens may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * zero on success, -errno on failure.
*/
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
	      const struct blk_holder_ops *hops, struct file *bdev_file)
{
	bool unblock_events = true;
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	/*
	 * An exclusive open starts a claim before anything else happens; a
	 * non-exclusive open must not carry BLK_OPEN_EXCL.
	 */
	if (holder) {
		mode |= BLK_OPEN_EXCL;
		ret = bd_prepare_to_claim(bdev, holder, hops);
		if (ret)
			return ret;
	} else {
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
			return -EIO;
	}

	disk_block_events(disk);

	mutex_lock(&disk->open_mutex);
	ret = -ENXIO;
	if (!disk_live(disk))
		goto abort_claiming;
	if (!try_module_get(disk->fops->owner))
		goto abort_claiming;
	ret = -EBUSY;
	if (!bdev_may_open(bdev, mode))
		goto put_module;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
	if (ret)
		goto put_module;
	bdev_claim_write_access(bdev, mode);
	if (holder) {
		bd_finish_claiming(bdev, holder, hops);

		/*
		 * Block event polling for write claims if requested.  Any write
		 * holder makes the write_holder state stick until all are
		 * released.  This is good enough and tracking individual
		 * writeable reference is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & BLK_OPEN_WRITE) &&
		    !bdev_test_flag(bdev, BD_WRITE_HOLDER) &&
		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
			bdev_set_flag(bdev, BD_WRITE_HOLDER);
			/* Events stay blocked for as long as the write holder exists. */
			unblock_events = false;
		}
	}
	mutex_unlock(&disk->open_mutex);

	if (unblock_events)
		disk_unblock_events(disk);

	/* The open succeeded: initialise the file for this block device. */
	bdev_file->f_flags |= O_LARGEFILE;
	bdev_file->f_mode |= FMODE_CAN_ODIRECT;
	if (bdev_nowait(bdev))
		bdev_file->f_mode |= FMODE_NOWAIT;
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
	bdev_file->f_mapping = bdev->bd_mapping;
	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
	/* private_data carries the holder (NULL for a non-exclusive open). */
	bdev_file->private_data = holder;

	return 0;
	/* Error unwind, in reverse order of acquisition. */
put_module:
	module_put(disk->fops->owner);
abort_claiming:
	if (holder)
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
	return ret;
}

/*
 * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk
 * associated with the floppy driver where it has allowed ioctls if the
 * file was opened for writing, but does not allow reads or writes.
 * Make sure that this quirk is reflected in @f_flags.
 *
 * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
*/
static unsigned blk_to_file_flags(blk_mode_t mode)
{
	unsigned int flags = 0;

	/* The first matching access mode wins; read+write is tested first. */
	if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
	    (BLK_OPEN_READ | BLK_OPEN_WRITE))
		flags |= O_RDWR;
	else if (mode & BLK_OPEN_WRITE_IOCTL)
		flags |= O_RDWR | O_WRONLY;
	else if (mode & BLK_OPEN_WRITE)
		flags |= O_WRONLY;
	else if (mode & BLK_OPEN_READ)
		flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
	else
		WARN_ON_ONCE(true);

	if (mode & BLK_OPEN_NDELAY)
		flags |= O_NDELAY;

	return flags;
}

/*
 * Open the block device with device number @dev and return a file for it.
 *
 * @mode, @holder and @hops are passed on to bdev_open().  Returns the new
 * file on success, or an ERR_PTR(): the error from bdev_permission(),
 * -ENXIO if the device cannot be found, the error from the file
 * allocation, or the error from bdev_open().
 */
struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				   const struct blk_holder_ops *hops)
{
	struct file *bdev_file;
	struct block_device *bdev;
	unsigned int flags;
	int ret;

	ret = bdev_permission(dev, mode, holder);
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev, true);
	if (!bdev)
		return ERR_PTR(-ENXIO);

	flags = blk_to_file_flags(mode);
	bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev),
			blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
	if (IS_ERR(bdev_file)) {
		blkdev_put_no_open(bdev);
		return bdev_file;
	}
	/* Extra inode reference taken for the new file. */
	ihold(BD_INODE(bdev));

	ret = bdev_open(bdev, mode, holder, hops, bdev_file);
	if (ret) {
		/* We failed to open the block device. Let ->release() know. */
		bdev_file->private_data = ERR_PTR(ret);
		fput(bdev_file);
		return ERR_PTR(ret);
	}
	return bdev_file;
}
EXPORT_SYMBOL(bdev_file_open_by_dev);

/*
 * Like bdev_file_open_by_dev(), but the device is named by @path and
 * resolved with lookup_bdev().  A successful open for writing of a device
 * that is read-only is undone again and reported as -EACCES.
 */
struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
				    void *holder,
				    const struct blk_holder_ops *hops)
{
	struct file *file;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	file = bdev_file_open_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
		if (bdev_read_only(file_bdev(file))) {
			fput(file);
			file = ERR_PTR(-EACCES);
		}
	}

	return file;
}
EXPORT_SYMBOL(bdev_file_open_by_path);

/*
 * End the claim recorded in private_data of @bdev_file, unless the file is
 * already marked unclaimed.  Must be called with the disk's open_mutex
 * held and with a valid (non-NULL, non-error) holder in private_data.
 */
static inline void bd_yield_claim(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;

	lockdep_assert_held(&bdev->bd_disk->open_mutex);

	if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
		return;

	if (!bdev_unclaimed(bdev_file))
		bd_end_claim(bdev, holder);
}

/*
 * Release an open block device file: yield write access and any claim,
 * drop the opener reference and the module reference, and finally put the
 * device reference.  If private_data holds an error pointer the open had
 * failed and only the device reference is put.
 */
void bdev_release(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;
	struct gendisk *disk = bdev->bd_disk;

	/* We failed to open that block device. */
	if (IS_ERR(holder))
		goto put_no_open;

	/*
	 * Sync early if it looks like we're the last one.  If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (could be several minute)
	 * syncs while holding the mutex.
	 */
	if (atomic_read(&bdev->bd_openers) == 1)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	bdev_yield_write_access(bdev_file);

	if (holder)
		bd_yield_claim(bdev_file);

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event.  This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev);
	else
		blkdev_put_whole(bdev);
	mutex_unlock(&disk->open_mutex);

	module_put(disk->fops->owner);
put_no_open:
	blkdev_put_no_open(bdev);
}

/**
 * bdev_fput - yield claim to the block device and put the file
 * @bdev_file: open block device
 *
 * Yield claim on the block device and put the file. Ensure that the
 * block device can be reclaimed before the file is closed which is a
 * deferred operation.
 */
void bdev_fput(struct file *bdev_file)
{
	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
		return;

	if (bdev_file->private_data) {
		struct block_device *bdev = file_bdev(bdev_file);
		struct gendisk *disk = bdev->bd_disk;

		mutex_lock(&disk->open_mutex);
		bdev_yield_write_access(bdev_file);
		bd_yield_claim(bdev_file);
		/*
		 * Tell release we already gave up our hold on the
		 * device and if write restrictions are available that
		 * we already gave up write access to the device.
		 */
		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
		mutex_unlock(&disk->open_mutex);
	}

	fput(bdev_file);
}
EXPORT_SYMBOL(bdev_fput);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Lookup the block device's dev_t at @pathname in the current
 * namespace if possible and return it in @dev.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise.
*/ int lookup_bdev(const char *pathname, dev_t *dev) { struct inode *inode; struct path path; int error; if (!pathname || !*pathname) return -EINVAL; error = kern_path(pathname, LOOKUP_FOLLOW, &path); if (error) return error; inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto out_path_put; error = -EACCES; if (!may_open_dev(&path)) goto out_path_put; *dev = inode->i_rdev; error = 0; out_path_put: path_put(&path); return error; } EXPORT_SYMBOL(lookup_bdev); /** * bdev_mark_dead - mark a block device as dead * @bdev: block device to operate on * @surprise: indicate a surprise removal * * Tell the file system that this devices or media is dead. If @surprise is set * to %true the device or media is already gone, if not we are preparing for an * orderly removal. * * This calls into the file system, which then typicall syncs out all dirty data * and writes back inodes and then invalidates any cached data in the inodes on * the file system. In addition we also invalidate the block device mapping. */ void bdev_mark_dead(struct block_device *bdev, bool surprise) { mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) bdev->bd_holder_ops->mark_dead(bdev, surprise); else { mutex_unlock(&bdev->bd_holder_lock); sync_blockdev(bdev); } invalidate_bdev(bdev); } /* * New drivers should not use this directly. There are some drivers however * that needs this for historical reasons. For example, the DASD driver has * historically had a shutdown to offline mode that doesn't actually remove the * gendisk that otherwise looks a lot like a safe device removal. 
*/
EXPORT_SYMBOL_GPL(bdev_mark_dead);

/*
 * Walk all inodes of the bdev superblock and, for every block device that
 * has cached pages and at least one opener, either start writeback of its
 * mapping (@wait == false) or wait for writeback to finish (@wait == true).
 */
void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		/* Skip inodes that are being set up or torn down, or have no pages. */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * s_inode_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mapping so
			 * that applications can catch the writeback error using
			 * fsync(2). See filemap_fdatawait_keep_errors() for
			 * details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		/* Retake the list lock before moving on to the next inode. */
		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}

/*
 * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices.
 *
 * Also sets stat->blksize from bdev_io_min().  Returns silently without
 * touching @stat if the block device cannot be looked up.
 */
void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask)
{
	struct block_device *bdev;

	/*
	 * Note that d_backing_inode() returns the block device node inode, not
	 * the block device's internal inode.  Therefore it is *not* valid to
	 * use I_BDEV() here; the block device has to be looked up by i_rdev
	 * instead.
	 */
	bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false);
	if (!bdev)
		return;

	if (request_mask & STATX_DIOALIGN) {
		stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
		stat->dio_offset_align = bdev_logical_block_size(bdev);
		stat->result_mask |= STATX_DIOALIGN;
	}

	if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) {
		struct request_queue *bd_queue = bdev->bd_queue;

		generic_fill_statx_atomic_writes(stat,
			queue_atomic_write_unit_min_bytes(bd_queue),
			queue_atomic_write_unit_max_bytes(bd_queue),
			0);
	}

	stat->blksize = bdev_io_min(bdev);

	blkdev_put_no_open(bdev);
}

/* A disk counts as live while the inode of its part0 is still hashed. */
bool disk_live(struct gendisk *disk)
{
	return !inode_unhashed(BD_INODE(disk->part0));
}
EXPORT_SYMBOL_GPL(disk_live);

/* Return 1 << i_blkbits of the inode of @bdev. */
unsigned int block_size(struct block_device *bdev)
{
	return 1 << BD_INODE(bdev)->i_blkbits;
}
EXPORT_SYMBOL_GPL(block_size);

/*
 * Parse the "bdev_allow_write_mounted=" boot option as a boolean into
 * bdev_allow_write_mounted; an unparsable value only triggers a warning.
 * Always returns 1.
 */
static int __init setup_bdev_allow_write_mounted(char *str)
{
	if (kstrtobool(str, &bdev_allow_write_mounted))
		pr_warn("Invalid option string for bdev_allow_write_mounted:"
			" '%s'\n", str);
	return 1;
}
__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
5 10 861 860 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel Implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (C) 1999-2001 Cisco, Motorola
 *
 * This file is part of the SCTP kernel implementation
 *
 * These are the definitions needed for the command object.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *   La Monte H.P. Yarroll <piggy@acm.org>
 *   Karl Knutson <karl@athena.chicago.il.us>
 *   Ardelle Fan <ardelle.fan@intel.com>
 *   Sridhar Samudrala <sri@us.ibm.com>
 */

#ifndef __net_sctp_command_h__
#define __net_sctp_command_h__

#include <net/sctp/constants.h>
#include <net/sctp/structs.h>


/*
 * Verbs of the command object: each struct sctp_cmd carries one of these
 * together with a union sctp_arg argument.  SCTP_CMD_NOP is 0 and
 * SCTP_CMD_LAST is the final enumerator.
 */
enum sctp_verb {
	SCTP_CMD_NOP = 0,	/* Do nothing. */
	SCTP_CMD_NEW_ASOC,	/* Register a new association.  */
	SCTP_CMD_DELETE_TCB,	/* Delete the current association. */
	SCTP_CMD_NEW_STATE,	/* Enter a new state.  */
	SCTP_CMD_REPORT_TSN,	/* Record the arrival of a TSN.  */
	SCTP_CMD_GEN_SACK,	/* Send a Selective ACK (maybe).  */
	SCTP_CMD_PROCESS_SACK,	/* Process an inbound SACK.  */
	SCTP_CMD_GEN_INIT_ACK,	/* Generate an INIT ACK chunk.  */
	SCTP_CMD_PEER_INIT,	/* Process an INIT from the peer.  */
	SCTP_CMD_GEN_COOKIE_ECHO, /* Generate a COOKIE ECHO chunk. */
	SCTP_CMD_CHUNK_ULP,	/* Send a chunk to the sockets layer.  */
	SCTP_CMD_EVENT_ULP,	/* Send a notification to the sockets layer. */
	SCTP_CMD_REPLY,		/* Send a chunk to our peer.  */
	SCTP_CMD_SEND_PKT,	/* Send a full packet to our peer.  */
	SCTP_CMD_RETRAN,	/* Mark a transport for retransmission.  */
	SCTP_CMD_ECN_CE,        /* Do delayed CE processing.   */
	SCTP_CMD_ECN_ECNE,	/* Do delayed ECNE processing. */
	SCTP_CMD_ECN_CWR,	/* Do delayed CWR processing.  */
	SCTP_CMD_TIMER_START,	/* Start a timer.  */
	SCTP_CMD_TIMER_START_ONCE, /* Start a timer once. */
	SCTP_CMD_TIMER_RESTART,	/* Restart a timer. */
	SCTP_CMD_TIMER_STOP,	/* Stop a timer. */
	SCTP_CMD_INIT_CHOOSE_TRANSPORT, /* Choose transport for an INIT. */
	SCTP_CMD_INIT_COUNTER_RESET, /* Reset init counter. */
	SCTP_CMD_INIT_COUNTER_INC,   /* Increment init counter. */
	SCTP_CMD_INIT_RESTART,  /* High level, do init timer work. */
	SCTP_CMD_COOKIEECHO_RESTART,  /* High level, do cookie-echo timer work. */
	SCTP_CMD_INIT_FAILED,   /* High level, do init failure work. */
	SCTP_CMD_REPORT_DUP,	/* Report a duplicate TSN.  */
	SCTP_CMD_STRIKE,	/* Mark a strike against a transport.  */
	SCTP_CMD_HB_TIMERS_START,    /* Start the heartbeat timers. */
	SCTP_CMD_HB_TIMER_UPDATE,    /* Update a heartbeat timer.  */
	SCTP_CMD_HB_TIMERS_STOP,     /* Stop the heartbeat timers.  */
	SCTP_CMD_PROBE_TIMER_UPDATE, /* Update a probe timer.  */
	SCTP_CMD_TRANSPORT_HB_SENT,  /* Reset the status of a transport. */
	SCTP_CMD_TRANSPORT_IDLE,     /* Do manipulations on an idle transport. */
	SCTP_CMD_TRANSPORT_ON,       /* Mark the transport as active. */
	SCTP_CMD_REPORT_ERROR,   /* Pass this error back out of the sm. */
	SCTP_CMD_REPORT_BAD_TAG, /* Verification tags didn't match. */
	SCTP_CMD_PROCESS_CTSN,   /* Side effect from shutdown. */
	SCTP_CMD_ASSOC_FAILED,	 /* Handle association failure. */
	SCTP_CMD_DISCARD_PACKET, /* Discard the whole packet. */
	SCTP_CMD_GEN_SHUTDOWN,   /* Generate a SHUTDOWN chunk. */
	SCTP_CMD_PURGE_OUTQUEUE, /* Purge all data waiting to be sent. */
	SCTP_CMD_SETUP_T2,       /* Hi-level, setup T2-shutdown parms.  */
	SCTP_CMD_RTO_PENDING,	 /* Set transport's rto_pending. */
	SCTP_CMD_PART_DELIVER,	 /* Partial data delivery considerations. */
	SCTP_CMD_RENEGE,         /* Renege data on an association. */
	SCTP_CMD_SETUP_T4,	 /* ADDIP, setup T4 RTO timer parms. */
	SCTP_CMD_PROCESS_OPERR,  /* Process an ERROR chunk. */
	SCTP_CMD_REPORT_FWDTSN,	 /* Report new cumulative TSN Ack. */
	SCTP_CMD_PROCESS_FWDTSN, /* Skips were reported, so process further. */
	SCTP_CMD_CLEAR_INIT_TAG, /* Clears association peer's inittag. */
	SCTP_CMD_DEL_NON_PRIMARY, /* Removes non-primary peer transports. */
	SCTP_CMD_T3_RTX_TIMERS_STOP, /* Stops T3-rtx pending timers. */
	SCTP_CMD_FORCE_PRIM_RETRAN,  /* Forces retrans. over primary path. */
	SCTP_CMD_SET_SK_ERR,	 /* Set sk_err. */
	SCTP_CMD_ASSOC_CHANGE,	 /* Generate and send assoc_change event. */
	SCTP_CMD_ADAPTATION_IND, /* Generate and send adaptation event. */
	SCTP_CMD_PEER_NO_AUTH,   /* Generate and send authentication event. */
	SCTP_CMD_ASSOC_SHKEY,    /* Generate the association shared keys. */
	SCTP_CMD_T1_RETRAN,	 /* Mark for retransmission after T1 timeout. */
	SCTP_CMD_UPDATE_INITTAG, /* Update peer inittag. */
	SCTP_CMD_SEND_MSG,	 /* Send the whole user message. */
	SCTP_CMD_PURGE_ASCONF_QUEUE, /* Purge all asconf queues. */
	SCTP_CMD_SET_ASOC,	 /* Restore association context. */
	SCTP_CMD_LAST
};

/* How many commands can you put in a struct sctp_cmd_seq?
 * This is a rather arbitrary number, ideally derived from a careful
 * analysis of the state functions, but in reality just taken from
 * thin air in the hopes that we don't trigger a kernel panic.
*/ #define SCTP_MAX_NUM_COMMANDS 20 union sctp_arg { void *zero_all; /* Set to NULL to clear the entire union */ __s32 i32; __u32 u32; __be32 be32; __u16 u16; __u8 u8; int error; __be16 err; enum sctp_state state; enum sctp_event_timeout to; struct sctp_chunk *chunk; struct sctp_association *asoc; struct sctp_transport *transport; struct sctp_bind_addr *bp; struct sctp_init_chunk *init; struct sctp_ulpevent *ulpevent; struct sctp_packet *packet; struct sctp_sackhdr *sackh; struct sctp_datamsg *msg; }; /* We are simulating ML type constructors here. * * SCTP_ARG_CONSTRUCTOR(NAME, TYPE, ELT) builds a function called * SCTP_NAME() which takes an argument of type TYPE and returns an * union sctp_arg. It does this by inserting the sole argument into * the ELT union element of a local union sctp_arg. * * E.g., SCTP_ARG_CONSTRUCTOR(I32, __s32, i32) builds SCTP_I32(arg), * which takes an __s32 and returns a union sctp_arg containing the * __s32. So, after foo = SCTP_I32(arg), foo.i32 == arg. */ #define SCTP_ARG_CONSTRUCTOR(name, type, elt) \ static inline union sctp_arg \ SCTP_## name (type arg) \ { union sctp_arg retval;\ retval.zero_all = NULL;\ retval.elt = arg;\ return retval;\ } SCTP_ARG_CONSTRUCTOR(I32, __s32, i32) SCTP_ARG_CONSTRUCTOR(U32, __u32, u32) SCTP_ARG_CONSTRUCTOR(BE32, __be32, be32) SCTP_ARG_CONSTRUCTOR(U16, __u16, u16) SCTP_ARG_CONSTRUCTOR(U8, __u8, u8) SCTP_ARG_CONSTRUCTOR(ERROR, int, error) SCTP_ARG_CONSTRUCTOR(PERR, __be16, err) /* protocol error */ SCTP_ARG_CONSTRUCTOR(STATE, enum sctp_state, state) SCTP_ARG_CONSTRUCTOR(TO, enum sctp_event_timeout, to) SCTP_ARG_CONSTRUCTOR(CHUNK, struct sctp_chunk *, chunk) SCTP_ARG_CONSTRUCTOR(ASOC, struct sctp_association *, asoc) SCTP_ARG_CONSTRUCTOR(TRANSPORT, struct sctp_transport *, transport) SCTP_ARG_CONSTRUCTOR(BA, struct sctp_bind_addr *, bp) SCTP_ARG_CONSTRUCTOR(PEER_INIT, struct sctp_init_chunk *, init) SCTP_ARG_CONSTRUCTOR(ULPEVENT, struct sctp_ulpevent *, ulpevent) SCTP_ARG_CONSTRUCTOR(PACKET, struct 
sctp_packet *, packet) SCTP_ARG_CONSTRUCTOR(SACKH, struct sctp_sackhdr *, sackh) SCTP_ARG_CONSTRUCTOR(DATAMSG, struct sctp_datamsg *, msg) static inline union sctp_arg SCTP_FORCE(void) { return SCTP_I32(1); } static inline union sctp_arg SCTP_NOFORCE(void) { return SCTP_I32(0); } static inline union sctp_arg SCTP_NULL(void) { union sctp_arg retval; retval.zero_all = NULL; return retval; } struct sctp_cmd { union sctp_arg obj; enum sctp_verb verb; }; struct sctp_cmd_seq { struct sctp_cmd cmds[SCTP_MAX_NUM_COMMANDS]; struct sctp_cmd *last_used_slot; struct sctp_cmd *next_cmd; }; /* Initialize a block of memory as a command sequence. * Return 0 if the initialization fails. */ static inline int sctp_init_cmd_seq(struct sctp_cmd_seq *seq) { /* cmds[] is filled backwards to simplify the overflow BUG() check */ seq->last_used_slot = seq->cmds + SCTP_MAX_NUM_COMMANDS; seq->next_cmd = seq->last_used_slot; return 1; /* We always succeed. */ } /* Add a command to an struct sctp_cmd_seq. * * Use the SCTP_* constructors defined by SCTP_ARG_CONSTRUCTOR() above * to wrap data which goes in the obj argument. */ static inline void sctp_add_cmd_sf(struct sctp_cmd_seq *seq, enum sctp_verb verb, union sctp_arg obj) { struct sctp_cmd *cmd = seq->last_used_slot - 1; BUG_ON(cmd < seq->cmds); cmd->verb = verb; cmd->obj = obj; seq->last_used_slot = cmd; } /* Return the next command structure in an sctp_cmd_seq. * Return NULL at the end of the sequence. */ static inline struct sctp_cmd *sctp_next_cmd(struct sctp_cmd_seq *seq) { if (seq->next_cmd <= seq->last_used_slot) return NULL; return --seq->next_cmd; } #endif /* __net_sctp_command_h__ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0 */ /* * Type definitions for the multi-level security (MLS) policy. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * Support for enhanced MLS infrastructure. * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. */ #ifndef _SS_MLS_TYPES_H_ #define _SS_MLS_TYPES_H_ #include "security.h" #include "ebitmap.h" struct mls_level { u32 sens; /* sensitivity */ struct ebitmap cat; /* category set */ }; struct mls_range { struct mls_level level[2]; /* low == level[0], high == level[1] */ }; static inline int mls_level_eq(const struct mls_level *l1, const struct mls_level *l2) { return ((l1->sens == l2->sens) && ebitmap_equal(&l1->cat, &l2->cat)); } static inline int mls_level_dom(const struct mls_level *l1, const struct mls_level *l2) { return ((l1->sens >= l2->sens) && ebitmap_contains(&l1->cat, &l2->cat, 0)); } #define mls_level_incomp(l1, l2) \ (!mls_level_dom((l1), (l2)) && !mls_level_dom((l2), (l1))) #define mls_level_between(l1, l2, l3) \ (mls_level_dom((l1), (l2)) && mls_level_dom((l3), (l1))) #define mls_range_contains(r1, r2) \ (mls_level_dom(&(r2).level[0], &(r1).level[0]) && \ mls_level_dom(&(r1).level[1], &(r2).level[1])) #endif /* _SS_MLS_TYPES_H_ */
2 1 1 1 14 2 1 139 1 13 2 22 47 10 10 29 1 57 58 58 57 4 79 79 47 32 62 21 2 103 71 17 156 64 1 79 143 176 176 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 
500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2008 Red Hat, Inc. All rights reserved. * Copyright 2008 Ian Kent <raven@themaw.net> */ #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/compat.h> #include <linux/fdtable.h> #include <linux/magic.h> #include <linux/nospec.h> #include "autofs_i.h" /* * This module implements an interface for routing autofs ioctl control * commands via a miscellaneous device file. * * The alternate interface is needed because we need to be able open * an ioctl file descriptor on an autofs mount that may be covered by * another mount. 
This situation arises when starting automount(8) * or other user space daemon which uses direct mounts or offset * mounts (used for autofs lazy mount/umount of nested mount trees), * which have been left busy at service shutdown. */ typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, struct autofs_dev_ioctl *); static int check_name(const char *name) { if (!strchr(name, '/')) return -EINVAL; return 0; } /* * Check a string doesn't overrun the chunk of * memory we copied from user land. */ static int invalid_str(char *str, size_t size) { if (memchr(str, 0, size)) return 0; return -EINVAL; } /* * Check that the user compiled against correct version of autofs * misc device code. * * As well as checking the version compatibility this always copies * the kernel interface version out. */ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) { int err = 0; if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { pr_warn("ioctl control interface version mismatch: " "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n", AUTOFS_DEV_IOCTL_VERSION_MAJOR, AUTOFS_DEV_IOCTL_VERSION_MINOR, param->ver_major, param->ver_minor, cmd); err = -EINVAL; } /* Fill in the kernel version. */ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; return err; } /* * Copy parameter control struct, including a possible path allocated * at the end of the struct. 
*/ static struct autofs_dev_ioctl * copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { struct autofs_dev_ioctl tmp, *res; if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE)) return ERR_PTR(-EFAULT); if (tmp.size < AUTOFS_DEV_IOCTL_SIZE) return ERR_PTR(-EINVAL); if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX) return ERR_PTR(-ENAMETOOLONG); res = memdup_user(in, tmp.size); if (!IS_ERR(res)) res->size = tmp.size; return res; } static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) { kfree(param); } /* * Check sanity of parameter control fields and if a path is present * check that it is terminated and contains at least one "/". */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { unsigned int inr = _IOC_NR(cmd); int err; err = check_dev_ioctl_version(cmd, param); if (err) { pr_warn("invalid device control module version " "supplied for cmd(0x%08x)\n", cmd); goto out; } if (param->size > AUTOFS_DEV_IOCTL_SIZE) { err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE); if (err) { pr_warn( "path string terminator missing for cmd(0x%08x)\n", cmd); goto out; } /* Setting the per-dentry expire timeout requires a trailing * path component, ie. no '/', so invert the logic of the * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD. */ err = check_name(param->path); if (inr == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) err = err ? 0 : -EINVAL; if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", cmd); goto out; } } else { if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD || inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD || inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) { err = -EINVAL; goto out; } } err = 0; out: return err; } /* Return autofs dev ioctl version */ static int autofs_dev_ioctl_version(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { /* This should have already been set. 
*/ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; return 0; } /* Return autofs module protocol version */ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->protover.version = sbi->version; return 0; } /* Return autofs module protocol sub version */ static int autofs_dev_ioctl_protosubver(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->protosubver.sub_version = sbi->sub_version; return 0; } /* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, int test(const struct path *path, void *data), void *data) { struct path path; int err; err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path); if (err) return err; err = -ENOENT; while (path.dentry == path.mnt->mnt_root) { if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); *res = path; err = 0; break; } } if (!follow_up(&path)) break; } path_put(&path); return err; } static int test_by_dev(const struct path *path, void *p) { return path->dentry->d_sb->s_dev == *(dev_t *)p; } static int test_by_type(const struct path *path, void *p) { struct autofs_info *ino = autofs_dentry_ino(path->dentry); return ino && ino->sbi->type & *(unsigned *)p; } /* * Open a file descriptor on the autofs mount point corresponding * to the given path and device number (aka. new_encode_dev(sb->s_dev)). 
*/ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) { int err, fd; fd = get_unused_fd_flags(O_CLOEXEC); if (likely(fd >= 0)) { struct file *filp; struct path path; err = find_autofs_mount(name, &path, test_by_dev, &devid); if (err) goto out; filp = dentry_open(&path, O_RDONLY, current_cred()); path_put(&path); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; } fd_install(fd, filp); } return fd; out: put_unused_fd(fd); return err; } /* Open a file descriptor on an autofs mount point */ static int autofs_dev_ioctl_openmount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { const char *path; dev_t devid; int err, fd; /* param->path has been checked in validate_dev_ioctl() */ if (!param->openmount.devid) return -EINVAL; param->ioctlfd = -1; path = param->path; devid = new_decode_dev(param->openmount.devid); err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); if (unlikely(fd < 0)) { err = fd; goto out; } param->ioctlfd = fd; out: return err; } /* Close file descriptor allocated above (user can also use close(2)). */ static int autofs_dev_ioctl_closemount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { return close_fd(param->ioctlfd); } /* * Send "ready" status for an existing wait (either a mount or an expire * request). */ static int autofs_dev_ioctl_ready(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_wqt_t token; token = (autofs_wqt_t) param->ready.token; return autofs_wait_release(sbi, token, 0); } /* * Send "fail" status for an existing wait (either a mount or an expire * request). */ static int autofs_dev_ioctl_fail(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_wqt_t token; int status; token = (autofs_wqt_t) param->fail.token; status = param->fail.status < 0 ? 
param->fail.status : -ENOENT; return autofs_wait_release(sbi, token, status); } /* * Set the pipe fd for kernel communication to the daemon. * * Normally this is set at mount using an option but if we * are reconnecting to a busy mount then we need to use this * to tell the autofs mount about the new kernel pipe fd. In * order to protect mounts against incorrectly setting the * pipefd we also require that the autofs mount be catatonic. * * This also sets the process group id used to identify the * controlling process (eg. the owning automount(8) daemon). */ static int autofs_dev_ioctl_setpipefd(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { int pipefd; int err = 0; struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; pipefd = param->setpipefd.pipefd; mutex_lock(&sbi->wq_mutex); if (!(sbi->flags & AUTOFS_SBI_CATATONIC)) { mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { struct file *pipe; new_pid = get_task_pid(current, PIDTYPE_PGID); if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { pr_warn("not allowed to change PID namespace\n"); err = -EINVAL; goto out; } pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; } if (autofs_prepare_pipe(pipe) < 0) { err = -EPIPE; fput(pipe); goto out; } swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->flags &= ~AUTOFS_SBI_CATATONIC; } out: put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } /* * Make the autofs mount point catatonic, no longer responsive to * mount requests. Also closes the kernel pipe file descriptor. */ static int autofs_dev_ioctl_catatonic(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_catatonic_mode(sbi); return 0; } /* * Set the autofs mount expire timeout. * * There are two places an expire timeout can be set, in the autofs * super block info. 
(this is all that's needed for direct and offset * mounts because there's a distinct mount corresponding to each of * these) and per-dentry within within the dentry info. If a per-dentry * timeout is set it will override the expire timeout set in the parent * autofs super block info. * * If setting the autofs super block expire timeout the autofs_dev_ioctl * size field will be equal to the autofs_dev_ioctl structure size. If * setting the per-dentry expire timeout the mount point name is passed * in the autofs_dev_ioctl path field and the size field updated to * reflect this. * * Setting the autofs mount expire timeout sets the timeout in the super * block info. struct. Setting the per-dentry timeout does a little more. * If the timeout is equal to -1 the per-dentry timeout (and flag) is * cleared which reverts to using the super block timeout, otherwise if * timeout is 0 the timeout is set to this value and the flag is left * set which disables expiration for the mount point, lastly the flag * and the timeout are set enabling the dentry to use this timeout. */ static int autofs_dev_ioctl_timeout(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { unsigned long timeout = param->timeout.timeout; /* If setting the expire timeout for an individual indirect * mount point dentry the mount trailing component path is * placed in param->path and param->size adjusted to account * for it otherwise param->size it is set to the structure * size. 
*/ if (param->size == AUTOFS_DEV_IOCTL_SIZE) { param->timeout.timeout = sbi->exp_timeout / HZ; sbi->exp_timeout = timeout * HZ; } else { struct dentry *base = fp->f_path.dentry; int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; struct dentry *dentry; struct autofs_info *ino; if (!autofs_type_indirect(sbi->type)) return -EINVAL; /* An expire timeout greater than the superblock timeout * could be a problem at shutdown but the super block * timeout itself can change so all we can really do is * warn the user. */ if (timeout >= sbi->exp_timeout) pr_warn("per-mount expire timeout is greater than " "the parent autofs mount timeout which could " "prevent shutdown\n"); dentry = try_lookup_noperm(&QSTR_LEN(param->path, path_len), base); if (IS_ERR_OR_NULL(dentry)) return dentry ? PTR_ERR(dentry) : -ENOENT; ino = autofs_dentry_ino(dentry); if (!ino) { dput(dentry); return -ENOENT; } if (ino->exp_timeout && ino->flags & AUTOFS_INF_EXPIRE_SET) param->timeout.timeout = ino->exp_timeout / HZ; else param->timeout.timeout = sbi->exp_timeout / HZ; if (timeout == -1) { /* Revert to using the super block timeout */ ino->flags &= ~AUTOFS_INF_EXPIRE_SET; ino->exp_timeout = 0; } else { /* Set the dentry expire flag and timeout. * * If timeout is 0 it will prevent the expire * of this particular automount. */ ino->flags |= AUTOFS_INF_EXPIRE_SET; ino->exp_timeout = timeout * HZ; } dput(dentry); } return 0; } /* * Return the uid and gid of the last request for the mount * * When reconstructing an autofs mount tree with active mounts * we need to re-connect to mounts that may have used the original * process uid and gid (or string variations of them) for mount * lookups within the map entry. 
*/ static int autofs_dev_ioctl_requester(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct autofs_info *ino; struct path path; dev_t devid; int err = -ENOENT; /* param->path has been checked in validate_dev_ioctl() */ devid = sbi->sb->s_dev; param->requester.uid = param->requester.gid = -1; err = find_autofs_mount(param->path, &path, test_by_dev, &devid); if (err) goto out; ino = autofs_dentry_ino(path.dentry); if (ino) { err = 0; autofs_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); spin_unlock(&sbi->fs_lock); } path_put(&path); out: return err; } /* * Call repeatedly until it returns -EAGAIN, meaning there's nothing * more that can be done. */ static int autofs_dev_ioctl_expire(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct vfsmount *mnt; int how; how = param->expire.how; mnt = fp->f_path.mnt; return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ static int autofs_dev_ioctl_askumount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->askumount.may_umount = 0; if (may_umount(fp->f_path.mnt)) param->askumount.may_umount = 1; return 0; } /* * Check if the given path is a mountpoint. * * If we are supplied with the file descriptor of an autofs * mount we're looking for a specific mount. In this case * the path is considered a mountpoint if it is itself a * mountpoint or contains a mount, such as a multi-mount * without a root mount. In this case we return 1 if the * path is a mount point and the super magic of the covering * mount if there is one or 0 if it isn't a mountpoint. * * If we aren't supplied with a file descriptor then we * lookup the path and check if it is the root of a mount. 
* If a type is given we are looking for a particular autofs * mount and if we don't find a match we return fail. If the * located path is the root of a mount we return 1 along with * the super magic of the mount or 0 otherwise. * * In both cases the device number (as returned by * new_encode_dev()) is also returned. */ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct path path; const char *name; unsigned int type; unsigned int devid, magic; int err = -ENOENT; /* param->path has been checked in validate_dev_ioctl() */ name = param->path; type = param->ismountpoint.in.type; param->ismountpoint.out.devid = devid = 0; param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT, &path); else err = find_autofs_mount(name, &path, test_by_type, &type); if (err) goto out; devid = new_encode_dev(path.dentry->d_sb->s_dev); err = 0; if (path.mnt->mnt_root == path.dentry) { err = 1; magic = path.dentry->d_sb->s_magic; } } else { dev_t dev = sbi->sb->s_dev; err = find_autofs_mount(name, &path, test_by_dev, &dev); if (err) goto out; devid = new_encode_dev(dev); err = path_has_submounts(&path); if (follow_down_one(&path)) magic = path.dentry->d_sb->s_magic; } param->ismountpoint.out.devid = devid; param->ismountpoint.out.magic = magic; path_put(&path); out: return err; } /* * Our range of ioctl numbers isn't 0 based so we need to shift * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table * lookup. 
*/ #define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { static const ioctl_fn _ioctls[] = { autofs_dev_ioctl_version, autofs_dev_ioctl_protover, autofs_dev_ioctl_protosubver, autofs_dev_ioctl_openmount, autofs_dev_ioctl_closemount, autofs_dev_ioctl_ready, autofs_dev_ioctl_fail, autofs_dev_ioctl_setpipefd, autofs_dev_ioctl_catatonic, autofs_dev_ioctl_timeout, autofs_dev_ioctl_requester, autofs_dev_ioctl_expire, autofs_dev_ioctl_askumount, autofs_dev_ioctl_ismountpoint, }; unsigned int idx = cmd_idx(cmd); if (idx >= ARRAY_SIZE(_ioctls)) return NULL; idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls)); return _ioctls[idx]; } /* ioctl dispatcher */ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) { struct autofs_dev_ioctl *param; struct file *fp; struct autofs_sb_info *sbi; unsigned int cmd_first, cmd; ioctl_fn fn = NULL; int err = 0; cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); cmd = _IOC_NR(command); if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) { return -ENOTTY; } /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD */ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && !capable(CAP_SYS_ADMIN)) return -EPERM; /* Copy the parameters into kernel space. */ param = copy_dev_ioctl(user); if (IS_ERR(param)) return PTR_ERR(param); err = validate_dev_ioctl(command, param); if (err) goto out; fn = lookup_dev_ioctl(cmd); if (!fn) { pr_warn("unknown command 0x%08x\n", command); err = -ENOTTY; goto out; } fp = NULL; sbi = NULL; /* * For obvious reasons the openmount can't have a file * descriptor yet. We don't take a reference to the * file during close to allow for immediate release, * and the same for retrieving ioctl version. 
*/ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { struct super_block *sb; fp = fget(param->ioctlfd); if (!fp) { if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) goto cont; err = -EBADF; goto out; } sb = file_inode(fp)->i_sb; if (sb->s_type != &autofs_fs_type) { err = -EINVAL; fput(fp); goto out; } sbi = autofs_sbi(sb); /* * Admin needs to be able to set the mount catatonic in * order to be able to perform the re-open. */ if (!autofs_oz_mode(sbi) && cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { err = -EACCES; fput(fp); goto out; } } cont: err = fn(fp, sbi, param); if (fp) fput(fp); if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) err = -EFAULT; out: free_dev_ioctl(param); return err; } static long autofs_dev_ioctl(struct file *file, unsigned int command, unsigned long u) { int err; err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); return (long) err; } #ifdef CONFIG_COMPAT static long autofs_dev_ioctl_compat(struct file *file, unsigned int command, unsigned long u) { return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u)); } #else #define autofs_dev_ioctl_compat NULL #endif static const struct file_operations _dev_ioctl_fops = { .unlocked_ioctl = autofs_dev_ioctl, .compat_ioctl = autofs_dev_ioctl_compat, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, .fops = &_dev_ioctl_fops, .mode = 0644, }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); MODULE_ALIAS("devname:autofs"); /* Register/deregister misc character device */ int __init autofs_dev_ioctl_init(void) { int r; r = misc_register(&_autofs_dev_ioctl_misc); if (r) { pr_err("misc_register failed for control device\n"); return r; } return 0; } void autofs_dev_ioctl_exit(void) { misc_deregister(&_autofs_dev_ioctl_misc); }
173 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SHARED_IO_H #define _ASM_X86_SHARED_IO_H #include <linux/types.h> #define BUILDIO(bwl, bw, type) \ static __always_inline void __out##bwl(type value, u16 port) \ { \ asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ static __always_inline type __in##bwl(u16 port) \ { \ type value; \ asm volatile("in" #bwl " %w1, %" #bw "0" \ : "=a"(value) : "Nd"(port)); \ return value; \ } BUILDIO(b, b, u8) BUILDIO(w, w, u16) BUILDIO(l, , u32) #undef BUILDIO #define inb __inb #define inw __inw #define inl __inl #define outb __outb #define outw __outw #define outl __outl #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ELF_H #define _ASM_X86_ELF_H /* * ELF register definitions.. 
 */
#include <linux/thread_info.h>

#include <asm/ia32.h>
#include <asm/ptrace.h>
#include <asm/user.h>
#include <asm/auxvec.h>
#include <asm/fsgsbase.h>

typedef unsigned long elf_greg_t;

/* Number of elf_greg_t sized slots in struct user_regs_struct. */
#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
typedef elf_greg_t elf_gregset_t[ELF_NGREG];

typedef struct user_i387_struct elf_fpregset_t;

#ifdef __i386__

/* i386 relocation types */
#define R_386_NONE	0
#define R_386_32	1
#define R_386_PC32	2
#define R_386_GOT32	3
#define R_386_PLT32	4
#define R_386_COPY	5
#define R_386_GLOB_DAT	6
#define R_386_JMP_SLOT	7
#define R_386_RELATIVE	8
#define R_386_GOTOFF	9
#define R_386_GOTPC	10
#define R_386_NUM	11

/*
 * These are used to set parameters in the core dumps.
 */
#define ELF_CLASS	ELFCLASS32
#define ELF_DATA	ELFDATA2LSB
#define ELF_ARCH	EM_386

#else

/* x86-64 relocation types */
#define R_X86_64_NONE		0	/* No reloc */
#define R_X86_64_64		1	/* Direct 64 bit  */
#define R_X86_64_PC32		2	/* PC relative 32 bit signed */
#define R_X86_64_GOT32		3	/* 32 bit GOT entry */
#define R_X86_64_PLT32		4	/* 32 bit PLT address */
#define R_X86_64_COPY		5	/* Copy symbol at runtime */
#define R_X86_64_GLOB_DAT	6	/* Create GOT entry */
#define R_X86_64_JUMP_SLOT	7	/* Create PLT entry */
#define R_X86_64_RELATIVE	8	/* Adjust by program base */
#define R_X86_64_GOTPCREL	9	/* 32 bit signed pc relative
					   offset to GOT */
#define R_X86_64_GOTPCRELX	41
#define R_X86_64_REX_GOTPCRELX	42
#define R_X86_64_32		10	/* Direct 32 bit zero extended */
#define R_X86_64_32S		11	/* Direct 32 bit sign extended */
#define R_X86_64_16		12	/* Direct 16 bit zero extended */
#define R_X86_64_PC16		13	/* 16 bit sign extended pc relative */
#define R_X86_64_8		14	/* Direct 8 bit sign extended  */
#define R_X86_64_PC8		15	/* 8 bit sign extended pc relative */
#define R_X86_64_PC64		24	/* Place relative 64-bit signed */

/*
 * These are used to set parameters in the core dumps.
 */
#define ELF_CLASS	ELFCLASS64
#define ELF_DATA	ELFDATA2LSB
#define ELF_ARCH	EM_X86_64

#endif

#include <asm/vdso.h>

extern unsigned int vdso64_enabled;
extern unsigned int vdso32_enabled;

/*
 * This is used to ensure we don't load something for the wrong architecture.
 * True for ELF headers whose e_machine is EM_386 or EM_486.
 */
#define elf_check_arch_ia32(x) \
	(((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))

#include <asm/processor.h>

#ifdef CONFIG_X86_32
#include <asm/desc.h>

#define elf_check_arch(x)	elf_check_arch_ia32(x)

/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx
   contains a pointer to a function which might be registered using `atexit'.
   This provides a means for the dynamic linker to call DT_FINI functions for
   shared libraries that have been loaded before the code runs.

   A value of 0 tells we have no such handler.

   We might as well make sure everything else is cleared too (except for %esp),
   just to make things more deterministic.

   NOTE(review): _r is expanded without parentheses, so it has to be a
   plain pointer expression.
 */
#define ELF_PLAT_INIT(_r, load_addr)		\
	do {					\
	_r->bx = 0; _r->cx = 0; _r->dx = 0;	\
	_r->si = 0; _r->di = 0; _r->bp = 0;	\
	_r->ax = 0;				\
} while (0)

/*
 * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
 * now struct_user_regs, they are different)
 *
 * Slot 10 (gs) is read with savesegment() rather than taken from regs.
 *
 * NOTE(review): the expansion already ends in a semicolon, so call
 * sites may be written without one; check the users before removing it.
 */

#define ELF_CORE_COPY_REGS(pr_reg, regs)	\
do {						\
	pr_reg[0] = regs->bx;			\
	pr_reg[1] = regs->cx;			\
	pr_reg[2] = regs->dx;			\
	pr_reg[3] = regs->si;			\
	pr_reg[4] = regs->di;			\
	pr_reg[5] = regs->bp;			\
	pr_reg[6] = regs->ax;			\
	pr_reg[7] = regs->ds;			\
	pr_reg[8] = regs->es;			\
	pr_reg[9] = regs->fs;			\
	savesegment(gs, pr_reg[10]);		\
	pr_reg[11] = regs->orig_ax;		\
	pr_reg[12] = regs->ip;			\
	pr_reg[13] = regs->cs;			\
	pr_reg[14] = regs->flags;		\
	pr_reg[15] = regs->sp;			\
	pr_reg[16] = regs->ss;			\
} while (0);

#define ELF_PLATFORM	(utsname()->machine)
/* Nothing to do on a 32-bit kernel. */
#define set_personality_64bit()	do { } while (0)

#else /* CONFIG_X86_32 */

/*
 * This is used to ensure we don't load something for the wrong architecture.
 */
#define elf_check_arch(x)			\
	((x)->e_machine == EM_X86_64)

/*
 * A compat binary is accepted if it is an ia32 one and
 * ia32_enabled_verbose() allows it, or if it is an EM_X86_64 one and
 * CONFIG_X86_X32_ABI is enabled.
 */
#define compat_elf_check_arch(x)					\
	((elf_check_arch_ia32(x) && ia32_enabled_verbose()) ||		\
	 (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))

/*
 * Reset the register and segment state for a freshly exec'ed program:
 * every general purpose register in regs except ax is zeroed (sp, ip
 * and flags are not touched here), the thread's fs/gs bases and
 * selectors are cleared and its ds/es selectors are set to ds.
 */
static inline void elf_common_init(struct thread_struct *t,
				   struct pt_regs *regs, const u16 ds)
{
	/* ax gets execve's return value. */
	/*regs->ax = */ regs->bx = regs->cx = regs->dx = 0;
	regs->si = regs->di = regs->bp = 0;
	regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
	regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
	t->fsbase = t->gsbase = 0;
	t->fsindex = t->gsindex = 0;
	t->ds = t->es = ds;
}

/* Native 64-bit programs start with null ds/es selectors. */
#define ELF_PLAT_INIT(_r, load_addr)			\
	elf_common_init(&current->thread, _r, 0)

/* Compat programs start with ds/es set to __USER_DS. */
#define	COMPAT_ELF_PLAT_INIT(regs, load_addr)		\
	elf_common_init(&current->thread, regs, __USER_DS)

void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32);
/* ex is a pointer to the ELF header here (note the "->"). */
#define COMPAT_START_THREAD(ex, regs, new_ip, new_sp)	\
	compat_start_thread(regs, new_ip, new_sp, ex->e_machine == EM_X86_64)

void set_personality_ia32(bool);
/* ex is the ELF header itself here, not a pointer (note the "."). */
#define COMPAT_SET_PERSONALITY(ex)			\
	set_personality_ia32((ex).e_machine == EM_X86_64)

#define COMPAT_ELF_PLATFORM			("i686")

/*
 * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
 * now struct_user_regs, they are different). Assumes current is the process
 * getting dumped.
 *
 * Slots 21-26 (fs/gs base and the ds/es/fs/gs selectors) are read from
 * the CPU rather than from regs.
 *
 * NOTE(review): the expansion already ends in a semicolon, so call
 * sites may be written without one; check the users before removing it.
 */

#define ELF_CORE_COPY_REGS(pr_reg, regs)			\
do {								\
	unsigned v;						\
	(pr_reg)[0] = (regs)->r15;				\
	(pr_reg)[1] = (regs)->r14;				\
	(pr_reg)[2] = (regs)->r13;				\
	(pr_reg)[3] = (regs)->r12;				\
	(pr_reg)[4] = (regs)->bp;				\
	(pr_reg)[5] = (regs)->bx;				\
	(pr_reg)[6] = (regs)->r11;				\
	(pr_reg)[7] = (regs)->r10;				\
	(pr_reg)[8] = (regs)->r9;				\
	(pr_reg)[9] = (regs)->r8;				\
	(pr_reg)[10] = (regs)->ax;				\
	(pr_reg)[11] = (regs)->cx;				\
	(pr_reg)[12] = (regs)->dx;				\
	(pr_reg)[13] = (regs)->si;				\
	(pr_reg)[14] = (regs)->di;				\
	(pr_reg)[15] = (regs)->orig_ax;				\
	(pr_reg)[16] = (regs)->ip;				\
	(pr_reg)[17] = (regs)->cs;				\
	(pr_reg)[18] = (regs)->flags;				\
	(pr_reg)[19] = (regs)->sp;				\
	(pr_reg)[20] = (regs)->ss;				\
	(pr_reg)[21] = x86_fsbase_read_cpu();			\
	(pr_reg)[22] = x86_gsbase_read_cpu_inactive();		\
	asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;	\
	asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;	\
	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;	\
	asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v;	\
} while (0);

/* I'm not sure if we can use '-' here */
#define ELF_PLATFORM       ("x86_64")
extern void set_personality_64bit(void);
extern int force_personality32;

#endif /* !CONFIG_X86_32 */

#define CORE_DUMP_USE_REGSET
#define ELF_EXEC_PAGESIZE	4096

/*
 * This is the base location for PIE (ET_DYN with INTERP) loads. On
 * 64-bit, this is above 4GB to leave the entire 32-bit address
 * space open for things that want to use the area for 32-bit pointers.
 */
#define ELF_ET_DYN_BASE		(mmap_is_ia32() ? 0x000400000UL : \
						  (DEFAULT_MAP_WINDOW / 3 * 2))

/* This yields a mask that user programs can use to figure out what
   instruction set this CPU supports.  This could be done in user space,
   but it's not easy, and we've already done it here.  */

#define ELF_HWCAP		(boot_cpu_data.x86_capability[CPUID_1_EDX])

extern u32 elf_hwcap2;

/*
 * HWCAP2 supplies mask with kernel enabled CPU features, so that
 * the application can discover that it can safely use them.
 * The bits are defined in uapi/asm/hwcap2.h.
*/ #define ELF_HWCAP2 (elf_hwcap2) /* This yields a string that ld.so will use to load implementation specific libraries for optimization. This is more specific in intent than poking at uname or /proc/cpuinfo. For the moment, we have only optimizations for the Intel generations, but that could change... */ #define SET_PERSONALITY(ex) set_personality_64bit() /* * An executable for which elf_read_implies_exec() returns TRUE will * have the READ_IMPLIES_EXEC personality flag set automatically. * * The decision process for determining the results are: * * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 | * ELF: | | | | * ---------------------|------------|------------------|----------------| * missing PT_GNU_STACK | exec-all | exec-all | exec-none | * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack | * PT_GNU_STACK == RW | exec-none | exec-none | exec-none | * * exec-all : all PROT_READ user mappings are executable, except when * backed by files on a noexec-filesystem. * exec-none : only PROT_EXEC user mappings are executable. * exec-stack: only the stack and PROT_EXEC user mappings are executable. 
* * *this column has no architectural effect: NX markings are ignored by * hardware, but may have behavioral effects when "wants X" collides with * "cannot be X" constraints in memory permission flags, as in * https://lkml.kernel.org/r/20190418055759.GA3155@mellanox.com * */ #define elf_read_implies_exec(ex, executable_stack) \ (mmap_is_ia32() && executable_stack == EXSTACK_DEFAULT) struct task_struct; #define ARCH_DLINFO_IA32 \ do { \ if (VDSO_CURRENT_BASE) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* * True on X86_32 or when emulating IA32 on X86_64 */ static inline int mmap_is_ia32(void) { return IS_ENABLED(CONFIG_X86_32) || (IS_ENABLED(CONFIG_COMPAT) && test_thread_flag(TIF_ADDR32)); } extern unsigned long task_size_32bit(void); extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); extern unsigned long get_sigframe_size(void); #ifdef CONFIG_X86_32 #define __STACK_RND_MASK(is32bit) (0x7ff) #define STACK_RND_MASK (0x7ff) #define ARCH_DLINFO ARCH_DLINFO_IA32 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #else /* CONFIG_X86_32 */ /* 1GB for 64bit, 8MB for 32bit */ #define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) #define ARCH_DLINFO \ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. 
*/ #define ARCH_DLINFO_X32 \ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) #define AT_SYSINFO 32 #define COMPAT_ARCH_DLINFO \ if (exec->e_machine == EM_X86_64) \ ARCH_DLINFO_X32; \ else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \ ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) #endif /* !CONFIG_X86_32 */ #define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso) #define VDSO_ENTRY \ ((unsigned long)current->mm->context.vdso + \ vdso_image_32.sym___kernel_vsyscall) struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, bool x32); #define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ compat_arch_setup_additional_pages(bprm, interpreter, \ (ex->e_machine == EM_X86_64)) extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs); /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), ALIGN_VA_64 = BIT(1), }; struct va_alignment { int flags; unsigned long mask; unsigned long bits; } ____cacheline_aligned; extern struct va_alignment va_align; #endif /* _ASM_X86_ELF_H */
10508 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM kmem #if !defined(_TRACE_KMEM_H) || 
defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KMEM_H #include <linux/types.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> TRACE_EVENT(kmem_cache_alloc, TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, gfp_t gfp_flags, int node), TP_ARGS(call_site, ptr, s, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) __field( int, node ) __field( bool, accounted ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __entry->bytes_req = s->object_size; __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; __entry->accounted = IS_ENABLED(CONFIG_MEMCG) ? ((gfp_flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)) : false; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node, __entry->accounted ? 
"true" : "false") ); TRACE_EVENT(kmalloc, TP_PROTO(unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags, int node), TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) __field( int, node ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __entry->bytes_req = bytes_req; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", (void *)__entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), __entry->node, (IS_ENABLED(CONFIG_MEMCG) && (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false") ); TRACE_EVENT(kfree, TP_PROTO(unsigned long call_site, const void *ptr), TP_ARGS(call_site, ptr), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; ), TP_printk("call_site=%pS ptr=%p", (void *)__entry->call_site, __entry->ptr) ); TRACE_EVENT(kmem_cache_free, TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s), TP_ARGS(call_site, ptr, s), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) __string( name, s->name ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; __assign_str(name); ), TP_printk("call_site=%pS ptr=%p name=%s", (void *)__entry->call_site, __entry->ptr, __get_str(name)) ); TRACE_EVENT(mm_page_free, TP_PROTO(struct page *page, unsigned int order), TP_ARGS(page, order), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); 
__entry->order = order; ), TP_printk("page=%p pfn=0x%lx order=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order) ); TRACE_EVENT(mm_page_free_batched, TP_PROTO(struct page *page), TP_ARGS(page), TP_STRUCT__entry( __field( unsigned long, pfn ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); ), TP_printk("page=%p pfn=0x%lx order=0", pfn_to_page(__entry->pfn), __entry->pfn) ); TRACE_EVENT(mm_page_alloc, TP_PROTO(struct page *page, unsigned int order, gfp_t gfp_flags, int migratetype), TP_ARGS(page, order, gfp_flags, migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( unsigned long, gfp_flags ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->migratetype = migratetype; ), TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, __entry->migratetype, show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_page, TP_PROTO(struct page *page, unsigned int order, int migratetype, int percpu_refill), TP_ARGS(page, order, migratetype, percpu_refill), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( int, migratetype ) __field( int, percpu_refill ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->migratetype = migratetype; __entry->percpu_refill = percpu_refill; ), TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? 
__entry->pfn : 0, __entry->order, __entry->migratetype, __entry->percpu_refill) ); DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked, TP_PROTO(struct page *page, unsigned int order, int migratetype, int percpu_refill), TP_ARGS(page, order, migratetype, percpu_refill) ); TRACE_EVENT(mm_page_pcpu_drain, TP_PROTO(struct page *page, unsigned int order, int migratetype), TP_ARGS(page, order, migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( unsigned int, order ) __field( int, migratetype ) ), TP_fast_assign( __entry->pfn = page ? page_to_pfn(page) : -1UL; __entry->order = order; __entry->migratetype = migratetype; ), TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order, __entry->migratetype) ); TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, int alloc_order, int fallback_order, int alloc_migratetype, int fallback_migratetype), TP_ARGS(page, alloc_order, fallback_order, alloc_migratetype, fallback_migratetype), TP_STRUCT__entry( __field( unsigned long, pfn ) __field( int, alloc_order ) __field( int, fallback_order ) __field( int, alloc_migratetype ) __field( int, fallback_migratetype ) __field( int, change_ownership ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->alloc_order = alloc_order; __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; __entry->change_ownership = (alloc_migratetype == get_pageblock_migratetype(page)); ), TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->alloc_order, __entry->fallback_order, pageblock_order, __entry->alloc_migratetype, __entry->fallback_migratetype, __entry->fallback_order < pageblock_order, __entry->change_ownership) ); TRACE_EVENT(mm_alloc_contig_migrate_range_info, 
TP_PROTO(unsigned long start, unsigned long end, unsigned long nr_migrated, unsigned long nr_reclaimed, unsigned long nr_mapped, int migratetype), TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, migratetype), TP_STRUCT__entry( __field(unsigned long, start) __field(unsigned long, end) __field(unsigned long, nr_migrated) __field(unsigned long, nr_reclaimed) __field(unsigned long, nr_mapped) __field(int, migratetype) ), TP_fast_assign( __entry->start = start; __entry->end = end; __entry->nr_migrated = nr_migrated; __entry->nr_reclaimed = nr_reclaimed; __entry->nr_mapped = nr_mapped; __entry->migratetype = migratetype; ), TP_printk("start=0x%lx end=0x%lx migratetype=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu", __entry->start, __entry->end, __entry->migratetype, __entry->nr_migrated, __entry->nr_reclaimed, __entry->nr_mapped) ); TRACE_EVENT(mm_setup_per_zone_wmarks, TP_PROTO(struct zone *zone), TP_ARGS(zone), TP_STRUCT__entry( __field(int, node_id) __string(name, zone->name) __field(unsigned long, watermark_min) __field(unsigned long, watermark_low) __field(unsigned long, watermark_high) __field(unsigned long, watermark_promo) ), TP_fast_assign( __entry->node_id = zone->zone_pgdat->node_id; __assign_str(name); __entry->watermark_min = zone->_watermark[WMARK_MIN]; __entry->watermark_low = zone->_watermark[WMARK_LOW]; __entry->watermark_high = zone->_watermark[WMARK_HIGH]; __entry->watermark_promo = zone->_watermark[WMARK_PROMO]; ), TP_printk("node_id=%d zone name=%s watermark min=%lu low=%lu high=%lu promo=%lu", __entry->node_id, __get_str(name), __entry->watermark_min, __entry->watermark_low, __entry->watermark_high, __entry->watermark_promo) ); TRACE_EVENT(mm_setup_per_zone_lowmem_reserve, TP_PROTO(struct zone *zone, struct zone *upper_zone, long lowmem_reserve), TP_ARGS(zone, upper_zone, lowmem_reserve), TP_STRUCT__entry( __field(int, node_id) __string(name, zone->name) __string(upper_name, upper_zone->name) __field(long, lowmem_reserve) ), 
TP_fast_assign( __entry->node_id = zone->zone_pgdat->node_id; __assign_str(name); __assign_str(upper_name); __entry->lowmem_reserve = lowmem_reserve; ), TP_printk("node_id=%d zone name=%s upper_zone name=%s lowmem_reserve_pages=%ld", __entry->node_id, __get_str(name), __get_str(upper_name), __entry->lowmem_reserve) ); TRACE_EVENT(mm_calculate_totalreserve_pages, TP_PROTO(unsigned long totalreserve_pages), TP_ARGS(totalreserve_pages), TP_STRUCT__entry( __field(unsigned long, totalreserve_pages) ), TP_fast_assign( __entry->totalreserve_pages = totalreserve_pages; ), TP_printk("totalreserve_pages=%lu", __entry->totalreserve_pages) ); /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ #ifndef __PTR_TO_HASHVAL static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr) { int ret; unsigned long hashval; ret = ptr_to_hashval(ptr, &hashval); if (ret) return 0; /* The hashed value is only 32-bit */ return (unsigned int)hashval; } #define __PTR_TO_HASHVAL #endif #define TRACE_MM_PAGES \ EM(MM_FILEPAGES) \ EM(MM_ANONPAGES) \ EM(MM_SWAPENTS) \ EMe(MM_SHMEMPAGES) #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); TRACE_MM_PAGES #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, int member), TP_ARGS(mm, member), TP_STRUCT__entry( __field(unsigned int, mm_id) __field(unsigned int, curr) __field(int, member) __field(long, size) ), TP_fast_assign( __entry->mm_id = mm_ptr_to_hash(mm); __entry->curr = !!(current->mm == mm); __entry->member = member; __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member]) << PAGE_SHIFT); ), TP_printk("mm_id=%u curr=%d type=%s size=%ldB", __entry->mm_id, __entry->curr, __print_symbolic(__entry->member, TRACE_MM_PAGES), __entry->size) ); #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
95 95 95 95 7 95 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Public Key Signature Algorithm * * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/sig.h> #include <linux/cryptouser.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" static void crypto_sig_exit_tfm(struct crypto_tfm *tfm) { struct crypto_sig *sig = __crypto_sig_tfm(tfm); struct sig_alg *alg = crypto_sig_alg(sig); alg->exit(sig); } static int crypto_sig_init_tfm(struct crypto_tfm *tfm) { struct crypto_sig *sig = __crypto_sig_tfm(tfm); struct sig_alg *alg = crypto_sig_alg(sig); if (alg->exit) sig->base.exit = crypto_sig_exit_tfm; if (alg->init) return alg->init(sig); return 0; } static void crypto_sig_free_instance(struct crypto_instance *inst) { struct sig_instance *sig = sig_instance(inst); sig->free(sig); } static void __maybe_unused crypto_sig_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : sig\n"); } static int __maybe_unused crypto_sig_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_sig rsig = {}; strscpy(rsig.type, "sig", sizeof(rsig.type)); return nla_put(skb, CRYPTOCFGA_REPORT_SIG, sizeof(rsig), &rsig); } static const struct crypto_type crypto_sig_type = { .extsize = crypto_alg_extsize, 
.init_tfm = crypto_sig_init_tfm, .free = crypto_sig_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_sig_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_sig_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SIG, .tfmsize = offsetof(struct crypto_sig, base), .algsize = offsetof(struct sig_alg, base), }; struct crypto_sig *crypto_alloc_sig(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_sig_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_sig); static int sig_default_sign(struct crypto_sig *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { return -ENOSYS; } static int sig_default_verify(struct crypto_sig *tfm, const void *src, unsigned int slen, const void *dst, unsigned int dlen) { return -ENOSYS; } static int sig_default_set_key(struct crypto_sig *tfm, const void *key, unsigned int keylen) { return -ENOSYS; } static unsigned int sig_default_size(struct crypto_sig *tfm) { return DIV_ROUND_UP_POW2(crypto_sig_keysize(tfm), BITS_PER_BYTE); } static int sig_prepare_alg(struct sig_alg *alg) { struct crypto_alg *base = &alg->base; if (!alg->sign) alg->sign = sig_default_sign; if (!alg->verify) alg->verify = sig_default_verify; if (!alg->set_priv_key) alg->set_priv_key = sig_default_set_key; if (!alg->set_pub_key) return -EINVAL; if (!alg->key_size) return -EINVAL; if (!alg->max_size) alg->max_size = sig_default_size; if (!alg->digest_size) alg->digest_size = sig_default_size; base->cra_type = &crypto_sig_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_SIG; return 0; } int crypto_register_sig(struct sig_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = sig_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_sig); void crypto_unregister_sig(struct sig_alg *alg) { crypto_unregister_alg(&alg->base); } 
EXPORT_SYMBOL_GPL(crypto_unregister_sig); int sig_register_instance(struct crypto_template *tmpl, struct sig_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = sig_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, sig_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(sig_register_instance); int crypto_grab_sig(struct crypto_sig_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_sig_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_sig); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Public Key Signature Algorithms");
35 17 18 18 4 14 6 5 2 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 /* vcan.c - Virtual CAN interface * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/can.h> #include <linux/can/can-ml.h> #include <linux/can/dev.h> #include <linux/can/skb.h> #include <linux/slab.h> #include <net/rtnetlink.h> #define DRV_NAME "vcan" MODULE_DESCRIPTION("virtual CAN interface"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); /* CAN test feature: * Enable the echo on driver level for testing the CAN core echo modes. * See Documentation/networking/can.rst for details. */ static bool echo; /* echo testing. Default: 0 (Off) */ module_param(echo, bool, 0444); MODULE_PARM_DESC(echo, "Echo sent frames (for testing). 
Default: 0 (Off)"); static void vcan_rx(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; stats->rx_packets++; stats->rx_bytes += can_skb_get_data_len(skb); skb->pkt_type = PACKET_BROADCAST; skb->dev = dev; skb->ip_summed = CHECKSUM_UNNECESSARY; netif_rx(skb); } static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len; int loop; if (can_dropped_invalid_skb(dev, skb)) return NETDEV_TX_OK; len = can_skb_get_data_len(skb); stats->tx_packets++; stats->tx_bytes += len; /* set flag whether this packet has to be looped back */ loop = skb->pkt_type == PACKET_LOOPBACK; skb_tx_timestamp(skb); if (!echo) { /* no echo handling available inside this driver */ if (loop) { /* only count the packets here, because the * CAN core already did the echo for us */ stats->rx_packets++; stats->rx_bytes += len; } consume_skb(skb); return NETDEV_TX_OK; } /* perform standard echo handling for CAN network interfaces */ if (loop) { skb = can_create_echo_skb(skb); if (!skb) return NETDEV_TX_OK; /* receive with packet counting */ vcan_rx(skb, dev); } else { /* no looped packets => no counting */ consume_skb(skb); } return NETDEV_TX_OK; } static int vcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ if (dev->flags & IFF_UP) return -EBUSY; if (new_mtu != CAN_MTU && new_mtu != CANFD_MTU && !can_is_canxl_dev_mtu(new_mtu)) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static const struct net_device_ops vcan_netdev_ops = { .ndo_start_xmit = vcan_tx, .ndo_change_mtu = vcan_change_mtu, }; static const struct ethtool_ops vcan_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static void vcan_setup(struct net_device *dev) { dev->type = ARPHRD_CAN; dev->mtu = CANFD_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 0; dev->flags = IFF_NOARP; can_set_ml_priv(dev, netdev_priv(dev)); /* set flags 
according to driver capabilities */ if (echo) dev->flags |= IFF_ECHO; dev->netdev_ops = &vcan_netdev_ops; dev->ethtool_ops = &vcan_ethtool_ops; dev->needs_free_netdev = true; } static struct rtnl_link_ops vcan_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct can_ml_priv), .setup = vcan_setup, }; static __init int vcan_init_module(void) { pr_info("Virtual CAN interface driver\n"); if (echo) pr_info("enabled echo on driver level.\n"); return rtnl_link_register(&vcan_link_ops); } static __exit void vcan_cleanup_module(void) { rtnl_link_unregister(&vcan_link_ops); } module_init(vcan_init_module); module_exit(vcan_cleanup_module);
21 6084 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 
1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> 
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>

#include <asm/irq_regs.h>

#include "tick-internal.h"

#include <trace/events/timer.h>

/*
 * Per-CPU nohz control structure
 */
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

/* Return a pointer to the tick_sched instance of @cpu. */
struct tick_sched *tick_get_tick_sched(int cpu)
{
	return &per_cpu(tick_cpu_sched, cpu);
}

/*
 * The time when the last jiffy update happened. Write access must hold
 * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
 * consistent view of jiffies and last_jiffies_update.
 */
static ktime_t last_jiffies_update;

/*
 * Advance 'jiffies_64', 'last_jiffies_update' and 'tick_next_period' once
 * @now has reached 'tick_next_period'. Returns early, without touching any
 * of them, when @now is still before 'tick_next_period'.
 *
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
	unsigned long ticks = 1;
	ktime_t delta, nextp;

	/*
	 * 64-bit can do a quick check without holding the jiffies lock and
	 * without looking at the sequence count. The smp_load_acquire()
	 * pairs with the update done later in this function.
	 *
	 * 32-bit cannot do that because the store of 'tick_next_period'
	 * consists of two 32-bit stores, and the first store could be
	 * moved by the CPU to a random point in the future.
	 */
	if (IS_ENABLED(CONFIG_64BIT)) {
		if (ktime_before(now, smp_load_acquire(&tick_next_period)))
			return;
	} else {
		unsigned int seq;

		/*
		 * Avoid contention on 'jiffies_lock' and protect the quick
		 * check with the sequence count.
		 */
		do {
			seq = read_seqcount_begin(&jiffies_seq);
			nextp = tick_next_period;
		} while (read_seqcount_retry(&jiffies_seq, seq));

		if (ktime_before(now, nextp))
			return;
	}

	/* Quick check failed, i.e. update is required. */
	raw_spin_lock(&jiffies_lock);
	/*
	 * Re-evaluate with the lock held. Another CPU might have done the
	 * update already.
	 */
	if (ktime_before(now, tick_next_period)) {
		raw_spin_unlock(&jiffies_lock);
		return;
	}

	write_seqcount_begin(&jiffies_seq);

	delta = ktime_sub(now, tick_next_period);
	if (unlikely(delta >= TICK_NSEC)) {
		/* Slow path for long idle sleep times */
		s64 incr = TICK_NSEC;

		/* 'ticks' starts at 1, so this adds the extra whole periods in 'delta' */
		ticks += ktime_divns(delta, incr);

		last_jiffies_update = ktime_add_ns(last_jiffies_update,
						   incr * ticks);
	} else {
		last_jiffies_update = ktime_add_ns(last_jiffies_update,
						   TICK_NSEC);
	}

	/* Advance jiffies to complete the 'jiffies_seq' protected job */
	jiffies_64 += ticks;

	/* Keep the tick_next_period variable up to date */
	nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);

	if (IS_ENABLED(CONFIG_64BIT)) {
		/*
		 * Pairs with smp_load_acquire() in the lockless quick
		 * check above, and ensures that the update to 'jiffies_64' is
		 * not reordered vs. the store to 'tick_next_period', neither
		 * by the compiler nor by the CPU.
		 */
		smp_store_release(&tick_next_period, nextp);
	} else {
		/*
		 * A plain store is good enough on 32-bit, as the quick check
		 * above is protected by the sequence count.
		 */
		tick_next_period = nextp;
	}

	/*
	 * Release the sequence count. calc_global_load() below is not
	 * protected by it, but 'jiffies_lock' needs to be held to prevent
	 * concurrent invocations.
	 */
	write_seqcount_end(&jiffies_seq);

	calc_global_load();

	raw_spin_unlock(&jiffies_lock);
	update_wall_time();
}

/*
 * Initialize 'last_jiffies_update' if that has not happened yet, and
 * return its value.
 *
 * The first initialization rounds 'tick_next_period' up to a multiple of
 * TICK_NSEC and uses the result as the initial 'last_jiffies_update'.
 */
static ktime_t tick_init_jiffy_update(void)
{
	ktime_t period;

	raw_spin_lock(&jiffies_lock);
	write_seqcount_begin(&jiffies_seq);

	/* Have we started the jiffies update yet ? */
	if (last_jiffies_update == 0) {
		u32 rem;

		/*
		 * Ensure that the tick is aligned to a multiple of
		 * TICK_NSEC.
		 */
		div_u64_rem(tick_next_period, TICK_NSEC, &rem);
		if (rem)
			tick_next_period += TICK_NSEC - rem;

		last_jiffies_update = tick_next_period;
	}
	period = last_jiffies_update;

	write_seqcount_end(&jiffies_seq);
	raw_spin_unlock(&jiffies_lock);

	return period;
}

/* Return 1 if any bit of @flag is set in ts->flags, 0 otherwise. */
static inline int tick_sched_flag_test(struct tick_sched *ts,
				       unsigned long flag)
{
	return !!(ts->flags & flag);
}

/* Set @flag in ts->flags. Asserts (lockdep) that interrupts are disabled. */
static inline void tick_sched_flag_set(struct tick_sched *ts,
				       unsigned long flag)
{
	lockdep_assert_irqs_disabled();
	ts->flags |= flag;
}

/* Clear @flag in ts->flags. Asserts (lockdep) that interrupts are disabled. */
static inline void tick_sched_flag_clear(struct tick_sched *ts,
					 unsigned long flag)
{
	lockdep_assert_irqs_disabled();
	ts->flags &= ~flag;
}

/*
 * Number of consecutive ticks on which this CPU may observe an unchanged
 * 'jiffies' before tick_sched_do_timer() forces a jiffies update itself.
 */
#define MAX_STALLED_JIFFIES 5

/*
 * Per-tick jiffies maintenance for the current CPU: pick up the do_timer
 * duty if nobody holds it, update jiffies if this CPU is in charge, detect
 * stalled jiffies, and record that a tick arrived while in idle.
 */
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
	int tick_cpu, cpu = smp_processor_id();

	/*
	 * Check if the do_timer duty was dropped. We don't care about
	 * concurrency: This happens only when the CPU in charge went
	 * into a long sleep. If two CPUs happen to assign themselves to
	 * this duty, then the jiffies update is still serialized by
	 * 'jiffies_lock'.
	 *
	 * If nohz_full is enabled, this should not happen because the
	 * 'tick_do_timer_cpu' CPU never relinquishes.
	 */
	tick_cpu = READ_ONCE(tick_do_timer_cpu);

	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
		WARN_ON_ONCE(tick_nohz_full_running);
#endif
		WRITE_ONCE(tick_do_timer_cpu, cpu);
		tick_cpu = cpu;
	}

	/* Check if jiffies need an update */
	if (tick_cpu == cpu)
		tick_do_update_jiffies64(now);

	/*
	 * If the jiffies update stalled for too long (timekeeper in stop_machine()
	 * or VMEXIT'ed for several msecs), force an update.
	 */
	if (ts->last_tick_jiffies != jiffies) {
		ts->stalled_jiffies = 0;
		ts->last_tick_jiffies = READ_ONCE(jiffies);
	} else {
		if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
			tick_do_update_jiffies64(now);
			ts->stalled_jiffies = 0;
			ts->last_tick_jiffies = READ_ONCE(jiffies);
		}
	}

	/* Consumed and reset by tick_nohz_idle_got_tick() */
	if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
		ts->got_idle_tick = 1;
}

/*
 * Run the per-tick process time accounting and profiling hooks. @regs
 * must be valid: it is passed to user_mode().
 */
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
	/*
	 * When we are idle and the tick is stopped, we have to touch
	 * the watchdog as we might not schedule for a really long
	 * time. This happens on completely idle SMP systems while
	 * waiting on the login prompt. We also increment the "start of
	 * idle" jiffy stamp so the idle accounting adjustment we do
	 * when we go busy again does not account too many ticks.
	 */
	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
	    tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
		touch_softlockup_watchdog_sched();
		if (is_idle_task(current))
			ts->idle_jiffies++;
		/*
		 * In case the current tick fired too early past its expected
		 * expiration, make sure we don't bypass the next clock reprogramming
		 * to the same deadline.
		 */
		ts->next_tick = 0;
	}

	update_process_times(user_mode(regs));
	profile_tick(CPU_PROFILING);
}

/*
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled.
 *
 * Returns HRTIMER_NORESTART when the tick is stopped (TS_FLAG_STOPPED),
 * otherwise forwards the timer by TICK_NSEC and returns HRTIMER_RESTART.
 */
static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer)
{
	struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer);
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	tick_sched_do_timer(ts, now);

	/*
	 * Do not call when we are not in IRQ context and have
	 * no valid 'regs' pointer
	 */
	if (regs)
		tick_sched_handle(ts, regs);
	else
		ts->next_tick = 0;

	/*
	 * In dynticks mode, tick reprogram is deferred:
	 * - to the idle task if in dynticks-idle
	 * - to IRQ exit if in full-dynticks.
	 */
	if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED)))
		return HRTIMER_NORESTART;

	hrtimer_forward(timer, now, TICK_NSEC);

	return HRTIMER_RESTART;
}

#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
/* Global tick dependency mask; see tick_nohz_dep_set() / tick_nohz_dep_clear() */
static atomic_t tick_dep_mask;

/*
 * Test @dep for the tick dependency bits listed below. For the first set
 * bit found (checked in the order below) a tick_stop trace event is
 * emitted and true is returned. Returns false if none of them is set.
 */
static bool check_tick_dependency(atomic_t *dep)
{
	int val = atomic_read(dep);

	if (val & TICK_DEP_MASK_POSIX_TIMER) {
		trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
		return true;
	}

	if (val & TICK_DEP_MASK_PERF_EVENTS) {
		trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
		return true;
	}

	if (val & TICK_DEP_MASK_SCHED) {
		trace_tick_stop(0, TICK_DEP_MASK_SCHED);
		return true;
	}

	if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
		trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
		return true;
	}

	if (val & TICK_DEP_MASK_RCU) {
		trace_tick_stop(0, TICK_DEP_MASK_RCU);
		return true;
	}

	if (val & TICK_DEP_MASK_RCU_EXP) {
		trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
		return true;
	}

	return false;
}

/*
 * Return true when @cpu is online and none of the global, per-CPU (@ts),
 * per-task (current) and per-signal (current->signal) dependency masks
 * reports a tick dependency. Asserts (lockdep) that interrupts are disabled.
 */
static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
	lockdep_assert_irqs_disabled();

	if (unlikely(!cpu_online(cpu)))
		return false;

	if (check_tick_dependency(&tick_dep_mask))
		return false;

	if (check_tick_dependency(&ts->tick_dep_mask))
		return false;

	if (check_tick_dependency(&current->tick_dep_mask))
		return false;

	if (check_tick_dependency(&current->signal->tick_dep_mask))
		return false;

	return true;
}

static void nohz_full_kick_func(struct irq_work *work)
{
	/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}

/* Per-CPU IRQ work queued by the tick_nohz_full_kick*() helpers */
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
	IRQ_WORK_INIT_HARD(nohz_full_kick_func);

/*
 * Kick this CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
 * is NMI safe.
*/
static void tick_nohz_full_kick(void)
{
	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}

/*
 * Kick the CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 */
void tick_nohz_full_kick_cpu(int cpu)
{
	if (!tick_nohz_full_cpu(cpu))
		return;

	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}

/*
 * Kick the CPU that @tsk is on, provided the task is on a runqueue and
 * that CPU is online. Called after a tick dependency bit has been set for
 * the task or its signal struct.
 */
static void tick_nohz_kick_task(struct task_struct *tsk)
{
	int cpu;

	/*
	 * If the task is not running, run_posix_cpu_timers()
	 * has nothing to elapse, and an IPI can then be optimized out.
	 *
	 * activate_task()                      STORE p->tick_dep_mask
	 *   STORE p->on_rq
	 * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
	 *   LOCK rq->lock                      LOAD p->on_rq
	 *   smp_mb__after_spin_lock()
	 * tick_nohz_task_switch()
	 *   LOAD p->tick_dep_mask
	 *
	 * XXX given a task picks up the dependency on schedule(), should we
	 * only care about tasks that are currently on the CPU instead of all
	 * that are on the runqueue?
	 *
	 * That is, does this want to be: task_on_cpu() / task_curr()?
	 */
	if (!sched_task_on_rq(tsk))
		return;

	/*
	 * If the task concurrently migrates to another CPU,
	 * we guarantee it sees the new tick dependency upon
	 * schedule.
	 *
	 * set_task_cpu(p, cpu);
	 *   STORE p->cpu = @cpu
	 * __schedule() (switch to task 'p')
	 *   LOCK rq->lock
	 *   smp_mb__after_spin_lock()          STORE p->tick_dep_mask
	 *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
	 *      LOAD p->tick_dep_mask           LOAD p->cpu
	 */
	cpu = task_cpu(tsk);

	preempt_disable();
	if (cpu_online(cpu))
		tick_nohz_full_kick_cpu(cpu);
	preempt_enable();
}

/*
 * Kick all full dynticks CPUs in order to force these to re-evaluate
 * their dependency on the tick and restart it if necessary.
*/
static void tick_nohz_full_kick_all(void)
{
	int cpu;

	if (!tick_nohz_full_running)
		return;

	preempt_disable();
	/* Only CPUs that are both in the nohz_full mask and online get kicked */
	for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
		tick_nohz_full_kick_cpu(cpu);
	preempt_enable();
}

/*
 * Set @bit in the dependency mask @dep. If the mask was empty before,
 * kick all full dynticks CPUs.
 */
static void tick_nohz_dep_set_all(atomic_t *dep,
				  enum tick_dep_bits bit)
{
	int prev;

	prev = atomic_fetch_or(BIT(bit), dep);
	if (!prev)
		tick_nohz_full_kick_all();
}

/*
 * Set a global tick dependency. Used by perf events that rely on freq and
 * unstable clocks.
 */
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
	tick_nohz_dep_set_all(&tick_dep_mask, bit);
}

/* Clear @bit from the global tick dependency mask. No CPU is kicked. */
void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &tick_dep_mask);
}

/*
 * Set per-CPU tick dependency. Used by scheduler and perf events in order to
 * manage event-throttling.
 */
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
	int prev;
	struct tick_sched *ts;

	ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
	/* Kick only when the per-CPU mask goes from empty to non-empty */
	if (!prev) {
		preempt_disable();
		/* Perf needs local kick that is NMI safe */
		if (cpu == smp_processor_id()) {
			tick_nohz_full_kick();
		} else {
			/* Remote IRQ work not NMI-safe */
			if (!WARN_ON_ONCE(in_nmi()))
				tick_nohz_full_kick_cpu(cpu);
		}
		preempt_enable();
	}
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);

/* Clear @bit from the tick dependency mask of @cpu. No CPU is kicked. */
void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

	atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);

/*
 * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
 * in order to elapse per task timers.
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
	/* Kick only when the task's mask goes from empty to non-empty */
	if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
		tick_nohz_kick_task(tsk);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);

/* Clear @bit from the tick dependency mask of @tsk. No CPU is kicked. */
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);

/*
 * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
 * per process timers.
 */
void tick_nohz_dep_set_signal(struct task_struct *tsk,
			      enum tick_dep_bits bit)
{
	int prev;
	struct signal_struct *sig = tsk->signal;

	prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
	if (!prev) {
		struct task_struct *t;

		/* The thread walk below expects the caller to hold siglock */
		lockdep_assert_held(&tsk->sighand->siglock);
		__for_each_thread(sig, t)
			tick_nohz_kick_task(t);
	}
}

/* Clear @bit from the tick dependency mask of @sig. No CPU is kicked. */
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
	atomic_andnot(BIT(bit), &sig->tick_dep_mask);
}

/*
 * Re-evaluate the need for the tick as we switch the current task.
 * It might need the tick due to per task/process properties:
 * perf events, posix CPU timers, ...
 */
void __tick_nohz_task_switch(void)
{
	struct tick_sched *ts;

	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	ts = this_cpu_ptr(&tick_cpu_sched);

	/* Only a stopped tick needs a kick; a running tick needs no action here */
	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
		if (atomic_read(&current->tick_dep_mask) ||
		    atomic_read(&current->signal->tick_dep_mask))
			tick_nohz_full_kick();
	}
}

/* Get the boot-time nohz CPU list from the kernel parameters. */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
	cpumask_copy(tick_nohz_full_mask, cpumask);
	tick_nohz_full_running = true;
}

/*
 * Return false if @cpu must stay online because nohz_full is running and
 * @cpu is the current 'tick_do_timer_cpu'; true otherwise.
 */
bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
	/*
	 * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
	 * CPUs. It must remain online when nohz full is enabled.
	 */
	if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu)
		return false;
	return true;
}

/*
 * CPU hotplug callback registered by tick_nohz_init(): returns 0 if @cpu
 * may go down, -EBUSY otherwise.
 */
static int tick_nohz_cpu_down(unsigned int cpu)
{
	return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
}

/*
 * Boot-time finalization of nohz_full. Does nothing unless
 * tick_nohz_full_setup() enabled it. Disables nohz_full again if the
 * architecture lacks IRQ work interrupts, otherwise enables context
 * tracking for the nohz_full CPUs and registers the hotplug callback.
 */
void __init tick_nohz_init(void)
{
	int cpu, ret;

	if (!tick_nohz_full_running)
		return;

	/*
	 * Full dynticks uses IRQ work to drive the tick rescheduling on safe
	 * locking contexts. But then we need IRQ work to raise its own
	 * interrupts to avoid circular dependency on the tick.
	 */
	if (!arch_irq_work_has_interrupt()) {
		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
		cpumask_clear(tick_nohz_full_mask);
		tick_nohz_full_running = false;
		return;
	}

	/* Drop the current (boot) CPU from the nohz_full mask in this configuration */
	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
			!IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
		cpu = smp_processor_id();

		if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
			pr_warn("NO_HZ: Clearing %d from nohz_full range "
				"for timekeeping\n", cpu);
			cpumask_clear_cpu(cpu, tick_nohz_full_mask);
		}
	}

	for_each_cpu(cpu, tick_nohz_full_mask)
		ct_cpu_track_user(cpu);

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"kernel/nohz:predown", NULL,
					tick_nohz_cpu_down);
	WARN_ON(ret < 0);
	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
		cpumask_pr_args(tick_nohz_full_mask));
}
#endif /* #ifdef CONFIG_NO_HZ_FULL */

/*
 * NOHZ - aka dynamic tick functionality
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * NO HZ enabled ?
*/ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned, is in a long sleep. 
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
	unsigned long flags;

	__this_cpu_write(tick_cpu_sched.idle_waketime, now);

	/* tick_do_update_jiffies64() must run with interrupts disabled */
	local_irq_save(flags);
	tick_do_update_jiffies64(now);
	local_irq_restore(flags);

	touch_softlockup_watchdog_sched();
}

/*
 * Close the idle period that started at ts->idle_entrytime: add the
 * elapsed time to 'iowait_sleeptime' if this CPU has iowait tasks, to
 * 'idle_sleeptime' otherwise, and clear TS_FLAG_IDLE_ACTIVE. The update is
 * done inside an 'idle_sleeptime_seq' write section.
 *
 * Warns once and does nothing if TS_FLAG_IDLE_ACTIVE is not set.
 */
static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
	ktime_t delta;

	if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
		return;

	delta = ktime_sub(now, ts->idle_entrytime);

	write_seqcount_begin(&ts->idle_sleeptime_seq);
	if (nr_iowait_cpu(smp_processor_id()) > 0)
		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
	else
		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);

	ts->idle_entrytime = now;
	tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
	write_seqcount_end(&ts->idle_sleeptime_seq);

	sched_clock_idle_wakeup_event();
}

/*
 * Open a new idle period: stamp ts->idle_entrytime with the current time
 * and set TS_FLAG_IDLE_ACTIVE, inside an 'idle_sleeptime_seq' write section.
 */
static void tick_nohz_start_idle(struct tick_sched *ts)
{
	write_seqcount_begin(&ts->idle_sleeptime_seq);
	ts->idle_entrytime = ktime_get();
	tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
	write_seqcount_end(&ts->idle_sleeptime_seq);

	sched_clock_idle_sleep_event();
}

/*
 * Common helper for get_cpu_idle_time_us() and get_cpu_iowait_time_us().
 *
 * @ts:               tick_sched of the CPU being queried
 * @sleeptime:        accumulated counter to report (a field of @ts)
 * @compute_delta:    when true and an idle period is active, add the time
 *                    elapsed since ts->idle_entrytime to the result
 * @last_update_time: if not NULL, receives the current time in microseconds
 *
 * Returns the value in microseconds, or -1 (converted to u64) when
 * 'tick_nohz_active' is not set. @ts is read under the
 * 'idle_sleeptime_seq' sequence count and the read is retried on a
 * concurrent update.
 */
static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
				 bool compute_delta, u64 *last_update_time)
{
	ktime_t now, idle;
	unsigned int seq;

	if (!tick_nohz_active)
		return -1;

	now = ktime_get();
	if (last_update_time)
		*last_update_time = ktime_to_us(now);

	do {
		seq = read_seqcount_begin(&ts->idle_sleeptime_seq);

		if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) {
			ktime_t delta = ktime_sub(now, ts->idle_entrytime);

			idle = ktime_add(*sleeptime, delta);
		} else {
			idle = *sleeptime;
		}
	} while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));

	return ktime_to_us(idle);

}

/**
 * get_cpu_idle_time_us - get the total idle time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative idle time (since boot) for a given
 * CPU, in microseconds.
Note that this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
 */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	/* The running idle period only counts as idle when no task waits on I/O */
	bool no_iowait = !nr_iowait_cpu(cpu);

	return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, no_iowait,
				     last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);

/**
 * get_cpu_iowait_time_us - get the total iowait time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative iowait time (since boot) for a given
 * CPU, in microseconds. Note this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
*
 * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
 */
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

	return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
				     nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

/*
 * Re-arm the tick timer: cancel it, set its expiry to the saved
 * ts->last_tick, forward it in TICK_NSEC steps relative to @now and start
 * it (highres mode) or program the clock event device (otherwise).
 */
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
	hrtimer_cancel(&ts->sched_timer);
	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);

	/* Forward the time to expire in the future */
	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);

	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
		hrtimer_start_expires(&ts->sched_timer,
				      HRTIMER_MODE_ABS_PINNED_HARD);
	} else {
		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
	}

	/*
	 * Reset to make sure the next tick stop doesn't get fooled by past
	 * cached clock deadline.
	 */
	ts->next_tick = 0;
}

/* Test the TIMER_SOFTIRQ bit in local_timers_pending(). */
static inline bool local_timer_softirq_pending(void)
{
	return local_timers_pending() & BIT(TIMER_SOFTIRQ);
}

/*
 * Read jiffies and the time when jiffies were updated last
 *
 * Both values are sampled under 'jiffies_seq' so that they are consistent
 * with each other. 'jiffies' is stored in @basej, 'last_jiffies_update'
 * is returned.
 */
u64 get_jiffies_update(unsigned long *basej)
{
	unsigned long basejiff;
	unsigned int seq;
	u64 basemono;

	do {
		seq = read_seqcount_begin(&jiffies_seq);
		basemono = last_jiffies_update;
		basejiff = jiffies;
	} while (read_seqcount_retry(&jiffies_seq, seq));
	*basej = basejiff;
	return basemono;
}

/**
 * tick_nohz_next_event() - return the clock monotonic based next event
 * @ts:		pointer to tick_sched struct
 * @cpu:	CPU number
 *
 * Caches the sampled jiffies/time base in @ts->last_jiffies and
 * @ts->timer_expires_base and the result in @ts->timer_expires.
 *
 * Return:
 * *%0		- When the next event is a maximum of TICK_NSEC in the future
 *		  and the tick is not stopped yet
 * *%next_event	- Next event based on clock monotonic
 */
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
	u64 basemono, next_tick, delta, expires;
	unsigned long basejiff;
	int tick_cpu;

	basemono = get_jiffies_update(&basejiff);
	ts->last_jiffies = basejiff;
	ts->timer_expires_base = basemono;

	/*
	 * Keep the periodic tick, when RCU, architecture or irq_work
	 * requests it.
	 * Aside of that, check whether the local timer softirq is
	 * pending. If so, its a bad idea to call get_next_timer_interrupt(),
	 * because there is an already expired timer, so it will request
	 * immediate expiry, which rearms the hardware timer with a
	 * minimal delta, which brings us back to this place
	 * immediately. Lather, rinse and repeat...
	 */
	if (rcu_needs_cpu() || arch_needs_cpu() ||
	    irq_work_needs_cpu() || local_timer_softirq_pending()) {
		next_tick = basemono + TICK_NSEC;
	} else {
		/*
		 * Get the next pending timer. If high resolution
		 * timers are enabled this only takes the timer wheel
		 * timers into account. If high resolution timers are
		 * disabled this also looks at the next expiring
		 * hrtimer.
		 */
		next_tick = get_next_timer_interrupt(basejiff, basemono);
		ts->next_timer = next_tick;
	}

	/* Make sure next_tick is never before basemono! */
	if (WARN_ON_ONCE(basemono > next_tick))
		next_tick = basemono;

	/*
	 * If the tick is due in the next period, keep it ticking or
	 * force prod the timer.
	 */
	delta = next_tick - basemono;
	if (delta <= (u64)TICK_NSEC) {
		/*
		 * We've not stopped the tick yet, and there's a timer in the
		 * next period, so no point in stopping it either, bail.
		 */
		if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
			ts->timer_expires = 0;
			goto out;
		}
	}

	/*
	 * If this CPU is the one which had the do_timer() duty last, we limit
	 * the sleep time to the timekeeping 'max_deferment' value.
	 * Otherwise we can sleep as long as we want.
	 */
	delta = timekeeping_max_deferment();
	tick_cpu = READ_ONCE(tick_do_timer_cpu);
	if (tick_cpu != cpu &&
	    (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
		delta = KTIME_MAX;

	/* Calculate the next expiry time */
	if (delta < (KTIME_MAX - basemono))
		expires = basemono + delta;
	else
		expires = KTIME_MAX;

	ts->timer_expires = min_t(u64, expires, next_tick);

out:
	return ts->timer_expires;
}

/*
 * Stop the tick, or update the deadline of an already stopped tick, based
 * on the values cached in @ts by tick_nohz_next_event() (last_jiffies,
 * timer_expires_base, timer_expires). The tick is left running when the
 * timer base could not be marked idle.
 */
static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
{
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
	unsigned long basejiff = ts->last_jiffies;
	u64 basemono = ts->timer_expires_base;
	bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
	int tick_cpu;
	u64 expires;

	/* Make sure we won't be trying to stop it twice in a row. */
	ts->timer_expires_base = 0;

	/*
	 * Now the tick should be stopped definitely - so the timer base needs
	 * to be marked idle as well to not miss a newly queued timer.
	 */
	expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle);
	if (expires > ts->timer_expires) {
		/*
		 * This path could only happen when the first timer was removed
		 * between calculating the possible sleep length and now (when
		 * high resolution mode is not active, timer could also be a
		 * hrtimer).
		 *
		 * We have to stick to the original calculated expiry value to
		 * not stop the tick for too long with a shallow C-state (which
		 * was programmed by cpuidle because of an early next expiration
		 * value).
		 */
		expires = ts->timer_expires;
	}

	/* If the timer base is not idle, retain the not yet stopped tick. */
	if (!timer_idle)
		return;

	/*
	 * If this CPU is the one which updates jiffies, then give up
	 * the assignment and let it be taken by the CPU which runs
	 * the tick timer next, which might be this CPU as well. If we
	 * don't drop this here, the jiffies might be stale and
	 * do_timer() never gets invoked. Keep track of the fact that it
	 * was the one which had the do_timer() duty last.
	 */
	tick_cpu = READ_ONCE(tick_do_timer_cpu);
	if (tick_cpu == cpu) {
		WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
		tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
	} else if (tick_cpu != TICK_DO_TIMER_NONE) {
		tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
	}

	/* Skip reprogram of event if it's not changed */
	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) {
		/* Sanity check: make sure clockevent is actually programmed */
		if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
			return;

		/* Inconsistent state: warn once, then fall through and reprogram */
		WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
			  "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
			  dev->next_event, hrtimer_active(&ts->sched_timer),
			  hrtimer_get_expires(&ts->sched_timer));
	}

	/*
	 * tick_nohz_stop_tick() can be called several times before
	 * tick_nohz_restart_sched_tick() is called. This happens when
	 * interrupts arrive which do not cause a reschedule. In the first
	 * call we save the current tick time, so we can restart the
	 * scheduler tick in tick_nohz_restart_sched_tick().
	 */
	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
		calc_load_nohz_start();
		quiet_vmstat();

		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
		tick_sched_flag_set(ts, TS_FLAG_STOPPED);
		trace_tick_stop(1, TICK_DEP_MASK_NONE);
	}

	ts->next_tick = expires;

	/*
	 * If the expiration time == KTIME_MAX, then we simply stop
	 * the tick timer.
	 */
	if (unlikely(expires == KTIME_MAX)) {
		if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
			hrtimer_cancel(&ts->sched_timer);
		else
			tick_program_event(KTIME_MAX, 1);
		return;
	}

	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
		hrtimer_start(&ts->sched_timer, expires,
			      HRTIMER_MODE_ABS_PINNED_HARD);
	} else {
		hrtimer_set_expires(&ts->sched_timer, expires);
		tick_program_event(expires, 1);
	}
}

/*
 * Keep the tick running: invalidate the tick_nohz_next_event() result
 * cached in @ts by resetting ts->timer_expires_base.
 */
static void tick_nohz_retain_tick(struct tick_sched *ts)
{
	ts->timer_expires_base = 0;
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * Evaluate the next event and stop the tick if tick_nohz_next_event()
 * returns a non-zero expiry, otherwise retain the tick.
 */
static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
{
	if (tick_nohz_next_event(ts, cpu))
		tick_nohz_stop_tick(ts, cpu);
	else
		tick_nohz_retain_tick(ts);
}
#endif /* CONFIG_NO_HZ_FULL */

/*
 * Restart a stopped tick: bring jiffies up to date, leave the timer-idle
 * and nohz load accounting state, clear TS_FLAG_STOPPED and re-arm the
 * tick timer.
 */
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
	/* Update jiffies first */
	tick_do_update_jiffies64(now);

	/*
	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
	 * the clock forward checks in the enqueue path:
	 */
	timer_clear_idle();

	calc_load_nohz_stop();
	touch_softlockup_watchdog_sched();

	/* Cancel the scheduled timer and restore the tick: */
	tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
	tick_nohz_restart(ts, now);
}

/*
 * With CONFIG_NO_HZ_FULL: stop (or re-evaluate) the tick when nothing
 * depends on it, otherwise restart it if it is currently stopped.
 * Compiles to an empty function without CONFIG_NO_HZ_FULL.
 */
static void __tick_nohz_full_update_tick(struct tick_sched *ts,
					 ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
	int cpu = smp_processor_id();

	if (can_stop_full_tick(cpu, ts))
		tick_nohz_full_stop_tick(ts, cpu);
	else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
		tick_nohz_restart_sched_tick(ts, now);
#endif
}

/*
 * Re-evaluate the tick on the current CPU, but only if it is a nohz_full
 * CPU and TS_FLAG_NOHZ is set in @ts.
 */
static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
	if (!tick_nohz_full_cpu(smp_processor_id()))
		return;

	if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ))
		return;

	__tick_nohz_full_update_tick(ts, ktime_get());
}

/*
 * A pending softirq outside an IRQ (or softirq disabled section) context
 * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
 * reach this code due to the need_resched() early check in can_stop_idle_tick().
*
 * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
 * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
 * triggering the code below, since wakep_softirqd() is ignored.
 *
 * Return: true if pending softirq work was reported via pr_warn(), false if
 * nothing is pending or the report was skipped.
 */
static bool report_idle_softirq(void)
{
	/* Number of warnings printed so far; capped at 10 below */
	static int ratelimit;
	unsigned int pending = local_softirq_pending();

	if (likely(!pending))
		return false;

	/* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
	if (!cpu_active(smp_processor_id())) {
		pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
		if (!pending)
			return false;
	}

	if (ratelimit >= 10)
		return false;

	/* On RT, softirq handling may be waiting on some lock */
	if (local_bh_blocked())
		return false;

	pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
		pending);
	ratelimit++;

	return true;
}

/*
 * Check whether the idle tick may be stopped on @cpu. Returns false when
 * TS_FLAG_NOHZ is not set, a reschedule is pending, pending softirq work
 * was reported, or - with nohz_full enabled - this CPU is the
 * 'tick_do_timer_cpu' or nobody holds that duty.
 */
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
	WARN_ON_ONCE(cpu_is_offline(cpu));

	if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)))
		return false;

	if (need_resched())
		return false;

	if (unlikely(report_idle_softirq()))
		return false;

	if (tick_nohz_full_enabled()) {
		int tick_cpu = READ_ONCE(tick_do_timer_cpu);

		/*
		 * Keep the tick alive to guarantee timekeeping progression
		 * if there are full dynticks CPUs around
		 */
		if (tick_cpu == cpu)
			return false;

		/* Should not happen for nohz-full */
		if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
			return false;
	}

	return true;
}

/**
 * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 */
void tick_nohz_idle_stop_tick(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	int cpu = smp_processor_id();
	ktime_t expires;

	/*
	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
	 * tick timer expiration time is known already.
	 */
	if (ts->timer_expires_base)
		expires = ts->timer_expires;
	else if (can_stop_idle_tick(cpu, ts))
		expires = tick_nohz_next_event(ts, cpu);
	else
		return;

	ts->idle_calls++;

	if (expires > 0LL) {
		int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

		tick_nohz_stop_tick(ts, cpu);

		ts->idle_sleeps++;
		ts->idle_expires = expires;

		/* Only on the running -> stopped transition */
		if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
			ts->idle_jiffies = ts->last_jiffies;
			nohz_balance_enter_idle(cpu);
		}
	} else {
		tick_nohz_retain_tick(ts);
	}
}

/* Retain the tick on the current CPU; see tick_nohz_retain_tick(). */
void tick_nohz_idle_retain_tick(void)
{
	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
}

/**
 * tick_nohz_idle_enter - prepare for entering idle on the current CPU
 *
 * Called when we start the idle loop.
 */
void tick_nohz_idle_enter(void)
{
	struct tick_sched *ts;

	lockdep_assert_irqs_enabled();

	local_irq_disable();

	ts = this_cpu_ptr(&tick_cpu_sched);

	/* A cached next-event result must not be left over from a previous idle */
	WARN_ON_ONCE(ts->timer_expires_base);

	tick_sched_flag_set(ts, TS_FLAG_INIDLE);
	tick_nohz_start_idle(ts);

	local_irq_enable();
}

/**
 * tick_nohz_irq_exit - Notify the tick about IRQ exit
 *
 * A timer may have been added/modified/deleted either by the current IRQ,
 * or by another place using this IRQ as a notification. This IRQ may have
 * also updated the RCU callback list. These events may require a
 * re-evaluation of the next tick. Depending on the context:
 *
 * 1) If the CPU is idle and no resched is pending, just proceed with idle
 *    time accounting. The next tick will be re-evaluated on the next idle
 *    loop iteration.
 *
 * 2) If the CPU is nohz_full:
 *
 *    2.1) If there is any tick dependency, restart the tick if stopped.
 *
 *    2.2) If there is no tick dependency, (re-)evaluate the next tick and
 *         stop/update it accordingly.
*/ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run * * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled * * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. * * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. 
*/
	ktime_t now = ts->idle_entrytime;
	ktime_t next_event;

	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));

	*delta_next = ktime_sub(dev->next_event, now);

	if (!can_stop_idle_tick(cpu, ts))
		return *delta_next;

	next_event = tick_nohz_next_event(ts, cpu);
	if (!next_event)
		return *delta_next;

	/*
	 * If the next highres timer to expire is earlier than 'next_event', the
	 * idle governor needs to know that.
	 */
	next_event = min_t(u64, next_event,
			   hrtimer_next_event_without(&ts->sched_timer));

	return ktime_sub(next_event, now);
}

/**
 * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
 * for a particular CPU.
 * @cpu: target CPU number
 *
 * Called from the schedutil frequency scaling governor in scheduler context.
 *
 * Return: the current idle calls counter value for @cpu
 */
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
	struct tick_sched *ts = tick_get_tick_sched(cpu);

	return ts->idle_calls;
}

/*
 * Record @now as the idle exit time in @ts and, unless vtime accounting is
 * enabled on this CPU, account the jiffies elapsed since ts->idle_jiffies
 * as idle ticks.
 */
static void tick_nohz_account_idle_time(struct tick_sched *ts,
					ktime_t now)
{
	unsigned long ticks;

	ts->idle_exittime = now;

	if (vtime_accounting_enabled_this_cpu())
		return;
	/*
	 * We stopped the tick in idle. update_process_times() would miss the
	 * time we slept, as it does only a 1 tick accounting.
	 * Enforce that this is accounted to idle !
	 */
	ticks = jiffies - ts->idle_jiffies;
	/*
	 * We might be one off. Do not randomly account a huge number of ticks! 
*/
	if (ticks && ticks < LONG_MAX)
		account_idle_ticks(ticks);
}

/*
 * tick_nohz_idle_restart_tick - if the tick is stopped on this CPU
 * (TS_FLAG_STOPPED), restart it and account the idle time, both with the
 * same ktime_get() timestamp. Does nothing when the flag is clear.
 */
void tick_nohz_idle_restart_tick(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
		ktime_t now = ktime_get();
		tick_nohz_restart_sched_tick(ts, now);
		tick_nohz_account_idle_time(ts, now);
	}
}

/*
 * Update the tick at idle exit: nohz_full CPUs go through
 * __tick_nohz_full_update_tick(), every other CPU gets its tick restarted.
 * Idle time is accounted in both cases.
 */
static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
{
	if (tick_nohz_full_cpu(smp_processor_id()))
		__tick_nohz_full_update_tick(ts, now);
	else
		tick_nohz_restart_sched_tick(ts, now);

	tick_nohz_account_idle_time(ts, now);
}

/**
 * tick_nohz_idle_exit - Update the tick upon idle task exit
 *
 * When the idle task exits, update the tick depending on the
 * following situations:
 *
 * 1) If the CPU is not in nohz_full mode (most cases), then
 *    restart the tick.
 *
 * 2) If the CPU is in nohz_full mode (corner case):
 *   2.1) If the tick can be kept stopped (no tick dependencies)
 *        then re-evaluate the next tick and try to keep it stopped
 *        as long as possible.
 *   2.2) If the tick has dependencies, restart the tick.
 *
 */
void tick_nohz_idle_exit(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	bool idle_active, tick_stopped;
	ktime_t now;

	local_irq_disable();

	WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
	WARN_ON_ONCE(ts->timer_expires_base);

	/* Sample both flags once, after leaving the INIDLE state */
	tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
	idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
	tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

	/* 'now' is only read below when at least one of the flags was set */
	if (idle_active || tick_stopped)
		now = ktime_get();

	if (idle_active)
		tick_nohz_stop_idle(ts, now);

	if (tick_stopped)
		tick_nohz_idle_update_tick(ts, now);

	local_irq_enable();
}

/*
 * In low-resolution mode, the tick handler must be implemented directly
 * at the clockevent level. hrtimer can't be used instead, because its
 * infrastructure actually relies on the tick itself as a backend in
 * low-resolution mode (see hrtimer_run_queues()). 
*/
static void tick_nohz_lowres_handler(struct clock_event_device *dev)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	dev->next_event = KTIME_MAX;

	/* Reprogram the event device only when the handler asks for a restart */
	if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}

/*
 * Flag @ts as running in NOHZ mode. Does nothing when tick_nohz_enabled is
 * not set. timers_update_nohz() is called only by the caller that flips
 * bit 0 of tick_nohz_active from clear to set.
 */
static inline void tick_nohz_activate(struct tick_sched *ts)
{
	if (!tick_nohz_enabled)
		return;
	tick_sched_flag_set(ts, TS_FLAG_NOHZ);
	/* One update is enough */
	if (!test_and_set_bit(0, &tick_nohz_active))
		timers_update_nohz();
}

/**
 * tick_nohz_switch_to_nohz - switch to NOHZ mode
 *
 * Returns without further action when NOHZ is disabled or when
 * tick_switch_to_oneshot() returns non-zero.
 */
static void tick_nohz_switch_to_nohz(void)
{
	if (!tick_nohz_enabled)
		return;

	if (tick_switch_to_oneshot(tick_nohz_lowres_handler))
		return;

	/*
	 * Recycle the hrtimer in 'ts', so we can share the
	 * highres code.
	 */
	tick_setup_sched_timer(false);
}

/*
 * IRQ entry hook for NOHZ: nothing to do unless the tick is stopped or idle
 * accounting is active on this CPU.
 */
static inline void tick_nohz_irq_enter(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	ktime_t now;

	if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE))
		return;
	now = ktime_get();
	if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
		tick_nohz_stop_idle(ts, now);
	/*
	 * If all CPUs are idle we may need to update a stale jiffies value.
	 * Note nohz_full is a special case: a timekeeper is guaranteed to stay
	 * alive but it might be busy looping with interrupts disabled in some
	 * rare case (typically stop machine). So we must make sure we have a
	 * last resort. 
*/
	if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
		tick_nohz_update_jiffies(now);
}

#else

/* Empty stubs for builds without CONFIG_NO_HZ_COMMON */
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
static inline void tick_nohz_activate(struct tick_sched *ts) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from irq_enter() to notify about the possible interruption of idle()
 */
void tick_irq_enter(void)
{
	tick_check_oneshot_broadcast_this_cpu();
	tick_nohz_irq_enter();
}

/* Set from the "skew_tick" early parameter; non-zero enables the per-CPU tick offset below */
static int sched_skew_tick;

/* Early-param parser: stores the integer option in sched_skew_tick, always returns 0 */
static int __init skew_tick(char *str)
{
	get_option(&str, &sched_skew_tick);

	return 0;
}
early_param("skew_tick", skew_tick);

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 * @hrtimer: whether to use the hrtimer or not
 */
void tick_setup_sched_timer(bool hrtimer)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	/* Emulate tick processing via per-CPU hrtimers: */
	hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);

	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
		tick_sched_flag_set(ts, TS_FLAG_HIGHRES);

	/* Get the next period (per-CPU) */
	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

	/* Offset the tick to avert 'jiffies_lock' contention. */
	if (sched_skew_tick) {
		/* Spread CPUs over half a tick: (TICK_NSEC / 2 / nr_possible_cpus) * cpu id */
		u64 offset = TICK_NSEC >> 1;
		do_div(offset, num_possible_cpus());
		offset *= smp_processor_id();
		hrtimer_add_expires_ns(&ts->sched_timer, offset);
	}

	hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
	/* Highres: arm the hrtimer; otherwise program the tick device directly */
	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
	else
		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
	tick_nohz_activate(ts);
}

/*
 * Shut down the tick and make sure the CPU won't try to retake the timekeeping
 * duty before disabling IRQs in idle for the last time. 
*/
void tick_sched_timer_dying(int cpu)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	ktime_t idle_sleeptime, iowait_sleeptime;
	unsigned long idle_calls, idle_sleeps;

	/* This must happen before hrtimers are migrated! */
	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
		hrtimer_cancel(&ts->sched_timer);

	/* Carry the idle statistics across the wipe of the per-CPU state */
	idle_sleeptime = ts->idle_sleeptime;
	iowait_sleeptime = ts->iowait_sleeptime;
	idle_calls = ts->idle_calls;
	idle_sleeps = ts->idle_sleeps;
	memset(ts, 0, sizeof(*ts));
	ts->idle_sleeptime = idle_sleeptime;
	ts->iowait_sleeptime = iowait_sleeptime;
	ts->idle_calls = idle_calls;
	ts->idle_sleeps = idle_sleeps;
}

/*
 * Async notification about clocksource changes
 *
 * Sets bit 0 of check_clocks in the tick_sched of every possible CPU.
 */
void tick_clock_notify(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 *
 * Sets bit 0 of check_clocks for the current CPU only.
 */
void tick_oneshot_notify(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	set_bit(0, &ts->check_clocks);
}

/*
 * Check if a change happened, which makes oneshot possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
 * mode, because high resolution timers are disabled (either compile
 * or runtime). Called with interrupts disabled.
 *
 * Returns 1 only when a pending check_clocks notification was consumed,
 * this CPU is not yet in NOHZ mode, timekeeping and the tick device both
 * allow oneshot operation, and 'allow_nohz' is zero. Returns 0 in every
 * other case, including after switching to low-res NOHZ mode here.
 */
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	/* No notification pending for this CPU */
	if (!test_and_clear_bit(0, &ts->check_clocks))
		return 0;

	if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
		return 0;

	if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
		return 0;

	if (!allow_nohz)
		return 1;

	tick_nohz_switch_to_nohz();
	return 0;
}
6 1026 926 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _DELAYED_CALL_H
#define _DELAYED_CALL_H

/*
 * Poor man's closures; I wish we could've done them sanely polymorphic,
 * but...
 */

/* A deferred call: @fn is invoked with @arg; a NULL @fn means "nothing to call" */
struct delayed_call {
	void (*fn)(void *);
	void *arg;
};

/* Define a delayed_call variable @name with both fields NULL (no call pending) */
#define DEFINE_DELAYED_CALL(name) struct delayed_call name = {NULL, NULL}

/* I really wish we had closures with sane typechecking... */

/* Arm @call so that do_delayed_call() will invoke fn(arg) */
static inline void set_delayed_call(struct delayed_call *call,
		void (*fn)(void *), void *arg)
{
	call->fn = fn;
	call->arg = arg;
}

/*
 * Invoke the stored callback with its argument, if one is set. The callback
 * is not cleared afterwards, so calling this again invokes it again.
 */
static inline void do_delayed_call(struct delayed_call *call)
{
	if (call->fn)
		call->fn(call->arg);
}

/* Disarm @call; only fn is reset, arg keeps its previous value */
static inline void clear_delayed_call(struct delayed_call *call)
{
	call->fn = NULL;
}
#endif
5 2 5 4 1 3 5 1 3 2 2 1 7 4 1 7 8 1 1 3 4 52 49 6 52 4 6 46 46 46 3 5 6 44 42 40 3 44 44 2 42 12 12 12 9 9 9 9 6 67 1 3 1 50 9 3 2 1 1 57 2 43 9 2 2 10 3 9 9 46 55 1 55 1 55 55 1 9 9 7 1 8 1 9 498 498 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 
477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB * * Refer to: * draft-ietf-forces-interfelfb-03 * and * netdev01 paper: * 
"Distributing Linux Traffic Control Classifier-Action * Subsystem" * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai * * copyright Jamal Hadi Salim (2015) */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <uapi/linux/tc_act/tc_ife.h>
#include <net/tc_act/tc_ife.h>
#include <linux/etherdevice.h>
#include <net/ife.h>
#include <net/tc_wrapper.h>

/* Exclusive upper bound for the metadata-id loops in this file */
static int max_metacnt = IFE_META_MAX + 1;
static struct tc_action_ops act_ife_ops;

/* Netlink policy for the top-level TCA_IFE_* attributes */
static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
	[TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)},
	[TCA_IFE_DMAC] = { .len = ETH_ALEN},
	[TCA_IFE_SMAC] = { .len = ETH_ALEN},
	[TCA_IFE_TYPE] = { .type = NLA_U16},
};

/*
 * Encode a 16-bit metadatum as a TLV at @skbdata.
 *
 * A value stored in mi->metaval takes precedence over @metaval. A resulting
 * value of zero is not encoded and 0 is returned; otherwise the value is
 * converted with htons() and the result of ife_tlv_meta_encode() is returned.
 */
int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
{
	u16 edata = 0;

	if (mi->metaval)
		edata = *(u16 *)mi->metaval;
	else if (metaval)
		edata = metaval;

	if (!edata) /* will not encode */
		return 0;

	edata = htons(edata);
	return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata);
}
EXPORT_SYMBOL_GPL(ife_encode_meta_u16);

/*
 * Put the metadatum into @skb as attribute mi->metaid: as a u32 when a
 * value is stored in mi->metaval, otherwise as a zero-length attribute.
 * Returns the nla_put*() result.
 */
int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
{
	if (mi->metaval)
		return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval);
	else
		return nla_put(skb, mi->metaid, 0, NULL);
}
EXPORT_SYMBOL_GPL(ife_get_meta_u32);

/*
 * Size a u32 metadatum would take when encoded: 8 if either @metaval is
 * non-zero or a value is stored in mi->metaval, 0 otherwise.
 */
int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
{
	if (metaval || mi->metaval)
		return 8; /* T+L+V == 2+2+4 */

	return 0;
}
EXPORT_SYMBOL_GPL(ife_check_meta_u32);

/*
 * Size a u16 metadatum would take when encoded: 8 (value padded) if either
 * @metaval is non-zero or a value is stored in mi->metaval, 0 otherwise.
 */
int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi)
{
	if (metaval || mi->metaval)
		return 8; /* T+L+(V) == 2+2+(2+2bytepad) */

	return 0;
}
EXPORT_SYMBOL_GPL(ife_check_meta_u16);

/*
 * Encode a 32-bit metadatum as a TLV at @skbdata; the 32-bit counterpart of
 * ife_encode_meta_u16(). A value stored in mi->metaval takes precedence
 * over @metaval, and a zero value is not encoded.
 */
int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
{
	u32 edata = metaval;

	if (mi->metaval)
edata = *(u32 *)mi->metaval;
	else if (metaval)
		edata = metaval;

	if (!edata) /* will not encode */
		return 0;

	edata = htonl(edata);
	return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata);
}
EXPORT_SYMBOL_GPL(ife_encode_meta_u32);

/*
 * Put the metadatum into @skb as attribute mi->metaid: as a u16 when a
 * value is stored in mi->metaval, otherwise as a zero-length attribute.
 * Returns the nla_put*() result.
 */
int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
{
	if (mi->metaval)
		return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval);
	else
		return nla_put(skb, mi->metaid, 0, NULL);
}
EXPORT_SYMBOL_GPL(ife_get_meta_u16);

/*
 * Store a private copy of the u32 at @metaval in mi->metaval, allocated
 * with @gfp. Returns 0 on success or -ENOMEM if the copy cannot be allocated.
 */
int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
{
	mi->metaval = kmemdup(metaval, sizeof(u32), gfp);
	if (!mi->metaval)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);

/*
 * Store a private copy of the u16 at @metaval in mi->metaval, allocated
 * with @gfp. Returns 0 on success or -ENOMEM if the copy cannot be allocated.
 */
int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp)
{
	mi->metaval = kmemdup(metaval, sizeof(u16), gfp);
	if (!mi->metaval)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL_GPL(ife_alloc_meta_u16);

/* Generic release callback: frees mi->metaval (the pointer itself is not reset) */
void ife_release_meta_gen(struct tcf_meta_info *mi)
{
	kfree(mi->metaval);
}
EXPORT_SYMBOL_GPL(ife_release_meta_gen);

/* Accept only a payload of exactly sizeof(u32) bytes; @val is not inspected */
int ife_validate_meta_u32(void *val, int len)
{
	if (len == sizeof(u32))
		return 0;

	return -EINVAL;
}
EXPORT_SYMBOL_GPL(ife_validate_meta_u32);

/* Accept only a payload of exactly sizeof(u16) bytes; @val is not inspected */
int ife_validate_meta_u16(void *val, int len)
{
	/* length will not include padding */
	if (len == sizeof(u16))
		return 0;

	return -EINVAL;
}
EXPORT_SYMBOL_GPL(ife_validate_meta_u16);

/* Registered metadata ops, protected by ife_mod_lock */
static LIST_HEAD(ifeoplist);
static DEFINE_RWLOCK(ife_mod_lock);

/*
 * Look up the registered ops for @metaid under the read lock.
 *
 * On a match a reference on the owning module is taken with
 * try_module_get(); the caller is responsible for the matching module_put().
 * Returns NULL when no ops match or when the module reference cannot be
 * taken.
 */
static struct tcf_meta_ops *find_ife_oplist(u16 metaid)
{
	struct tcf_meta_ops *o;

	read_lock(&ife_mod_lock);
	list_for_each_entry(o, &ifeoplist, list) {
		if (o->metaid == metaid) {
			if (!try_module_get(o->owner))
				o = NULL;
			read_unlock(&ife_mod_lock);
			return o;
		}
	}
	read_unlock(&ife_mod_lock);

	return NULL;
}

/*
 * Register a set of metadata ops.
 *
 * Returns -EINVAL when a mandatory field or callback is missing, -EEXIST
 * when ops with the same metaid or name are already registered, 0 on
 * success. A missing ->release callback defaults to ife_release_meta_gen.
 */
int register_ife_op(struct tcf_meta_ops *mops)
{
	struct tcf_meta_ops *m;

	if (!mops->metaid || !mops->metatype || !mops->name ||
	    !mops->check_presence || !mops->encode || !mops->decode ||
	    !mops->get || !mops->alloc)
		return -EINVAL;

	write_lock(&ife_mod_lock);

	list_for_each_entry(m, &ifeoplist, list) {
		if
(m->metaid == mops->metaid ||
		    (strcmp(mops->name, m->name) == 0)) {
			write_unlock(&ife_mod_lock);
			return -EEXIST;
		}
	}

	if (!mops->release)
		mops->release = ife_release_meta_gen;

	list_add_tail(&mops->list, &ifeoplist);
	write_unlock(&ife_mod_lock);
	return 0;
}
/* NOTE(review): this export names unregister_ife_op and the one after it names register_ife_op; both symbols are exported either way */
EXPORT_SYMBOL_GPL(unregister_ife_op);

/*
 * Unregister metadata ops.
 *
 * Searches the list for an entry with the same metaid as @mops and, when
 * one is found, unlinks @mops itself. Returns 0 on success, -ENOENT when no
 * entry with that metaid is registered.
 */
int unregister_ife_op(struct tcf_meta_ops *mops)
{
	struct tcf_meta_ops *m;
	int err = -ENOENT;

	write_lock(&ife_mod_lock);
	list_for_each_entry(m, &ifeoplist, list) {
		if (m->metaid == mops->metaid) {
			list_del(&mops->list);
			err = 0;
			break;
		}
	}
	write_unlock(&ife_mod_lock);

	return err;
}
EXPORT_SYMBOL_GPL(register_ife_op);

/*
 * Validate a metadatum payload of @len bytes against @ops.
 *
 * An ops-provided ->validate callback takes precedence; otherwise the
 * generic u32/u16 length checks are used according to ops->metatype. Any
 * other metatype is accepted (returns 0).
 */
static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
{
	int ret = 0;
	/* XXX: unfortunately cant use nla_policy at this point
	* because a length of 0 is valid in the case of
	* "allow". "use" semantics do enforce for proper
	* length and i couldve use nla_policy but it makes it hard
	* to use it just for that..
	*/
	if (ops->validate)
		return ops->validate(val, len);

	if (ops->metatype == NLA_U32)
		ret = ife_validate_meta_u32(val, len);
	else if (ops->metatype == NLA_U16)
		ret = ife_validate_meta_u16(val, len);

	return ret;
}

#ifdef CONFIG_MODULES
/* Map a metadata id to the suffix used in the "ife-meta-%s" module request */
static const char *ife_meta_id2name(u32 metaid)
{
	switch (metaid) {
	case IFE_META_SKBMARK:
		return "skbmark";
	case IFE_META_PRIO:
		return "skbprio";
	case IFE_META_TCINDEX:
		return "tcindex";
	default:
		return "unknown";
	}
}
#endif

/* called when adding new meta information */
/*
 * Make sure ops for @metaid are available and that the supplied value is
 * acceptable.
 *
 * When no ops are registered and CONFIG_MODULES is set, the matching
 * "ife-meta-*" module is requested (dropping the RTNL lock around the
 * request if @rtnl_held) and the lookup is retried. A zero @len skips
 * validation. The module reference taken by find_ife_oplist() is dropped
 * before returning.
 *
 * Returns 0 on success, -ENOENT when no ops could be found, or the
 * validation error.
 */
static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held)
{
	struct tcf_meta_ops *ops = find_ife_oplist(metaid);
	int ret = 0;

	if (!ops) {
		ret = -ENOENT;
#ifdef CONFIG_MODULES
		if (rtnl_held)
			rtnl_unlock();
		request_module("ife-meta-%s", ife_meta_id2name(metaid));
		if (rtnl_held)
			rtnl_lock();
		ops = find_ife_oplist(metaid);
#endif
	}

	if (ops) {
		ret = 0;
		if (len)
			ret = ife_validate_metatype(ops, val, len);

		module_put(ops->owner);
	}

	return ret;
}

/* called when adding new meta information */
static int __add_metainfo(const
struct tcf_meta_ops *ops,
			  struct tcf_ife_info *ife, u32 metaid, void *metaval,
			  int len, bool atomic, bool exists)
{
	struct tcf_meta_info *mi = NULL;
	int ret = 0;

	/* @atomic selects GFP_ATOMIC for both allocations below, GFP_KERNEL otherwise */
	mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL);
	if (!mi)
		return -ENOMEM;

	mi->metaid = metaid;
	mi->ops = ops;
	/* A value is only stored when one was supplied (len > 0) */
	if (len > 0) {
		ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL);
		if (ret != 0) {
			kfree(mi);
			return ret;
		}
	}

	/* The list is only locked when the action already exists */
	if (exists)
		spin_lock_bh(&ife->tcf_lock);
	list_add_tail(&mi->metalist, &ife->metalist);
	if (exists)
		spin_unlock_bh(&ife->tcf_lock);

	return ret;
}

/*
 * Take a module reference on @ops and add a value-less metadatum entry for
 * @metaid using atomic allocation. The reference is dropped again when
 * adding fails. Returns -ENOENT when the module reference cannot be taken.
 */
static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops,
				    struct tcf_ife_info *ife, u32 metaid,
				    bool exists)
{
	int ret;

	if (!try_module_get(ops->owner))
		return -ENOENT;
	ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists);
	if (ret)
		module_put(ops->owner);
	return ret;
}

/*
 * Look up the ops for @metaid and add a metadatum entry carrying
 * @metaval/@len, using GFP_KERNEL allocation. Returns -ENOENT when no ops
 * are found; on failure the module reference taken by the lookup is dropped.
 */
static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
			int len, bool exists)
{
	const struct tcf_meta_ops *ops = find_ife_oplist(metaid);
	int ret;

	if (!ops)
		return -ENOENT;
	ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists);
	if (ret)
		/*put back what find_ife_oplist took */
		module_put(ops->owner);
	return ret;
}

/*
 * Add an entry for every registered metadata ops. The walk runs under the
 * ife_mod_lock read lock, which is why the atomic-allocation helper is used.
 * Returns 0 when at least one entry was installed, -EINVAL otherwise.
 */
static int use_all_metadata(struct tcf_ife_info *ife, bool exists)
{
	struct tcf_meta_ops *o;
	int rc = 0;
	int installed = 0;

	read_lock(&ife_mod_lock);
	list_for_each_entry(o, &ifeoplist, list) {
		rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists);
		if (rc == 0)
			installed += 1;
	}
	read_unlock(&ife_mod_lock);

	if (installed)
		return 0;
	else
		return -EINVAL;
}

/*
 * Dump the action's metadata list as a nested TCA_IFE_METALST attribute.
 * Returns 0 on success or when the list is empty; on failure the message
 * is trimmed back to where it started and -1 is returned.
 */
static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
{
	struct tcf_meta_info *e;
	struct nlattr *nest;
	unsigned char *b = skb_tail_pointer(skb);
	int total_encoded = 0;

	/*can only happen on decode */
	if (list_empty(&ife->metalist))
		return 0;

	nest = nla_nest_start_noflag(skb, TCA_IFE_METALST);
	if (!nest)
		goto out_nlmsg_trim;

	list_for_each_entry(e, &ife->metalist, metalist) {
		if (!e->ops->get(skb, e))
			total_encoded += 1;
	}

	if
(!total_encoded)
		goto out_nlmsg_trim;

	nla_nest_end(skb, nest);

	return 0;

out_nlmsg_trim:
	nlmsg_trim(skb, b);
	return -1;
}

/* under ife->tcf_lock */
/*
 * Unlink and free every entry on the action's metadata list: release the
 * stored value (through ops->release when set, kfree() otherwise), drop the
 * module reference held for the entry's ops, then free the entry itself.
 */
static void _tcf_ife_cleanup(struct tc_action *a)
{
	struct tcf_ife_info *ife = to_ife(a);
	struct tcf_meta_info *e, *n;

	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
		list_del(&e->metalist);
		if (e->metaval) {
			if (e->ops->release)
				e->ops->release(e);
			else
				kfree(e->metaval);
		}
		module_put(e->ops->owner);
		kfree(e);
	}
}

/*
 * Action cleanup callback: empty the metadata list under tcf_lock, then
 * free the parameter block (if any) via kfree_rcu().
 */
static void tcf_ife_cleanup(struct tc_action *a)
{
	struct tcf_ife_info *ife = to_ife(a);
	struct tcf_ife_params *p;

	spin_lock_bh(&ife->tcf_lock);
	_tcf_ife_cleanup(a);
	spin_unlock_bh(&ife->tcf_lock);

	p = rcu_dereference_protected(ife->params, 1);
	if (p)
		kfree_rcu(p, rcu);
}

/*
 * For every metadata attribute present in @tb, make sure its ops can be
 * found (loading the module if needed) and that its value validates.
 * Stops at and returns the first error; returns 0 when all pass.
 */
static int load_metalist(struct nlattr **tb, bool rtnl_held)
{
	int i;

	for (i = 1; i < max_metacnt; i++) {
		if (tb[i]) {
			void *val = nla_data(tb[i]);
			int len = nla_len(tb[i]);
			int rc;

			rc = load_metaops_and_vet(i, val, len, rtnl_held);
			if (rc != 0)
				return rc;
		}
	}

	return 0;
}

/*
 * Add a metadata list entry for every attribute present in @tb. Stops at
 * and returns the first error. @rtnl_held is not used by this function.
 */
static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
			     bool exists, bool rtnl_held)
{
	int len = 0;
	int rc = 0;
	int i = 0;
	void *val;

	for (i = 1; i < max_metacnt; i++) {
		if (tb[i]) {
			val = nla_data(tb[i]);
			len = nla_len(tb[i]);

			rc = add_metainfo(ife, i, val, len, exists);
			if (rc)
				return rc;
		}
	}

	return rc;
}

/*
 * Create a new IFE action or replace the configuration of an existing one
 * from the netlink attributes in @nla.
 */
static int tcf_ife_init(struct net *net, struct nlattr *nla,
			struct nlattr *est, struct tc_action **a,
			struct tcf_proto *tp, u32 flags,
			struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id);
	bool bind = flags & TCA_ACT_FLAGS_BIND;
	struct nlattr *tb[TCA_IFE_MAX + 1];
	struct nlattr *tb2[IFE_META_MAX + 1];
	struct tcf_chain *goto_ch = NULL;
	struct tcf_ife_params *p;
	struct tcf_ife_info *ife;
	u16 ife_type = ETH_P_IFE;
	struct tc_ife *parm;
	u8 *daddr = NULL;
	u8 *saddr = NULL;
	bool exists = false;
	int ret = 0;
	u32 index;
	int err;

	if (!nla) {
		NL_SET_ERR_MSG_MOD(extack, "IFE requires attributes to be 
passed");
		return -EINVAL;
	}

	err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy,
					  NULL);
	if (err < 0)
		return err;

	if (!tb[TCA_IFE_PARMS])
		return -EINVAL;

	parm = nla_data(tb[TCA_IFE_PARMS]);

	/* IFE_DECODE is 0 and indicates the opposite of IFE_ENCODE because
	 * they cannot run as the same time. Check on all other values which
	 * are not supported right now.
	 */
	if (parm->flags & ~IFE_ENCODE)
		return -EINVAL;

	/* From here on every error path has to free p */
	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	/* Vet the requested metadata before the action is looked up or created */
	if (tb[TCA_IFE_METALST]) {
		err = nla_parse_nested_deprecated(tb2, IFE_META_MAX,
						  tb[TCA_IFE_METALST], NULL,
						  NULL);
		if (err) {
			kfree(p);
			return err;
		}
		err = load_metalist(tb2, !(flags & TCA_ACT_FLAGS_NO_RTNL));
		if (err) {
			kfree(p);
			return err;
		}
	}

	index = parm->index;
	err = tcf_idr_check_alloc(tn, &index, a, bind);
	if (err < 0) {
		kfree(p);
		return err;
	}
	exists = err;
	if (exists && bind) {
		kfree(p);
		return ACT_P_BOUND;
	}

	if (!exists) {
		ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
				     bind, true, flags);
		if (ret) {
			tcf_idr_cleanup(tn, index);
			kfree(p);
			return ret;
		}
		ret = ACT_P_CREATED;
	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
		/* Existing action and no replace requested */
		tcf_idr_release(*a, bind);
		kfree(p);
		return -EEXIST;
	}

	ife = to_ife(*a);
	/* The metadata list head is only initialized for a newly created action */
	if (ret == ACT_P_CREATED)
		INIT_LIST_HEAD(&ife->metalist);

	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
	if (err < 0)
		goto release_idr;

	p->flags = parm->flags;

	/* The ether type and MAC address attributes are only read when encoding */
	if (parm->flags & IFE_ENCODE) {
		if (tb[TCA_IFE_TYPE])
			ife_type = nla_get_u16(tb[TCA_IFE_TYPE]);
		if (tb[TCA_IFE_DMAC])
			daddr = nla_data(tb[TCA_IFE_DMAC]);
		if (tb[TCA_IFE_SMAC])
			saddr = nla_data(tb[TCA_IFE_SMAC]);
	}

	if (parm->flags & IFE_ENCODE) {
		if (daddr)
			ether_addr_copy(p->eth_dst, daddr);
		else
			eth_zero_addr(p->eth_dst);

		if (saddr)
			ether_addr_copy(p->eth_src, saddr);
		else
			eth_zero_addr(p->eth_src);

		p->eth_type = ife_type;
	}

	if (tb[TCA_IFE_METALST]) {
		err = populate_metalist(ife, tb2, exists,
					!(flags & TCA_ACT_FLAGS_NO_RTNL));
		if (err)
			goto metadata_parse_err;
	} else {
		/* if no passed metadata allow list or 
passed allow-all * then here we process by adding as many supported metadatum * as we can. You better have at least one else we are * going to bail out */
		err = use_all_metadata(ife, exists);
		if (err)
			goto metadata_parse_err;
	}

	if (exists)
		spin_lock_bh(&ife->tcf_lock);
	/* protected by tcf_lock when modifying existing action */
	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
	/* Publish the new parameters; p now holds the previous block (or NULL) */
	p = rcu_replace_pointer(ife->params, p, 1);

	if (exists)
		spin_unlock_bh(&ife->tcf_lock);
	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
	if (p)
		kfree_rcu(p, rcu);

	return ret;
metadata_parse_err:
	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
release_idr:
	kfree(p);
	tcf_idr_release(*a, bind);
	return err;
}

/*
 * Dump the action configuration into @skb: parameters, timestamps, the
 * non-zero MAC addresses, the ether type and the metadata list. Runs under
 * tcf_lock. Returns skb->len on success; on failure the message is trimmed
 * back to where it started and -1 is returned.
 */
static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
			int ref)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tcf_ife_info *ife = to_ife(a);
	struct tcf_ife_params *p;
	struct tc_ife opt = {
		.index = ife->tcf_index,
		.refcnt = refcount_read(&ife->tcf_refcnt) - ref,
		.bindcnt = atomic_read(&ife->tcf_bindcnt) - bind,
	};
	struct tcf_t t;

	spin_lock_bh(&ife->tcf_lock);
	opt.action = ife->tcf_action;
	p = rcu_dereference_protected(ife->params,
				      lockdep_is_held(&ife->tcf_lock));
	opt.flags = p->flags;

	if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

	tcf_tm_dump(&t, &ife->tcf_tm);
	if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD))
		goto nla_put_failure;

	/* All-zero addresses are not dumped */
	if (!is_zero_ether_addr(p->eth_dst)) {
		if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst))
			goto nla_put_failure;
	}

	if (!is_zero_ether_addr(p->eth_src)) {
		if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src))
			goto nla_put_failure;
	}

	if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type))
		goto nla_put_failure;

	if (dump_metalist(skb, ife)) {
		/*ignore failure to dump metalist */
		pr_info("Failed to dump metalist\n");
	}

	spin_unlock_bh(&ife->tcf_lock);
	return skb->len;

nla_put_failure:
	spin_unlock_bh(&ife->tcf_lock);
	nlmsg_trim(skb, b);
	return -1;
}

static int find_decode_metaid(struct sk_buff *skb, struct
tcf_ife_info *ife,
			      u16 metaid, u16 mlen, void *mdata)
{
	struct tcf_meta_info *e;

	/* XXX: use hash to speed up */
	list_for_each_entry(e, &ife->metalist, metalist) {
		if (metaid == e->metaid) {
			if (e->ops) {
				/* We check for decode presence  already */
				return e->ops->decode(skb, mdata, mlen);
			}
		}
	}

	/* No list entry with usable ops for this metaid */
	return -ENOENT;
}

/*
 * Decode path of the action: strip the IFE header from @skb and hand every
 * metadata TLV to the matching decode op. Packets whose IFE header or TLVs
 * cannot be parsed are counted as drops and TC_ACT_SHOT is returned;
 * otherwise the configured action is returned.
 */
static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
			  struct tcf_result *res)
{
	struct tcf_ife_info *ife = to_ife(a);
	int action = ife->tcf_action;
	u8 *ifehdr_end;
	u8 *tlv_data;
	u16 metalen;

	bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
	tcf_lastuse_update(&ife->tcf_tm);

	if (skb_at_tc_ingress(skb))
		skb_push(skb, skb->dev->hard_header_len);

	tlv_data = ife_decode(skb, &metalen);
	if (unlikely(!tlv_data)) {
		qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
		return TC_ACT_SHOT;
	}

	ifehdr_end = tlv_data + metalen;
	for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) {
		u8 *curr_data;
		u16 mtype;
		u16 dlen;

		curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype,
						&dlen, NULL);
		if (!curr_data) {
			qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
			return TC_ACT_SHOT;
		}

		if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) {
			/* abuse overlimits to count when we receive metadata
			 * but dont have an ops for it
			 */
			pr_info_ratelimited("Unknown metaid %d dlen %d\n",
					    mtype, dlen);
			qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats));
		}
	}

	/* The TLV walk must end exactly at the end of the metadata area */
	if (WARN_ON(tlv_data != ifehdr_end)) {
		qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
		return TC_ACT_SHOT;
	}

	skb->protocol = eth_type_trans(skb, skb->dev);
	skb_reset_network_header(skb);

	return action;
}

/*XXX: check if we can do this at install time instead of current
 * send data path
**/
/* Sum of what every list entry's check_presence op reports for @skb */
static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
{
	struct tcf_meta_info *e, *n;
	int tot_run_sz = 0, run_sz = 0;

	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
		if (e->ops->check_presence) {
			run_sz = e->ops->check_presence(skb, e);
			tot_run_sz += run_sz;
		}
}

	return tot_run_sz;
}

/*
 * Encode path of the action: prepend an IFE header carrying the metadata
 * TLVs and rewrite the outer ethernet header from the parameters in @p.
 */
static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
			  struct tcf_result *res, struct tcf_ife_params *p)
{
	struct tcf_ife_info *ife = to_ife(a);
	int action = ife->tcf_action;
	struct ethhdr *oethh;	/* outer ether header */
	struct tcf_meta_info *e;
	/*
	   OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
	   where ORIGDATA = original ethernet header ...
	 */
	u16 metalen = ife_get_sz(skb, ife);
	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
	unsigned int skboff = 0;
	int new_len = skb->len + hdrm;
	bool exceed_mtu = false;
	void *ife_meta;
	int err = 0;

	/* The MTU check only applies when not at tc ingress */
	if (!skb_at_tc_ingress(skb)) {
		if (new_len > skb->dev->mtu)
			exceed_mtu = true;
	}

	bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb);
	tcf_lastuse_update(&ife->tcf_tm);

	if (!metalen) {		/* no metadata to send */
		/* abuse overlimits to count when we allow packet
		 * with no metadata
		 */
		qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats));
		return action;
	}
	/* could be stupid policy setup or mtu config
	 * so lets be conservative.. */
	if ((action == TC_ACT_SHOT) || exceed_mtu) {
		qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
		return TC_ACT_SHOT;
	}

	if (skb_at_tc_ingress(skb))
		skb_push(skb, skb->dev->hard_header_len);

	ife_meta = ife_encode(skb, metalen);

	spin_lock(&ife->tcf_lock);

	/* XXX: we dont have a clever way of telling encode to
	 * not repeat some of the computations that are done by
	 * ops->presence_check... 
*/
	list_for_each_entry(e, &ife->metalist, metalist) {
		if (e->ops->encode) {
			err = e->ops->encode(skb, (void *)(ife_meta + skboff),
					     e);
		}
		if (err < 0) {
			/* too corrupt to keep around if overwritten */
			spin_unlock(&ife->tcf_lock);
			qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats));
			return TC_ACT_SHOT;
		}
		/* A non-negative result is the number of bytes written; advance past it */
		skboff += err;
	}
	spin_unlock(&ife->tcf_lock);
	oethh = (struct ethhdr *)skb->data;

	/* All-zero configured addresses leave the outer header's address untouched */
	if (!is_zero_ether_addr(p->eth_src))
		ether_addr_copy(oethh->h_source, p->eth_src);
	if (!is_zero_ether_addr(p->eth_dst))
		ether_addr_copy(oethh->h_dest, p->eth_dst);
	oethh->h_proto = htons(p->eth_type);

	if (skb_at_tc_ingress(skb))
		skb_pull(skb, skb->dev->hard_header_len);

	return action;
}

/*
 * Action entry point: dispatch to encode when IFE_ENCODE is set in the
 * current parameters, to decode otherwise.
 */
TC_INDIRECT_SCOPE int tcf_ife_act(struct sk_buff *skb,
				  const struct tc_action *a,
				  struct tcf_result *res)
{
	struct tcf_ife_info *ife = to_ife(a);
	struct tcf_ife_params *p;
	int ret;

	p = rcu_dereference_bh(ife->params);
	if (p->flags & IFE_ENCODE) {
		ret = tcf_ife_encode(skb, a, res, p);
		return ret;
	}

	return tcf_ife_decode(skb, a, res);
}

/* Action ops registered for the "ife" action kind */
static struct tc_action_ops act_ife_ops = {
	.kind = "ife",
	.id = TCA_ID_IFE,
	.owner = THIS_MODULE,
	.act = tcf_ife_act,
	.dump = tcf_ife_dump,
	.cleanup = tcf_ife_cleanup,
	.init = tcf_ife_init,
	.size =	sizeof(struct tcf_ife_info),
};
MODULE_ALIAS_NET_ACT("ife");

/* Per-netns init: set up this namespace's action table for act_ife_ops */
static __net_init int ife_init_net(struct net *net)
{
	struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id);

	return tc_action_net_init(net, tn, &act_ife_ops);
}

/* Per-netns batched exit: tear down the action tables of the namespaces in @net_list */
static void __net_exit ife_exit_net(struct list_head *net_list)
{
	tc_action_net_exit(net_list, act_ife_ops.net_id);
}

static struct pernet_operations ife_net_ops = {
	.init = ife_init_net,
	.exit_batch = ife_exit_net,
	.id   = &act_ife_ops.net_id,
	.size = sizeof(struct tc_action_net),
};

/* Module init: register the action together with its pernet operations */
static int __init ife_init_module(void)
{
	return tcf_register_action(&act_ife_ops, &ife_net_ops);
}

/* Module exit: unregister what ife_init_module() registered */
static void __exit ife_cleanup_module(void)
{
	tcf_unregister_action(&act_ife_ops, &ife_net_ops);
}

module_init(ife_init_module);
module_exit(ife_cleanup_module);
MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE LFB action"); MODULE_LICENSE("GPL");
755 536 537 535 537 536 121 121 61 18 7 1 7 1 4 1 7 314 299 18 290 98 98 301 112 1 16 113 113 1 1 112 61 31 3 40 20 22 40 40 25 21 61 31 1 60 3 17 1 2 60 56 4 19 18 1 18 1 18 1 70 62 61 19 1 17 17 3 9 3 6 6 7 85 85 2 6 87 1 7 79 8 6 6 9 3 8 18 1 17 18 18 17 19 15 5 1 6 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 
460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 
960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This abstraction carries sctp events to the ULP (sockets). * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * La Monte H.P. Yarroll <piggy@acm.org> * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/busy_poll.h> #include <net/sctp/structs.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal helpers. 
*/ static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *); static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *, struct sctp_ulpevent *); static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq); /* 1st Level Abstractions */ /* Initialize a ULP queue from a block of memory. */ void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc) { memset(ulpq, 0, sizeof(struct sctp_ulpq)); ulpq->asoc = asoc; skb_queue_head_init(&ulpq->reasm); skb_queue_head_init(&ulpq->reasm_uo); skb_queue_head_init(&ulpq->lobby); ulpq->pd_mode = 0; } /* Flush the reassembly and ordering queues. */ void sctp_ulpq_flush(struct sctp_ulpq *ulpq) { struct sk_buff *skb; struct sctp_ulpevent *event; while ((skb = __skb_dequeue(&ulpq->lobby)) != NULL) { event = sctp_skb2event(skb); sctp_ulpevent_free(event); } while ((skb = __skb_dequeue(&ulpq->reasm)) != NULL) { event = sctp_skb2event(skb); sctp_ulpevent_free(event); } while ((skb = __skb_dequeue(&ulpq->reasm_uo)) != NULL) { event = sctp_skb2event(skb); sctp_ulpevent_free(event); } } /* Dispose of a ulpqueue. */ void sctp_ulpq_free(struct sctp_ulpq *ulpq) { sctp_ulpq_flush(ulpq); } /* Process an incoming DATA chunk. */ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sk_buff_head temp; struct sctp_ulpevent *event; int event_eor = 0; /* Create an event from the incoming chunk. */ event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); if (!event) return -ENOMEM; event->ssn = ntohs(chunk->subh.data_hdr->ssn); event->ppid = chunk->subh.data_hdr->ppid; /* Do reassembly if needed. */ event = sctp_ulpq_reasm(ulpq, event); /* Do ordering if needed. */ if (event) { /* Create a temporary list to collect chunks on. */ skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); if (event->msg_flags & MSG_EOR) event = sctp_ulpq_order(ulpq, event); } /* Send event to the ULP. 
'event' is the sctp_ulpevent for * very first SKB on the 'temp' list. */ if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; sctp_ulpq_tail_event(ulpq, &temp); } return event_eor; } /* Add a new event for propagation to the ULP. */ /* Clear the partial delivery mode for this socket. Note: This * assumes that no association is currently in partial delivery mode. */ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc) { struct sctp_sock *sp = sctp_sk(sk); if (atomic_dec_and_test(&sp->pd_mode)) { /* This means there are no other associations in PD, so * we can go ahead and clear out the lobby in one shot */ if (!skb_queue_empty(&sp->pd_lobby)) { skb_queue_splice_tail_init(&sp->pd_lobby, &sk->sk_receive_queue); return 1; } } else { /* There are other associations in PD, so we only need to * pull stuff out of the lobby that belongs to the * associations that is exiting PD (all of its notifications * are posted here). */ if (!skb_queue_empty(&sp->pd_lobby) && asoc) { struct sk_buff *skb, *tmp; struct sctp_ulpevent *event; sctp_skb_for_each(skb, &sp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == asoc) { __skb_unlink(skb, &sp->pd_lobby); __skb_queue_tail(&sk->sk_receive_queue, skb); } } } } return 0; } /* Set the pd_mode on the socket and ulpq */ static void sctp_ulpq_set_pd(struct sctp_ulpq *ulpq) { struct sctp_sock *sp = sctp_sk(ulpq->asoc->base.sk); atomic_inc(&sp->pd_mode); ulpq->pd_mode = 1; } /* Clear the pd_mode and restart any pending messages waiting for delivery. 
*/ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) { ulpq->pd_mode = 0; sctp_ulpq_reasm_drain(ulpq); return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc); } int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct sctp_ulpevent *event; struct sk_buff_head *queue; struct sk_buff *skb; int clear_pd = 0; skb = __skb_peek(skb_list); event = sctp_skb2event(skb); /* If the socket is just going to throw this away, do not * even try to deliver it. */ if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } /* Check if the user wishes to receive this event. */ if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; /* If we are in partial delivery mode, post to the lobby until * partial delivery is cleared, unless, of course _this_ is * the association the cause of the partial delivery. */ if (atomic_read(&sp->pd_mode) == 0) { queue = &sk->sk_receive_queue; } else { if (ulpq->pd_mode) { /* If the association is in partial delivery, we * need to finish delivering the partially processed * packet before passing any other data. This is * because we don't truly support stream interleaving. */ if ((event->msg_flags & MSG_NOTIFICATION) || (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK))) queue = &sp->pd_lobby; else { clear_pd = event->msg_flags & MSG_EOR; queue = &sk->sk_receive_queue; } } else { /* * If fragment interleave is enabled, we * can queue this to the receive queue instead * of the lobby. */ if (sp->frag_interleave) queue = &sk->sk_receive_queue; else queue = &sp->pd_lobby; } } skb_queue_splice_tail_init(skb_list, queue); /* Did we just complete partial delivery and need to get * rolling again? Move pending data to the receive * queue. 
*/ if (clear_pd) sctp_ulpq_clear_pd(ulpq); if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { if (!sock_owned_by_user(sk)) sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; out_free: sctp_queue_purge_ulpevents(skb_list); return 0; } /* 2nd Level Abstractions */ /* Helper function to store chunks that need to be reassembled. */ static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *pos; struct sctp_ulpevent *cevent; __u32 tsn, ctsn; tsn = event->tsn; /* See if it belongs at the end. */ pos = skb_peek_tail(&ulpq->reasm); if (!pos) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } /* Short circuit just dropping it at the end. */ cevent = sctp_skb2event(pos); ctsn = cevent->tsn; if (TSN_lt(ctsn, tsn)) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } /* Find the right place in this list. We store them by TSN. */ skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); ctsn = cevent->tsn; if (TSN_lt(tsn, ctsn)) break; } /* Insert before pos. */ __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event)); } /* Helper function to return an event corresponding to the reassembled * datagram. * This routine creates a re-assembled skb given the first and last skb's * as stored in the reassembly queue. The skb's may be non-linear if the sctp * payload was fragmented on the way and ip had to reassemble them. * We add the rest of skb's to the first skb's fraglist. */ struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net, struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag) { struct sk_buff *pos; struct sk_buff *new = NULL; struct sctp_ulpevent *event; struct sk_buff *pnext, *last; struct sk_buff *list = skb_shinfo(f_frag)->frag_list; /* Store the pointer to the 2nd skb */ if (f_frag == l_frag) pos = NULL; else pos = f_frag->next; /* Get the last skb in the f_frag's frag_list if present. 
*/ for (last = list; list; last = list, list = list->next) ; /* Add the list of remaining fragments to the first fragments * frag_list. */ if (last) last->next = pos; else { if (skb_cloned(f_frag)) { /* This is a cloned skb, we can't just modify * the frag_list. We need a new skb to do that. * Instead of calling skb_unshare(), we'll do it * ourselves since we need to delay the free. */ new = skb_copy(f_frag, GFP_ATOMIC); if (!new) return NULL; /* try again later */ sctp_skb_set_owner_r(new, f_frag->sk); skb_shinfo(new)->frag_list = pos; } else skb_shinfo(f_frag)->frag_list = pos; } /* Remove the first fragment from the reassembly queue. */ __skb_unlink(f_frag, queue); /* if we did unshare, then free the old skb and re-assign */ if (new) { kfree_skb(f_frag); f_frag = new; } while (pos) { pnext = pos->next; /* Update the len and data_len fields of the first fragment. */ f_frag->len += pos->len; f_frag->data_len += pos->len; /* Remove the fragment from the reassembly queue. */ __skb_unlink(pos, queue); /* Break if we have reached the last fragment. */ if (pos == l_frag) break; pos->next = pnext; pos = pnext; } event = sctp_skb2event(f_frag); SCTP_INC_STATS(net, SCTP_MIB_REASMUSRMSGS); return event; } /* Helper function to check if an incoming chunk has filled up the last * missing fragment in a SCTP datagram and return the corresponding event. */ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ulpq) { struct sk_buff *pos; struct sctp_ulpevent *cevent; struct sk_buff *first_frag = NULL; __u32 ctsn, next_tsn; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; size_t pd_len = 0; struct sctp_association *asoc; u32 pd_point; /* Initialized to 0 just to avoid compiler warning message. Will * never be used with this value. It is referenced only after it * is set when we find the first fragment of a message. */ next_tsn = 0; /* The chunks are held in the reasm queue sorted by TSN. 
* Walk through the queue sequentially and look for a sequence of * fragmented chunks that complete a datagram. * 'first_frag' and next_tsn are reset when we find a chunk which * is the first fragment of a datagram. Once these 2 fields are set * we expect to find the remaining middle fragments and the last * fragment in order. If not, first_frag is reset to NULL and we * start the next pass when we find another first fragment. * * There is a potential to do partial delivery if user sets * SCTP_PARTIAL_DELIVERY_POINT option. Lets count some things here * to see if can do PD. */ skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); ctsn = cevent->tsn; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: /* If this "FIRST_FRAG" is the first * element in the queue, then count it towards * possible PD. */ if (skb_queue_is_first(&ulpq->reasm, pos)) { pd_first = pos; pd_last = pos; pd_len = pos->len; } else { pd_first = NULL; pd_last = NULL; pd_len = 0; } first_frag = pos; next_tsn = ctsn + 1; break; case SCTP_DATA_MIDDLE_FRAG: if ((first_frag) && (ctsn == next_tsn)) { next_tsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else first_frag = NULL; break; case SCTP_DATA_LAST_FRAG: if (first_frag && (ctsn == next_tsn)) goto found; else first_frag = NULL; break; } } asoc = ulpq->asoc; if (pd_first) { /* Make sure we can enter partial deliver. * We can trigger partial delivery only if framgent * interleave is set, or the socket is not already * in partial delivery. 
*/ if (!sctp_sk(asoc->base.sk)->frag_interleave && atomic_read(&sctp_sk(asoc->base.sk)->pd_mode)) goto done; cevent = sctp_skb2event(pd_first); pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) sctp_ulpq_set_pd(ulpq); } } done: return retval; found: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; goto done; } /* Retrieve the next set of fragments of a partial message. */ static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq) { struct sk_buff *pos, *last_frag, *first_frag; struct sctp_ulpevent *cevent; __u32 ctsn, next_tsn; int is_last; struct sctp_ulpevent *retval; /* The chunks are held in the reasm queue sorted by TSN. * Walk through the queue sequentially and look for the first * sequence of fragmented chunks. */ if (skb_queue_empty(&ulpq->reasm)) return NULL; last_frag = first_frag = NULL; retval = NULL; next_tsn = 0; is_last = 0; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); ctsn = cevent->tsn; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!first_frag) return NULL; goto done; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { first_frag = pos; next_tsn = ctsn + 1; last_frag = pos; } else if (next_tsn == ctsn) { next_tsn++; last_frag = pos; } else goto done; break; case SCTP_DATA_LAST_FRAG: if (!first_frag) first_frag = pos; else if (ctsn != next_tsn) goto done; last_frag = pos; is_last = 1; goto done; default: return NULL; } } /* We have the reassembled event. There is no need to look * further. */ done: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval && is_last) retval->msg_flags |= MSG_EOR; return retval; } /* Helper function to reassemble chunks. Hold chunks on the reasm queue that * need reassembling. 
*/ static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; /* Check if this is part of a fragmented message. */ if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_ulpq_store_reasm(ulpq, event); if (!ulpq->pd_mode) retval = sctp_ulpq_retrieve_reassembled(ulpq); else { __u32 ctsn, ctsnap; /* Do not even bother unless this is the next tsn to * be delivered. */ ctsn = event->tsn; ctsnap = sctp_tsnmap_get_ctsn(&ulpq->asoc->peer.tsn_map); if (TSN_lte(ctsn, ctsnap)) retval = sctp_ulpq_retrieve_partial(ulpq); } return retval; } /* Retrieve the first part (sequential fragments) for partial delivery. */ static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq) { struct sk_buff *pos, *last_frag, *first_frag; struct sctp_ulpevent *cevent; __u32 ctsn, next_tsn; struct sctp_ulpevent *retval; /* The chunks are held in the reasm queue sorted by TSN. * Walk through the queue sequentially and look for a sequence of * fragmented chunks that start a datagram. */ if (skb_queue_empty(&ulpq->reasm)) return NULL; last_frag = first_frag = NULL; retval = NULL; next_tsn = 0; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); ctsn = cevent->tsn; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!first_frag) { first_frag = pos; next_tsn = ctsn + 1; last_frag = pos; } else goto done; break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) return NULL; if (ctsn == next_tsn) { next_tsn++; last_frag = pos; } else goto done; break; case SCTP_DATA_LAST_FRAG: if (!first_frag) return NULL; else goto done; break; default: return NULL; } } /* We have the reassembled event. There is no need to look * further. 
*/ done: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); return retval; } /* * Flush out stale fragments from the reassembly queue when processing * a Forward TSN. * * RFC 3758, Section 3.6 * * After receiving and processing a FORWARD TSN, the data receiver MUST * take cautions in updating its re-assembly queue. The receiver MUST * remove any partially reassembled message, which is still missing one * or more TSNs earlier than or equal to the new cumulative TSN point. * In the event that the receiver has invoked the partial delivery API, * a notification SHOULD also be generated to inform the upper layer API * that the message being partially delivered will NOT be completed. */ void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn) { struct sk_buff *pos, *tmp; struct sctp_ulpevent *event; __u32 tsn; if (skb_queue_empty(&ulpq->reasm)) return; skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { event = sctp_skb2event(pos); tsn = event->tsn; /* Since the entire message must be abandoned by the * sender (item A3 in Section 3.5, RFC 3758), we can * free all fragments on the list that are less then * or equal to ctsn_point */ if (TSN_lte(tsn, fwd_tsn)) { __skb_unlink(pos, &ulpq->reasm); sctp_ulpevent_free(event); } else break; } } /* * Drain the reassembly queue. If we just cleared parted delivery, it * is possible that the reassembly queue will contain already reassembled * messages. Retrieve any such messages and give them to the user. */ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq) { struct sctp_ulpevent *event = NULL; if (skb_queue_empty(&ulpq->reasm)) return; while ((event = sctp_ulpq_retrieve_reassembled(ulpq)) != NULL) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); /* Do ordering if needed. */ if (event->msg_flags & MSG_EOR) event = sctp_ulpq_order(ulpq, event); /* Send event to the ULP. 
'event' is the * sctp_ulpevent for very first SKB on the temp' list. */ if (event) sctp_ulpq_tail_event(ulpq, &temp); } } /* Helper function to gather skbs that have possibly become * ordered by an incoming chunk. */ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head *event_list; struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; struct sctp_stream *stream; __u16 sid, csid, cssn; sid = event->stream; stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; /* We are holding the chunks by stream, by SSN. */ sctp_skb_for_each(pos, &ulpq->lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; /* Have we gone too far? */ if (csid > sid) break; /* Have we not gone far enough? */ if (csid < sid) continue; if (cssn != sctp_ssn_peek(stream, in, sid)) break; /* Found it, so mark in the stream. */ sctp_ssn_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); /* Attach all gathered skbs to the event. */ __skb_queue_tail(event_list, pos); } } /* Helper function to store chunks needing ordering. */ static void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *pos; struct sctp_ulpevent *cevent; __u16 sid, csid; __u16 ssn, cssn; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } sid = event->stream; ssn = event->ssn; cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; if (sid > csid) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } if ((sid == csid) && SSN_lt(cssn, ssn)) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } /* Find the right place in this list. We store them by * stream ID and then by SSN. 
*/ skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; if (csid > sid) break; if (csid == sid && SSN_lt(ssn, cssn)) break; } /* Insert before pos. */ __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { __u16 sid, ssn; struct sctp_stream *stream; /* Check if this message needs ordering. */ if (event->msg_flags & SCTP_DATA_UNORDERED) return event; /* Note: The stream ID must be verified before this routine. */ sid = event->stream; ssn = event->ssn; stream = &ulpq->asoc->stream; /* Is this the expected SSN for this stream ID? */ if (ssn != sctp_ssn_peek(stream, in, sid)) { /* We've received something out of order, so find where it * needs to be placed. We order by stream and then by SSN. */ sctp_ulpq_store_ordered(ulpq, event); return NULL; } /* Mark that the next chunk has been found. */ sctp_ssn_next(stream, in, sid); /* Go find any other chunks that were waiting for * ordering. */ sctp_ulpq_retrieve_ordered(ulpq, event); return event; } /* Helper function to gather skbs that have possibly become * ordered by forward tsn skipping their dependencies. */ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) { struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; struct sctp_ulpevent *event; struct sctp_stream *stream; struct sk_buff_head temp; struct sk_buff_head *lobby = &ulpq->lobby; __u16 csid, cssn; stream = &ulpq->asoc->stream; /* We are holding the chunks by stream, by SSN. */ skb_queue_head_init(&temp); event = NULL; sctp_skb_for_each(pos, lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; /* Have we gone too far? */ if (csid > sid) break; /* Have we not gone far enough? 
*/ if (csid < sid) continue; /* see if this ssn has been marked by skipping */ if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); if (!event) /* Create a temporary list to collect chunks on. */ event = sctp_skb2event(pos); /* Attach all gathered skbs to the event. */ __skb_queue_tail(&temp, pos); } /* If we didn't reap any data, see if the next expected SSN * is next on the queue and if so, use that. */ if (event == NULL && pos != (struct sk_buff *)lobby) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) { sctp_ssn_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); } } /* Send event to the ULP. 'event' is the sctp_ulpevent for * very first SKB on the 'temp' list. */ if (event) { /* see if we have more ordered that we can deliver */ sctp_ulpq_retrieve_ordered(ulpq, event); sctp_ulpq_tail_event(ulpq, &temp); } } /* Skip over an SSN. This is used during the processing of * Forwared TSN chunk to skip over the abandoned ordered data */ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) { struct sctp_stream *stream; /* Note: The stream ID must be verified before this routine. */ stream = &ulpq->asoc->stream; /* Is this an old SSN? If so ignore. */ if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid))) return; /* Mark that we are no longer expecting this SSN or lower. */ sctp_ssn_skip(stream, in, sid, ssn); /* Go find any other chunks that were waiting for * ordering and deliver them if needed. 
*/ sctp_ulpq_reap_ordered(ulpq, sid); } __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list, __u16 needed) { __u16 freed = 0; __u32 tsn, last_tsn; struct sk_buff *skb, *flist, *last; struct sctp_ulpevent *event; struct sctp_tsnmap *tsnmap; tsnmap = &ulpq->asoc->peer.tsn_map; while ((skb = skb_peek_tail(list)) != NULL) { event = sctp_skb2event(skb); tsn = event->tsn; /* Don't renege below the Cumulative TSN ACK Point. */ if (TSN_lte(tsn, sctp_tsnmap_get_ctsn(tsnmap))) break; /* Events in ordering queue may have multiple fragments * corresponding to additional TSNs. Sum the total * freed space; find the last TSN. */ freed += skb_headlen(skb); flist = skb_shinfo(skb)->frag_list; for (last = flist; flist; flist = flist->next) { last = flist; freed += skb_headlen(last); } if (last) last_tsn = sctp_skb2event(last)->tsn; else last_tsn = tsn; /* Unlink the event, then renege all applicable TSNs. */ __skb_unlink(skb, list); sctp_ulpevent_free(event); while (TSN_lte(tsn, last_tsn)) { sctp_tsnmap_renege(tsnmap, tsn); tsn++; } if (freed >= needed) return freed; } return freed; } /* Renege 'needed' bytes from the ordering queue. */ static __u16 sctp_ulpq_renege_order(struct sctp_ulpq *ulpq, __u16 needed) { return sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); } /* Renege 'needed' bytes from the reassembly queue. */ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed) { return sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); } /* Partial deliver the first message as there is pressure on rwnd. */ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_association *asoc; struct sctp_sock *sp; __u32 ctsn; struct sk_buff *skb; asoc = ulpq->asoc; sp = sctp_sk(asoc->base.sk); /* If the association is already in Partial Delivery mode * we have nothing to do. */ if (ulpq->pd_mode) return; /* Data must be at or below the Cumulative TSN ACK Point to * start partial delivery. 
*/ skb = skb_peek(&asoc->ulpq.reasm); if (skb != NULL) { ctsn = sctp_skb2event(skb)->tsn; if (!TSN_lte(ctsn, sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map))) return; } /* If the user enabled fragment interleave socket option, * multiple associations can enter partial delivery. * Otherwise, we can only enter partial delivery if the * socket is not in partial deliver mode. */ if (sp->frag_interleave || atomic_read(&sp->pd_mode) == 0) { /* Is partial delivery possible? */ event = sctp_ulpq_retrieve_first(ulpq); /* Send event to the ULP. */ if (event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_ulpq_tail_event(ulpq, &temp); sctp_ulpq_set_pd(ulpq); return; } } } /* Renege some packets to make room for an incoming chunk. */ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc = ulpq->asoc; __u32 freed = 0; __u16 needed; needed = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_data_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_order(ulpq, needed); if (freed < needed) freed += sctp_ulpq_renege_frags(ulpq, needed - freed); } /* If able to free enough room, accept this chunk. */ if (sk_rmem_schedule(asoc->base.sk, chunk->skb, needed) && freed >= needed) { int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); /* * Enter partial delivery if chunk has not been * delivered; otherwise, drain the reassembly queue. */ if (retval <= 0) sctp_ulpq_partial_delivery(ulpq, gfp); else if (retval == 1) sctp_ulpq_reasm_drain(ulpq); } } /* Notify the application if an association is aborted and in * partial delivery mode. Send up any pending received messages. 
*/
void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
{
	struct sctp_association *asoc;
	struct sctp_ulpevent *notice = NULL;
	struct sctp_sock *sp;
	struct sock *sk;
	int moved;

	/* Nothing to abort unless this ulpq is in partial delivery. */
	if (!ulpq->pd_mode)
		return;

	asoc = ulpq->asoc;
	sk = asoc->base.sk;
	sp = sctp_sk(sk);

	/* Queue a PARTIAL_DELIVERY_ABORTED notification if the user
	 * subscribed to partial delivery events and one can be allocated.
	 */
	if (sctp_ulpevent_type_enabled(asoc->subscribe,
				       SCTP_PARTIAL_DELIVERY_EVENT)) {
		notice = sctp_ulpevent_make_pdapi(asoc,
						  SCTP_PARTIAL_DELIVERY_ABORTED,
						  0, 0, 0, gfp);
		if (notice)
			__skb_queue_tail(&sk->sk_receive_queue,
					 sctp_event2skb(notice));
	}

	/* Leaving PD mode is unconditional; it also reports whether data
	 * is now waiting.
	 */
	moved = sctp_ulpq_clear_pd(ulpq);

	/* If there is data waiting, send it up the socket now. */
	if (!moved && !notice)
		return;
	if (sp->data_ready_signalled)
		return;

	sp->data_ready_signalled = 1;
	sk->sk_data_ready(sk);
}
143 144 144 144 144 148 148 1 126 58 145 7 144 143 28 12 144 144 148 144 144 40 144 144 498 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * This is a replacement of the old ipt_recent module, which carried the * following copyright notice: * * Author: Stephen Frost <sfrost@snowman.net> * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/inet.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> 
#include <linux/netfilter/xt_recent.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_recent"); MODULE_ALIAS("ip6t_recent"); static unsigned int ip_list_tot __read_mostly = 100; static unsigned int ip_list_hash_size __read_mostly; static unsigned int ip_list_perms __read_mostly = 0644; static unsigned int ip_list_uid __read_mostly; static unsigned int ip_list_gid __read_mostly; module_param(ip_list_tot, uint, 0400); module_param(ip_list_hash_size, uint, 0400); module_param(ip_list_perms, uint, 0400); module_param(ip_list_uid, uint, 0644); module_param(ip_list_gid, uint, 0644); MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files"); MODULE_PARM_DESC(ip_list_uid, "default owner of /proc/net/xt_recent/* files"); MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* files"); /* retained for backwards compatibility */ static unsigned int ip_pkt_list_tot __read_mostly; module_param(ip_pkt_list_tot, uint, 0400); MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 
65535)"); #define XT_RECENT_MAX_NSTAMPS 65536 struct recent_entry { struct list_head list; struct list_head lru_list; union nf_inet_addr addr; u_int16_t family; u_int8_t ttl; u_int16_t index; u_int16_t nstamps; unsigned long stamps[]; }; struct recent_table { struct list_head list; char name[XT_RECENT_NAME_LEN]; union nf_inet_addr mask; unsigned int refcnt; unsigned int entries; u_int16_t nstamps_max_mask; struct list_head lru_list; struct list_head iphash[]; }; struct recent_net { struct list_head tables; #ifdef CONFIG_PROC_FS struct proc_dir_entry *xt_recent; #endif }; static unsigned int recent_net_id __read_mostly; static inline struct recent_net *recent_pernet(struct net *net) { return net_generic(net, recent_net_id); } static DEFINE_SPINLOCK(recent_lock); static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS static const struct proc_ops recent_mt_proc_ops; #endif static u_int32_t hash_rnd __read_mostly; static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr) { return jhash_1word((__force u32)addr->ip, hash_rnd) & (ip_list_hash_size - 1); } static inline unsigned int recent_entry_hash6(const union nf_inet_addr *addr) { return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) & (ip_list_hash_size - 1); } static struct recent_entry * recent_entry_lookup(const struct recent_table *table, const union nf_inet_addr *addrp, u_int16_t family, u_int8_t ttl) { struct recent_entry *e; unsigned int h; if (family == NFPROTO_IPV4) h = recent_entry_hash4(addrp); else h = recent_entry_hash6(addrp); list_for_each_entry(e, &table->iphash[h], list) if (e->family == family && memcmp(&e->addr, addrp, sizeof(e->addr)) == 0 && (ttl == e->ttl || ttl == 0 || e->ttl == 0)) return e; return NULL; } static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) { list_del(&e->list); list_del(&e->lru_list); kfree(e); t->entries--; } /* * Drop entries with timestamps older then 'time'. 
*/ static void recent_entry_reap(struct recent_table *t, unsigned long time, struct recent_entry *working, bool update) { struct recent_entry *e; /* * The head of the LRU list is always the oldest entry. */ e = list_entry(t->lru_list.next, struct recent_entry, lru_list); /* * Do not reap the entry which are going to be updated. */ if (e == working && update) return; /* * The last time stamp is the most recent. */ if (time_after(time, e->stamps[e->index-1])) recent_entry_remove(t, e); } static struct recent_entry * recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr, u_int16_t family, u_int8_t ttl) { struct recent_entry *e; unsigned int nstamps_max = t->nstamps_max_mask; if (t->entries >= ip_list_tot) { e = list_entry(t->lru_list.next, struct recent_entry, lru_list); recent_entry_remove(t, e); } nstamps_max += 1; e = kmalloc(struct_size(e, stamps, nstamps_max), GFP_ATOMIC); if (e == NULL) return NULL; memcpy(&e->addr, addr, sizeof(e->addr)); e->ttl = ttl; e->stamps[0] = jiffies; e->nstamps = 1; e->index = 1; e->family = family; if (family == NFPROTO_IPV4) list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]); else list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]); list_add_tail(&e->lru_list, &t->lru_list); t->entries++; return e; } static void recent_entry_update(struct recent_table *t, struct recent_entry *e) { e->index &= t->nstamps_max_mask; e->stamps[e->index++] = jiffies; if (e->index > e->nstamps) e->nstamps = e->index; list_move_tail(&e->lru_list, &t->lru_list); } static struct recent_table *recent_table_lookup(struct recent_net *recent_net, const char *name) { struct recent_table *t; list_for_each_entry(t, &recent_net->tables, list) if (!strcmp(t->name, name)) return t; return NULL; } static void recent_table_flush(struct recent_table *t) { struct recent_entry *e, *next; unsigned int i; for (i = 0; i < ip_list_hash_size; i++) list_for_each_entry_safe(e, next, &t->iphash[i], list) recent_entry_remove(t, e); } static 
bool recent_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); struct recent_net *recent_net = recent_pernet(net); const struct xt_recent_mtinfo_v1 *info = par->matchinfo; struct recent_table *t; struct recent_entry *e; union nf_inet_addr addr = {}, addr_mask; u_int8_t ttl; bool ret = info->invert; if (xt_family(par) == NFPROTO_IPV4) { const struct iphdr *iph = ip_hdr(skb); if (info->side == XT_RECENT_DEST) addr.ip = iph->daddr; else addr.ip = iph->saddr; ttl = iph->ttl; } else { const struct ipv6hdr *iph = ipv6_hdr(skb); if (info->side == XT_RECENT_DEST) memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6)); else memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6)); ttl = iph->hop_limit; } /* use TTL as seen before forwarding */ if (xt_out(par) != NULL && (!skb->sk || !net_eq(net, sock_net(skb->sk)))) ttl++; spin_lock_bh(&recent_lock); t = recent_table_lookup(recent_net, info->name); nf_inet_addr_mask(&addr, &addr_mask, &t->mask); e = recent_entry_lookup(t, &addr_mask, xt_family(par), (info->check_set & XT_RECENT_TTL) ? 
ttl : 0); if (e == NULL) { if (!(info->check_set & XT_RECENT_SET)) goto out; e = recent_entry_init(t, &addr_mask, xt_family(par), ttl); if (e == NULL) par->hotdrop = true; ret = !ret; goto out; } if (info->check_set & XT_RECENT_SET) ret = !ret; else if (info->check_set & XT_RECENT_REMOVE) { recent_entry_remove(t, e); ret = !ret; } else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) { unsigned long time = jiffies - info->seconds * HZ; unsigned int i, hits = 0; for (i = 0; i < e->nstamps; i++) { if (info->seconds && time_after(time, e->stamps[i])) continue; if (!info->hit_count || ++hits >= info->hit_count) { ret = !ret; break; } } /* info->seconds must be non-zero */ if (info->check_set & XT_RECENT_REAP) recent_entry_reap(t, time, e, info->check_set & XT_RECENT_UPDATE && ret); } if (info->check_set & XT_RECENT_SET || (info->check_set & XT_RECENT_UPDATE && ret)) { recent_entry_update(t, e); e->ttl = ttl; } out: spin_unlock_bh(&recent_lock); return ret; } static void recent_table_free(void *addr) { kvfree(addr); } static int recent_mt_check(const struct xt_mtchk_param *par, const struct xt_recent_mtinfo_v1 *info) { struct recent_net *recent_net = recent_pernet(par->net); struct recent_table *t; #ifdef CONFIG_PROC_FS struct proc_dir_entry *pde; kuid_t uid; kgid_t gid; #endif unsigned int nstamp_mask; unsigned int i; int ret = -EINVAL; net_get_random_once(&hash_rnd, sizeof(hash_rnd)); if (info->check_set & ~XT_RECENT_VALID_FLAGS) { pr_info_ratelimited("Unsupported userspace flags (%08x)\n", info->check_set); return -EINVAL; } if (hweight8(info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE | XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1) return -EINVAL; if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) && (info->seconds || info->hit_count || (info->check_set & XT_RECENT_MODIFIERS))) return -EINVAL; if ((info->check_set & XT_RECENT_REAP) && !info->seconds) return -EINVAL; if (info->hit_count >= XT_RECENT_MAX_NSTAMPS) { pr_info_ratelimited("hitcount (%u) 
is larger than allowed maximum (%u)\n", info->hit_count, XT_RECENT_MAX_NSTAMPS - 1); return -EINVAL; } ret = xt_check_proc_name(info->name, sizeof(info->name)); if (ret) return ret; if (ip_pkt_list_tot && info->hit_count < ip_pkt_list_tot) nstamp_mask = roundup_pow_of_two(ip_pkt_list_tot) - 1; else if (info->hit_count) nstamp_mask = roundup_pow_of_two(info->hit_count) - 1; else nstamp_mask = 32 - 1; mutex_lock(&recent_mutex); t = recent_table_lookup(recent_net, info->name); if (t != NULL) { if (nstamp_mask > t->nstamps_max_mask) { spin_lock_bh(&recent_lock); recent_table_flush(t); t->nstamps_max_mask = nstamp_mask; spin_unlock_bh(&recent_lock); } t->refcnt++; ret = 0; goto out; } t = kvzalloc(struct_size(t, iphash, ip_list_hash_size), GFP_KERNEL); if (t == NULL) { ret = -ENOMEM; goto out; } t->refcnt = 1; t->nstamps_max_mask = nstamp_mask; memcpy(&t->mask, &info->mask, sizeof(t->mask)); strcpy(t->name, info->name); INIT_LIST_HEAD(&t->lru_list); for (i = 0; i < ip_list_hash_size; i++) INIT_LIST_HEAD(&t->iphash[i]); #ifdef CONFIG_PROC_FS uid = make_kuid(&init_user_ns, ip_list_uid); gid = make_kgid(&init_user_ns, ip_list_gid); if (!uid_valid(uid) || !gid_valid(gid)) { recent_table_free(t); ret = -EINVAL; goto out; } pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent, &recent_mt_proc_ops, t); if (pde == NULL) { recent_table_free(t); ret = -ENOMEM; goto out; } proc_set_user(pde, uid, gid); #endif spin_lock_bh(&recent_lock); list_add_tail(&t->list, &recent_net->tables); spin_unlock_bh(&recent_lock); ret = 0; out: mutex_unlock(&recent_mutex); return ret; } static int recent_mt_check_v0(const struct xt_mtchk_param *par) { const struct xt_recent_mtinfo_v0 *info_v0 = par->matchinfo; struct xt_recent_mtinfo_v1 info_v1; /* Copy revision 0 structure to revision 1 */ memcpy(&info_v1, info_v0, sizeof(struct xt_recent_mtinfo)); /* Set default mask to ensure backward compatible behaviour */ memset(info_v1.mask.all, 0xFF, sizeof(info_v1.mask.all)); return 
recent_mt_check(par, &info_v1); } static int recent_mt_check_v1(const struct xt_mtchk_param *par) { return recent_mt_check(par, par->matchinfo); } static void recent_mt_destroy(const struct xt_mtdtor_param *par) { struct recent_net *recent_net = recent_pernet(par->net); const struct xt_recent_mtinfo_v1 *info = par->matchinfo; struct recent_table *t; mutex_lock(&recent_mutex); t = recent_table_lookup(recent_net, info->name); if (--t->refcnt == 0) { spin_lock_bh(&recent_lock); list_del(&t->list); spin_unlock_bh(&recent_lock); #ifdef CONFIG_PROC_FS if (recent_net->xt_recent != NULL) remove_proc_entry(t->name, recent_net->xt_recent); #endif recent_table_flush(t); recent_table_free(t); } mutex_unlock(&recent_mutex); } #ifdef CONFIG_PROC_FS struct recent_iter_state { const struct recent_table *table; unsigned int bucket; }; static void *recent_seq_start(struct seq_file *seq, loff_t *pos) __acquires(recent_lock) { struct recent_iter_state *st = seq->private; const struct recent_table *t = st->table; struct recent_entry *e; loff_t p = *pos; spin_lock_bh(&recent_lock); for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) list_for_each_entry(e, &t->iphash[st->bucket], list) if (p-- == 0) return e; return NULL; } static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct recent_iter_state *st = seq->private; const struct recent_table *t = st->table; const struct recent_entry *e = v; const struct list_head *head = e->list.next; (*pos)++; while (head == &t->iphash[st->bucket]) { if (++st->bucket >= ip_list_hash_size) return NULL; head = t->iphash[st->bucket].next; } return list_entry(head, struct recent_entry, list); } static void recent_seq_stop(struct seq_file *s, void *v) __releases(recent_lock) { spin_unlock_bh(&recent_lock); } static int recent_seq_show(struct seq_file *seq, void *v) { const struct recent_entry *e = v; struct recent_iter_state *st = seq->private; const struct recent_table *t = st->table; unsigned int i; i = (e->index - 
1) & t->nstamps_max_mask; if (e->family == NFPROTO_IPV4) seq_printf(seq, "src=%pI4 ttl: %u last_seen: %lu oldest_pkt: %u", &e->addr.ip, e->ttl, e->stamps[i], e->index); else seq_printf(seq, "src=%pI6 ttl: %u last_seen: %lu oldest_pkt: %u", &e->addr.in6, e->ttl, e->stamps[i], e->index); for (i = 0; i < e->nstamps; i++) seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); seq_putc(seq, '\n'); return 0; } static const struct seq_operations recent_seq_ops = { .start = recent_seq_start, .next = recent_seq_next, .stop = recent_seq_stop, .show = recent_seq_show, }; static int recent_seq_open(struct inode *inode, struct file *file) { struct recent_iter_state *st; st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); if (st == NULL) return -ENOMEM; st->table = pde_data(inode); return 0; } static ssize_t recent_mt_proc_write(struct file *file, const char __user *input, size_t size, loff_t *loff) { struct recent_table *t = pde_data(file_inode(file)); struct recent_entry *e; char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")]; const char *c = buf; union nf_inet_addr addr = {}; u_int16_t family; bool add, succ; if (size == 0) return 0; if (size > sizeof(buf)) size = sizeof(buf); if (copy_from_user(buf, input, size) != 0) return -EFAULT; /* Strict protocol! 
*/ if (*loff != 0) return -ESPIPE; switch (*c) { case '/': /* flush table */ spin_lock_bh(&recent_lock); recent_table_flush(t); spin_unlock_bh(&recent_lock); return size; case '-': /* remove address */ add = false; break; case '+': /* add address */ add = true; break; default: pr_info_ratelimited("Need \"+ip\", \"-ip\" or \"/\"\n"); return -EINVAL; } ++c; --size; if (strnchr(c, size, ':') != NULL) { family = NFPROTO_IPV6; succ = in6_pton(c, size, (void *)&addr, '\n', NULL); } else { family = NFPROTO_IPV4; succ = in4_pton(c, size, (void *)&addr, '\n', NULL); } if (!succ) return -EINVAL; spin_lock_bh(&recent_lock); e = recent_entry_lookup(t, &addr, family, 0); if (e == NULL) { if (add) recent_entry_init(t, &addr, family, 0); } else { if (add) recent_entry_update(t, e); else recent_entry_remove(t, e); } spin_unlock_bh(&recent_lock); /* Note we removed one above */ *loff += size + 1; return size + 1; } static const struct proc_ops recent_mt_proc_ops = { .proc_open = recent_seq_open, .proc_read = seq_read, .proc_write = recent_mt_proc_write, .proc_release = seq_release_private, .proc_lseek = seq_lseek, }; static int __net_init recent_proc_net_init(struct net *net) { struct recent_net *recent_net = recent_pernet(net); recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net); if (!recent_net->xt_recent) return -ENOMEM; return 0; } static void __net_exit recent_proc_net_exit(struct net *net) { struct recent_net *recent_net = recent_pernet(net); struct recent_table *t; /* recent_net_exit() is called before recent_mt_destroy(). Make sure * that the parent xt_recent proc entry is empty before trying to * remove it. 
*/ spin_lock_bh(&recent_lock); list_for_each_entry(t, &recent_net->tables, list) remove_proc_entry(t->name, recent_net->xt_recent); recent_net->xt_recent = NULL; spin_unlock_bh(&recent_lock); remove_proc_entry("xt_recent", net->proc_net); } #else static inline int recent_proc_net_init(struct net *net) { return 0; } static inline void recent_proc_net_exit(struct net *net) { } #endif /* CONFIG_PROC_FS */ static int __net_init recent_net_init(struct net *net) { struct recent_net *recent_net = recent_pernet(net); INIT_LIST_HEAD(&recent_net->tables); return recent_proc_net_init(net); } static void __net_exit recent_net_exit(struct net *net) { recent_proc_net_exit(net); } static struct pernet_operations recent_net_ops = { .init = recent_net_init, .exit = recent_net_exit, .id = &recent_net_id, .size = sizeof(struct recent_net), }; static struct xt_match recent_mt_reg[] __read_mostly = { { .name = "recent", .revision = 0, .family = NFPROTO_IPV4, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo), .checkentry = recent_mt_check_v0, .destroy = recent_mt_destroy, .me = THIS_MODULE, }, { .name = "recent", .revision = 0, .family = NFPROTO_IPV6, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo), .checkentry = recent_mt_check_v0, .destroy = recent_mt_destroy, .me = THIS_MODULE, }, { .name = "recent", .revision = 1, .family = NFPROTO_IPV4, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo_v1), .checkentry = recent_mt_check_v1, .destroy = recent_mt_destroy, .me = THIS_MODULE, }, { .name = "recent", .revision = 1, .family = NFPROTO_IPV6, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo_v1), .checkentry = recent_mt_check_v1, .destroy = recent_mt_destroy, .me = THIS_MODULE, } }; static int __init recent_mt_init(void) { int err; BUILD_BUG_ON_NOT_POWER_OF_2(XT_RECENT_MAX_NSTAMPS); if (!ip_list_tot || ip_pkt_list_tot >= XT_RECENT_MAX_NSTAMPS) return -EINVAL; ip_list_hash_size = 1 << fls(ip_list_tot); err = 
register_pernet_subsys(&recent_net_ops); if (err) return err; err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); if (err) unregister_pernet_subsys(&recent_net_ops); return err; } static void __exit recent_mt_exit(void) { xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); unregister_pernet_subsys(&recent_net_ops); } module_init(recent_mt_init); module_exit(recent_mt_exit);
2555 2553 2552 962 929 829 829 829 1399 4 1393 727 28866 25411 27190 22755 1954 15086 28859 28753 15052 91 94 91 93 2 94 67 95 92 29910 28826 1415 719 25346 29890 617 89 25171 8218 28789 25181 29824 9 12684 29886 898 1831 29877 27 685 90 27 616 72 29902 760 8601 29915 2 4 29979 12047 29955 29916 7 65 7 9104 3 1 9105 29499 29151 9102 145 11 145 29254 29467 10797 29230 2 2 11 11 29381 3 29449 175 173 174 64 63 4 2 2 2 4 4 4 2 4 4 2 2 2369 2369 12 218 18 14 89 115 219 22 1 4 18 22 19 22 18 22 22 18 18 41 41 20 49 8 50 50 1 21 14 14 14 14 266 268 268 268 273 273 273 269 273 270 271 270 25 203 4 42 129 199 76 50 33 17 323 50 272 323 191 193 511 322 193 4508 1334 1297 38 1335 1335 1335 7 428 428 428 428 428 428 428 428 428 428 428 428 8 8 8 8 8 8 8 8 5960 742 22 49 41 1335 4508 63 7 2 428 2 2368 8 1 2 110 31240 35018 920 5 1080 35089 34435 31240 26185 25291 31183 923 31261 587 1080 31164 27398 29571 29902 1 1079 31421 31481 14370 30752 29611 27388 924 1080 12951 1 843 12967 45 12938 29121 5893 5907 1 1442 8682 31408 11961 13668 14042 2 16342 2451 2453 53 15555 9961 9963 1 79 8179 1 1 1 298 69 6 5 80 3 14 3 1450 8824 8 2422 1886 2408 284 3 1952 1810 8 8 1815 122 63 286 3 3 209 209 1411 1688 5 1689 2 1676 1399 243 3 100 4 1394 138 1637 2422 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 
714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 
1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 
1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 
1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 
2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 
2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 
3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 
3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/vsprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */ /* * Wirzenius wrote this portably, Torvalds fucked it up :-) */ /* * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com> * - changed to provide snprintf and vsnprintf functions * So Feb 1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de> * - scnprintf and vscnprintf */ #include <linux/stdarg.h> #include <linux/build_bug.h> #include <linux/clk.h> #include <linux/clk-provider.h> #include <linux/errname.h> #include <linux/module.h> /* for KSYM_SYMBOL_LEN */ #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/kernel.h> #include <linux/kallsyms.h> #include <linux/math64.h> #include <linux/uaccess.h> #include <linux/ioport.h> #include <linux/dcache.h> #include <linux/cred.h> #include <linux/rtc.h> #include <linux/sprintf.h> #include <linux/time.h> #include <linux/uuid.h> #include <linux/of.h> #include <net/addrconf.h> #include <linux/siphash.h> #include <linux/compiler.h> #include <linux/property.h> #include <linux/notifier.h> #ifdef CONFIG_BLOCK #include <linux/blkdev.h> #endif #include "../mm/internal.h" /* For the trace_print_flags arrays */ #include <asm/page.h> /* for PAGE_SIZE */ #include <asm/byteorder.h> /* cpu_to_le16 */ #include <linux/unaligned.h> #include <linux/string_helpers.h> #include "kstrtox.h" /* Disable pointer hashing if requested */ bool no_hash_pointers 
__ro_after_init; EXPORT_SYMBOL_GPL(no_hash_pointers); noinline static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars) { const char *cp; unsigned long long result = 0ULL; size_t prefix_chars; unsigned int rv; cp = _parse_integer_fixup_radix(startp, &base); prefix_chars = cp - startp; if (prefix_chars < max_chars) { rv = _parse_integer_limit(cp, base, &result, max_chars - prefix_chars); /* FIXME */ cp += (rv & ~KSTRTOX_OVERFLOW); } else { /* Field too short for prefix + digit, skip over without converting */ cp = startp + max_chars; } if (endp) *endp = (char *)cp; return result; } /** * simple_strtoull - convert a string to an unsigned long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoull instead. */ noinline unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { return simple_strntoull(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoull); /** * simple_strtoul - convert a string to an unsigned long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoul instead. */ unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base) { return simple_strtoull(cp, endp, base); } EXPORT_SYMBOL(simple_strtoul); unsigned long simple_strntoul(const char *cp, char **endp, unsigned int base, size_t max_chars) { return simple_strntoull(cp, endp, base, max_chars); } EXPORT_SYMBOL(simple_strntoul); /** * simple_strtol - convert a string to a signed long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtol instead. 
*/ long simple_strtol(const char *cp, char **endp, unsigned int base) { if (*cp == '-') return -simple_strtoul(cp + 1, endp, base); return simple_strtoul(cp, endp, base); } EXPORT_SYMBOL(simple_strtol); noinline static long long simple_strntoll(const char *cp, char **endp, unsigned int base, size_t max_chars) { /* * simple_strntoull() safely handles receiving max_chars==0 in the * case cp[0] == '-' && max_chars == 1. * If max_chars == 0 we can drop through and pass it to simple_strntoull() * and the content of *cp is irrelevant. */ if (*cp == '-' && max_chars > 0) return -simple_strntoull(cp + 1, endp, base, max_chars - 1); return simple_strntoull(cp, endp, base, max_chars); } /** * simple_strtoll - convert a string to a signed long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoll instead. */ long long simple_strtoll(const char *cp, char **endp, unsigned int base) { return simple_strntoll(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoll); static inline int skip_atoi(const char **s) { int i = 0; do { i = i*10 + *((*s)++) - '0'; } while (isdigit(**s)); return i; } /* * Decimal conversion is by far the most typical, and is used for * /proc and /sys data. This directly impacts e.g. top performance * with many processes running. We optimize it for speed by emitting * two characters at a time, using a 200 byte lookup table. This * roughly halves the number of multiplications compared to computing * the digits one at a time. Implementation strongly inspired by the * previous version, which in turn used ideas described at * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission * from the author, Douglas W. Jones). * * It turns out there is precisely one 26 bit fixed-point * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. 
The actual
 * range happens to be somewhat larger (x <= 1073741898), but that's
 * irrelevant for our purpose.
 *
 * For dividing a number in the range [10^4, 10^6-1] by 100, we still
 * need a 32x32->64 bit multiply, so we simply use the same constant.
 *
 * For dividing a number in the range [100, 10^4-1] by 100, there are
 * several options. The simplest is (x * 0x147b) >> 19, which is valid
 * for all x <= 43698.
 */

/*
 * decpair[n] (0 <= n <= 99) is the two-character ASCII rendering of n,
 * packed into one u16 so a single 16-bit store emits both digits.
 *
 * The value built for each entry carries '0' + n % 10 in bits 0-7 and
 * '0' + n / 10 in bits 8-15 (adding 0x3030 adds ASCII '0' to both bytes).
 * cpu_to_le16() fixes that as the in-memory byte order, so the ones digit
 * lands at the lower address and the tens digit right after it: the pair
 * is stored least significant digit first.
 *
 * The helper macro _() exists only for the duration of the initializer.
 */
static const u16 decpair[100] = {
#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
	_( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
	_(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
	_(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
	_(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
	_(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
	_(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
	_(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
	_(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
	_(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
	_(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
#undef _
};

/*
 * This will print a single '0' even if r == 0, since we would
 * immediately jump to out_r where two 0s would be written but only
 * one of them accounted for in buf. This is needed by ip4_string
 * below. All other callers pass a non-zero value of r.
*/
static noinline_for_stack
char *put_dec_trunc8(char *buf, unsigned r)
{
	unsigned q;

	/*
	 * Digits are produced two at a time through decpair[] and stored
	 * with 16-bit writes, lowest pair first, so the text in buf is in
	 * reverse order. Each step divides by 100 with a multiply/shift and
	 * uses the remainder (x - 100 * quotient) as the table index.
	 * Returns the position just past the last digit that counts.
	 */

	/* 1 <= r < 10^8 */
	if (r < 100)
		goto out_r;

	/* 100 <= r < 10^8 */
	q = (r * (u64)0x28f5c29) >> 32;
	*((u16 *)buf) = decpair[r - 100*q];
	buf += 2;

	/* 1 <= q < 10^6 */
	if (q < 100)
		goto out_q;

	/*  100 <= q < 10^6 */
	r = (q * (u64)0x28f5c29) >> 32;
	*((u16 *)buf) = decpair[q - 100*r];
	buf += 2;

	/* 1 <= r < 10^4 */
	if (r < 100)
		goto out_r;

	/* 100 <= r < 10^4 */
	q = (r * 0x147b) >> 19;
	*((u16 *)buf) = decpair[r - 100*q];
	buf += 2;
out_q:
	/* 1 <= q < 100 */
	r = q;
out_r:
	/* 1 <= r < 100 */
	/*
	 * Both bytes of the pair are always written, but for r < 10 only
	 * the ones digit is counted, which drops the would-be leading '0'.
	 */
	*((u16 *)buf) = decpair[r];
	buf += r < 10 ? 1 : 2;
	return buf;
}

#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
/*
 * Same scheme as put_dec_trunc8(), but without the early exits: exactly
 * eight characters are stored (four decpair[] entries), so small values
 * come out zero-filled. Used for the lower 8-digit groups of a larger
 * number. Returns buf + 8.
 */
static noinline_for_stack
char *put_dec_full8(char *buf, unsigned r)
{
	unsigned q;

	/* 0 <= r < 10^8 */
	q = (r * (u64)0x28f5c29) >> 32;
	*((u16 *)buf) = decpair[r - 100*q];
	buf += 2;

	/* 0 <= q < 10^6 */
	r = (q * (u64)0x28f5c29) >> 32;
	*((u16 *)buf) = decpair[q - 100*r];
	buf += 2;

	/* 0 <= r < 10^4 */
	q = (r * 0x147b) >> 19;
	*((u16 *)buf) = decpair[r - 100*q];
	buf += 2;

	/* 0 <= q < 100 */
	*((u16 *)buf) = decpair[q];
	buf += 2;
	return buf;
}

/*
 * 64-bit variant: peel off up to two full 8-digit groups from the low end
 * (do_div() divides n in place by 10^8 and evaluates to the remainder),
 * then let put_dec_trunc8() emit what is left without leading zeros.
 * Output is in reverse digit order; returns the position past the last
 * digit written.
 */
static noinline_for_stack
char *put_dec(char *buf, unsigned long long n)
{
	if (n >= 100*1000*1000)
		buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
	/* 1 <= n <= 1.6e11 */
	if (n >= 100*1000*1000)
		buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
	/* 1 <= n < 1e8 */
	return put_dec_trunc8(buf, n);
}

#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64

/*
 * Store exactly four characters for r (two decpair[] entries, lowest pair
 * first), zero-filled. The local advance of buf is not returned.
 */
static void
put_dec_full4(char *buf, unsigned r)
{
	unsigned q;

	/* 0 <= r < 10^4 */
	q = (r * 0x147b) >> 19;
	*((u16 *)buf) = decpair[r - 100*q];
	buf += 2;
	/* 0 <= q < 100 */
	*((u16 *)buf) = decpair[q];
}

/*
 * Call put_dec_full4 on x % 10000, return x / 10000.
 * The approximation x/10000 == (x * 0x346DC5D7) >> 43
 * holds for all x < 1,128,869,999.  The largest value this
 * helper will ever be asked to convert is 1,125,520,955.
 * (second call in the put_dec code, assuming n is all-ones).
*/
static noinline_for_stack
unsigned put_dec_helper4(char *buf, unsigned x)
{
	/* q = x / 10000 by multiply/shift; the remainder goes to buf[0..3] */
	uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;

	put_dec_full4(buf, x - q * 10000);
	return q;
}

/* Based on code by Douglas W. Jones found at
 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
 * (with permission from the author).
 * Performs no 64-bit division and hence should be fast on 32-bit machines.
 */
static
char *put_dec(char *buf, unsigned long long n)
{
	uint32_t d3, d2, d1, q, h;

	if (n < 100*1000*1000)
		return put_dec_trunc8(buf, n);

	/* split n into four 16-bit limbs d3:d2:d1:d0 (d0 is used inline below) */
	d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
	h   = (n >> 32);
	d2  = (h      ) & 0xffff;
	d3  = (h >> 16); /* implicit "& 0xffff" */

	/* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
	     = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
	/*
	 * Each step sums the limb contributions to one group of four decimal
	 * digits; put_dec_helper4() stores that group and hands back the
	 * carry into the next one.
	 */
	q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
	q = put_dec_helper4(buf, q);

	q += 7671 * d3 + 9496 * d2 + 6 * d1;
	q = put_dec_helper4(buf+4, q);

	q += 4749 * d3 + 42 * d2;
	q = put_dec_helper4(buf+8, q);

	q += 281 * d3;
	buf += 12;
	if (q)
		buf = put_dec_trunc8(buf, q);
	/*
	 * Nothing above the 12 digits already stored: back up over the '0'
	 * characters at the high end, which would be leading zeros once the
	 * reversed text is turned around.
	 */
	else while (buf[-1] == '0')
		--buf;

	return buf;
}

#endif

/*
 * Convert passed number to decimal string.
 * Returns the length of string.  On buffer overflow, returns 0.
 *
 * The digits are right-aligned in a field of @width characters, padded on
 * the left with spaces; @width is ignored when the number is longer.
 * 0 is returned when either the digit count or @width exceeds @size.
 * No terminating NUL is written.
 *
 * If speed is not important, use snprintf(). It's easy to read the code.
 */
int num_to_str(char *buf, int size, unsigned long long num, unsigned int width)
{
	/* put_dec requires 2-byte alignment of the buffer. */
	char tmp[sizeof(num) * 3] __aligned(2);
	int idx, len;

	/* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
	if (num <= 9) {
		tmp[0] = '0' + num;
		len = 1;
	} else {
		len = put_dec(tmp, num) - tmp;
	}

	if (len > size || width > size)
		return 0;

	if (width > len) {
		/* from here on, width holds the number of pad characters */
		width = width - len;
		for (idx = 0; idx < width; idx++)
			buf[idx] = ' ';
	} else {
		width = 0;
	}

	/* tmp[] holds the digits in reverse order; copy them out back to front */
	for (idx = 0; idx < len; ++idx)
		buf[idx + width] = tmp[len - idx - 1];

	return len + width;
}

/* Bits for printf_spec.flags. */
#define SIGN	1		/* unsigned/signed */
#define LEFT	2		/* left justified */
#define PLUS	4		/* show plus */
#define SPACE	8		/* space if plus */
#define ZEROPAD	16		/* pad with zero, must be 16 == '0' - ' ' */
#define SMALL	32		/* use lowercase in hex (must be 32 == 0x20) */
#define SPECIAL	64		/* prefix hex with "0x", octal with "0" */

/* number() adds ZEROPAD to ' ' and ORs SMALL into letters; pin both values. */
static_assert(ZEROPAD == ('0' - ' '));
static_assert(SMALL == ('a' ^ 'A'));

/*
 * NOTE(review): the code using these states is not in this part of the
 * file; judging by the names they classify a piece of a format string.
 */
enum format_state {
	FORMAT_STATE_NONE, /* Just a string part */
	FORMAT_STATE_NUM,
	FORMAT_STATE_WIDTH,
	FORMAT_STATE_PRECISION,
	FORMAT_STATE_CHAR,
	FORMAT_STATE_STR,
	FORMAT_STATE_PTR,
	FORMAT_STATE_PERCENT_CHAR,
	FORMAT_STATE_INVALID,
};

/*
 * Formatting parameters handed (by value) to the output helpers.
 * Packed; the static_assert below pins the size at 8 bytes.
 */
struct printf_spec {
	unsigned char	flags;		/* flags to number() */
	unsigned char	base;		/* number base, 8, 10 or 16 only */
	short		precision;	/* # of digits/chars */
	int		field_width;	/* width of output field */
} __packed;
static_assert(sizeof(struct printf_spec) == 8);

/* NOTE(review): presumably the caps applied when a format is parsed - users not visible here */
#define FIELD_WIDTH_MAX ((1 << 23) - 1)
#define PRECISION_MAX ((1 << 15) - 1)

/*
 * Format @num into [@buf, @end) according to @spec.
 *
 * Characters are stored only while buf < end, but buf is advanced for
 * every character regardless, so the return value is where the output
 * would have ended in an unbounded buffer.
 *
 * Emission order: leading spaces (unless ZEROPAD or LEFT), sign,
 * "0x"/"0" prefix, zero/space padding (unless LEFT), extra zeros up to
 * the precision, the digits, trailing spaces.
 */
static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
	     struct printf_spec spec)
{
	/* put_dec requires 2-byte alignment of the buffer. */
	char tmp[3 * sizeof(num)] __aligned(2);
	char sign;
	char locase;
	int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
	int i;
	bool is_zero = num == 0LL;
	int field_width = spec.field_width;
	int precision = spec.precision;

	/* locase = 0 or 0x20. ORing digits or letters with 'locase'
	 * produces same digits or (maybe lowercased) letters */
	locase = (spec.flags & SMALL);
	/* left justification wins over zero padding */
	if (spec.flags & LEFT)
		spec.flags &= ~ZEROPAD;
	sign = 0;
	if (spec.flags & SIGN) {
		if ((signed long long)num < 0) {
			sign = '-';
			num = -(signed long long)num;
			field_width--;
		} else if (spec.flags & PLUS) {
			sign = '+';
			field_width--;
		} else if (spec.flags & SPACE) {
			sign = ' ';
			field_width--;
		}
	}
	/* reserve room for the prefix: "0x" for hex, "0" for non-zero octal */
	if (need_pfx) {
		if (spec.base == 16)
			field_width -= 2;
		else if (!is_zero)
			field_width--;
	}

	/* generate full string in tmp[], in reverse order */
	i = 0;
	if (num < spec.base)
		tmp[i++] = hex_asc_upper[num] | locase;
	else if (spec.base != 10) { /* 8 or 16 */
		int mask = spec.base - 1;
		int shift = 3;

		if (spec.base == 16)
			shift = 4;
		do {
			tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
			num >>= shift;
		} while (num);
	} else { /* base 10 */
		i = put_dec(tmp, num) - tmp;
	}

	/* printing 100 using %2d gives "100", not "00" */
	if (i > precision)
		precision = i;
	/* leading space padding */
	field_width -= precision;
	if (!(spec.flags & (ZEROPAD | LEFT))) {
		while (--field_width >= 0) {
			if (buf < end)
				*buf = ' ';
			++buf;
		}
	}
	/* sign */
	if (sign) {
		if (buf < end)
			*buf = sign;
		++buf;
	}
	/* "0x" / "0" prefix */
	if (need_pfx) {
		if (spec.base == 16 || !is_zero) {
			if (buf < end)
				*buf = '0';
			++buf;
		}
		if (spec.base == 16) {
			if (buf < end)
				*buf = ('X' | locase);
			++buf;
		}
	}
	/* zero or space padding */
	if (!(spec.flags & LEFT)) {
		/* ' ' + 16 == '0', see the static_assert on ZEROPAD */
		char c = ' ' + (spec.flags & ZEROPAD);

		while (--field_width >= 0) {
			if (buf < end)
				*buf = c;
			++buf;
		}
	}
	/* hmm even more zero padding? */
	while (i <= --precision) {
		if (buf < end)
			*buf = '0';
		++buf;
	}
	/* actual digits of result */
	while (--i >= 0) {
		if (buf < end)
			*buf = tmp[i];
		++buf;
	}
	/* trailing space padding */
	while (--field_width >= 0) {
		if (buf < end)
			*buf = ' ';
		++buf;
	}

	return buf;
}

/*
 * Print @num as lowercase hex with a "0x" prefix, zero-padded to
 * 2 * @size hex digits (so @size is a width in bytes).
 */
static noinline_for_stack
char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
{
	struct printf_spec spec;

	spec.field_width = 2 + 2 * size;	/* 0x + hex */
	spec.flags = SPECIAL | SMALL | ZEROPAD;
	spec.base = 16;
	spec.precision = -1;

	return number(buf, end, num, spec);
}

/*
 * Shift the @len bytes at @buf to the right by @spaces positions and fill
 * the gap with blanks, never touching memory at or beyond @end: whatever
 * would not fit is dropped, and if even the padding does not fit the
 * whole remaining buffer is filled with blanks.
 */
static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
{
	size_t size;
	if (buf >= end)	/* nowhere to put anything */
		return;
	size = end - buf;
	if (size <= spaces) {
		memset(buf, ' ', size);
		return;
	}
	if (len) {
		if (len > size - spaces)
			len = size - spaces;
		memmove(buf + spaces, buf, len);
	}
	memset(buf, ' ', spaces);
}

/*
 * Handle field width padding for a string.
 * @buf: current buffer position
 * @n: length of string
 * @end: end of output buffer
 * @spec: for field width and flags
 * Returns: new buffer position after padding.
 *
 * Without LEFT the @n characters that end at @buf are moved right and the
 * blanks go in front of them; with LEFT the blanks are appended.
 */
static noinline_for_stack
char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
{
	unsigned spaces;

	if (likely(n >= spec.field_width))
		return buf;
	/* we want to pad the sucker */
	spaces = spec.field_width - n;
	if (!(spec.flags & LEFT)) {
		move_right(buf - n, end, n, spaces);
		return buf + spaces;
	}
	while (spaces--) {
		if (buf < end)
			*buf = ' ';
		++buf;
	}
	return buf;
}

/* Handle string from a well known address.
*/ static char *string_nocheck(char *buf, char *end, const char *s, struct printf_spec spec) { int len = 0; int lim = spec.precision; while (lim--) { char c = *s++; if (!c) break; if (buf < end) *buf = c; ++buf; ++len; } return widen_string(buf, len, end, spec); } static char *err_ptr(char *buf, char *end, void *ptr, struct printf_spec spec) { int err = PTR_ERR(ptr); const char *sym = errname(err); if (sym) return string_nocheck(buf, end, sym, spec); /* * Somebody passed ERR_PTR(-1234) or some other non-existing * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to * printing it as its decimal representation. */ spec.flags |= SIGN; spec.base = 10; return number(buf, end, err, spec); } /* Be careful: error messages must fit into the given buffer. */ static char *error_string(char *buf, char *end, const char *s, struct printf_spec spec) { /* * Hard limit to avoid a completely insane messages. It actually * works pretty well because most error messages are in * the many pointer format modifiers. */ if (spec.precision == -1) spec.precision = 2 * sizeof(void *); return string_nocheck(buf, end, s, spec); } /* * Do not call any complex external code here. Nested printk()/vsprintf() * might cause infinite loops. Failures might break printk() and would * be hard to debug. 
*/ static const char *check_pointer_msg(const void *ptr) { if (!ptr) return "(null)"; if ((unsigned long)ptr < PAGE_SIZE || IS_ERR_VALUE(ptr)) return "(efault)"; return NULL; } static int check_pointer(char **buf, char *end, const void *ptr, struct printf_spec spec) { const char *err_msg; err_msg = check_pointer_msg(ptr); if (err_msg) { *buf = error_string(*buf, end, err_msg, spec); return -EFAULT; } return 0; } static noinline_for_stack char *string(char *buf, char *end, const char *s, struct printf_spec spec) { if (check_pointer(&buf, end, s, spec)) return buf; return string_nocheck(buf, end, s, spec); } static char *pointer_string(char *buf, char *end, const void *ptr, struct printf_spec spec) { spec.base = 16; spec.flags |= SMALL; if (spec.field_width == -1) { spec.field_width = 2 * sizeof(ptr); spec.flags |= ZEROPAD; } return number(buf, end, (unsigned long int)ptr, spec); } /* Make pointers available for printing early in the boot sequence. */ static int debug_boot_weak_hash __ro_after_init; static int __init debug_boot_weak_hash_enable(char *str) { debug_boot_weak_hash = 1; pr_info("debug_boot_weak_hash enabled\n"); return 0; } early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); static bool filled_random_ptr_key __read_mostly; static siphash_key_t ptr_key __read_mostly; static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data) { get_random_bytes(&ptr_key, sizeof(ptr_key)); /* Pairs with smp_rmb() before reading ptr_key. */ smp_wmb(); WRITE_ONCE(filled_random_ptr_key, true); return NOTIFY_DONE; } static int __init vsprintf_init_hashval(void) { static struct notifier_block fill_ptr_key_nb = { .notifier_call = fill_ptr_key }; execute_with_initialized_rng(&fill_ptr_key_nb); return 0; } subsys_initcall(vsprintf_init_hashval) /* Maps a pointer to a 32 bit unique identifier. 
*/
static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
	unsigned long hashval;

	/* -EBUSY until fill_ptr_key() has published the key */
	if (!READ_ONCE(filled_random_ptr_key))
		return -EBUSY;

	/* Pairs with smp_wmb() after writing ptr_key. */
	smp_rmb();

#ifdef CONFIG_64BIT
	hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
	/*
	 * Mask off the first 32 bits, this makes explicit that we have
	 * modified the address (and 32 bits is plenty for a unique ID).
	 */
	hashval = hashval & 0xffffffff;
#else
	hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
#endif
	*hashval_out = hashval;
	return 0;
}

/* Non-static wrapper around __ptr_to_hashval(); same return values. */
int ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
	return __ptr_to_hashval(ptr, hashval_out);
}

/*
 * Print a hashed stand-in for @ptr.
 *
 * NULL and error pointers are printed as they are. With
 * debug_boot_weak_hash set, hash_long() is used instead of the keyed
 * hash. If the keyed hash is not available yet, a fixed placeholder
 * string is printed instead.
 */
static char *ptr_to_id(char *buf, char *end, const void *ptr,
		       struct printf_spec spec)
{
	const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)";
	unsigned long hashval;
	int ret;

	/*
	 * Print the real pointer value for NULL and error pointers,
	 * as they are not actual addresses.
	 */
	if (IS_ERR_OR_NULL(ptr))
		return pointer_string(buf, end, ptr, spec);

	/* When debugging early boot use non-cryptographically secure hash. */
	if (unlikely(debug_boot_weak_hash)) {
		hashval = hash_long((unsigned long)ptr, 32);
		return pointer_string(buf, end, (const void *)hashval, spec);
	}

	ret = __ptr_to_hashval(ptr, &hashval);
	if (ret) {
		spec.field_width = 2 * sizeof(ptr);
		/* string length must be less than default_width */
		return error_string(buf, end, str, spec);
	}

	return pointer_string(buf, end, (const void *)hashval, spec);
}

/* Print @ptr hashed, or raw when no_hash_pointers is set. */
static char *default_pointer(char *buf, char *end, const void *ptr,
			     struct printf_spec spec)
{
	/*
	 * default is to _not_ leak addresses, so hash before printing,
	 * unless no_hash_pointers is specified on the command line.
	 */
	if (unlikely(no_hash_pointers))
		return pointer_string(buf, end, ptr, spec);

	return ptr_to_id(buf, end, ptr, spec);
}

/* Policy selector read by restricted_pointer(); see the switch there. */
int kptr_restrict __read_mostly;

/*
 * Print @ptr subject to kptr_restrict:
 *   0       - same as default_pointer()
 *   1       - the real value if the caller passes the capability and
 *             credential checks below, zeros otherwise; "pK-error" when
 *             called from hardirq, softirq or NMI context
 *   2, else - always zeros
 */
static noinline_for_stack
char *restricted_pointer(char *buf, char *end, const void *ptr,
			 struct printf_spec spec)
{
	switch (kptr_restrict) {
	case 0:
		/* Handle as %p, hash and do _not_ leak addresses. */
		return default_pointer(buf, end, ptr, spec);
	case 1: {
		const struct cred *cred;

		/*
		 * kptr_restrict==1 cannot be used in IRQ context
		 * because its test for CAP_SYSLOG would be meaningless.
		 */
		if (in_hardirq() || in_serving_softirq() || in_nmi()) {
			if (spec.field_width == -1)
				spec.field_width = 2 * sizeof(ptr);
			return error_string(buf, end, "pK-error", spec);
		}

		/*
		 * Only print the real pointer value if the current
		 * process has CAP_SYSLOG and is running with the
		 * same credentials it started with. This is because
		 * access to files is checked at open() time, but %pK
		 * checks permission at read() time. We don't want to
		 * leak pointer values if a binary opens a file using
		 * %pK and then elevates privileges before reading it.
		 */
		cred = current_cred();
		if (!has_capability_noaudit(current, CAP_SYSLOG) ||
		    !uid_eq(cred->euid, cred->uid) ||
		    !gid_eq(cred->egid, cred->gid))
			ptr = NULL;
		break;
	}
	case 2:
	default:
		/* Always print 0's for %pK */
		ptr = NULL;
		break;
	}

	return pointer_string(buf, end, ptr, spec);
}

/*
 * Print the name of dentry @d, optionally preceded by the names of up to
 * three ancestors separated by '/'. fmt[1] in '2'..'4' selects how many
 * path components to print; anything else means one. Names are collected
 * and copied under rcu_read_lock(); spec.precision limits the number of
 * characters produced.
 */
static noinline_for_stack
char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
		  const char *fmt)
{
	const char *array[4], *s;
	const struct dentry *p;
	int depth;
	int i, n;

	switch (fmt[1]) {
		case '2': case '3': case '4':
			depth = fmt[1] - '0';
			break;
		default:
			depth = 1;
	}

	rcu_read_lock();
	/* collect names leaf first: array[0] is @d itself */
	for (i = 0; i < depth; i++, d = p) {
		if (check_pointer(&buf, end, d, spec)) {
			rcu_read_unlock();
			return buf;
		}

		p = READ_ONCE(d->d_parent);
		array[i] = READ_ONCE(d->d_name.name);
		/* a dentry that is its own parent ends the walk */
		if (p == d) {
			if (i)
				array[i] = "";
			i++;
			break;
		}
	}
	/* emit from the outermost collected component down to the leaf */
	s = array[--i];
	for (n = 0; n != spec.precision; n++, buf++) {
		char c = *s++;
		if (!c) {
			if (!i)
				break;
			c = '/';
			s = array[--i];
		}
		if (buf < end)
			*buf = c;
	}
	rcu_read_unlock();
	return widen_string(buf, n, end, spec);
}

/* Like dentry_name(), for the dentry behind file @f (after checking @f). */
static noinline_for_stack
char *file_dentry_name(char *buf, char *end, const struct file *f,
			struct printf_spec spec, const char *fmt)
{
	if (check_pointer(&buf, end, f, spec))
		return buf;

	return dentry_name(buf, end, f->f_path.dentry, spec, fmt);
}
#ifdef CONFIG_BLOCK
/*
 * Print the disk name of @bdev; for a partition the partition number is
 * appended, separated by 'p' when the disk name itself ends in a digit.
 */
static noinline_for_stack
char *bdev_name(char *buf, char *end, struct block_device *bdev,
		struct printf_spec spec, const char *fmt)
{
	struct gendisk *hd;

	if (check_pointer(&buf, end, bdev, spec))
		return buf;

	hd = bdev->bd_disk;
	buf = string(buf, end, hd->disk_name, spec);
	if (bdev_is_partition(bdev)) {
		if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) {
			if (buf < end)
				*buf = 'p';
			buf++;
		}
		buf = number(buf, end, bdev_partno(bdev), spec);
	}
	return buf;
}
#endif

/*
 * Print @ptr symbolically. With CONFIG_KALLSYMS one of the sprint_*()
 * helpers is picked from the format characters:
 *   "Bb"          sprint_backtrace_build_id()
 *   "B"           sprint_backtrace()
 *   "Sb" / "SRb"  sprint_symbol_build_id()
 *   "s"           sprint_symbol_no_offset()
 *   anything else sprint_symbol()
 * If fmt[1] is 'R' the address first goes through
 * __builtin_extract_return_addr(). Without CONFIG_KALLSYMS the address is
 * printed as a "0x"-prefixed hex number.
 */
static noinline_for_stack
char *symbol_string(char *buf, char *end, void *ptr,
		    struct printf_spec spec, const char *fmt)
{
	unsigned long value;
#ifdef CONFIG_KALLSYMS
	char sym[KSYM_SYMBOL_LEN];
#endif

	if (fmt[1] == 'R')
		ptr = __builtin_extract_return_addr(ptr);
	value = (unsigned long)ptr;

#ifdef CONFIG_KALLSYMS
	if (*fmt == 'B' && fmt[1] == 'b')
		sprint_backtrace_build_id(sym, value);
	else if (*fmt == 'B')
		sprint_backtrace(sym, value);
	else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b')))
		sprint_symbol_build_id(sym, value);
	else if (*fmt != 's')
		sprint_symbol(sym, value);
	else
		sprint_symbol_no_offset(sym, value);

	return string_nocheck(buf, end, sym, spec);
#else
	return special_hex_number(buf, end, value, sizeof(void *));
#endif
}

/* Ready-made specs: plain string, no width or precision limit. */
static const struct printf_spec default_str_spec = {
	.field_width = -1,
	.precision = -1,
};

/* Lowercase hex with "0x" prefix. */
static const struct printf_spec default_flag_spec = {
	.base = 16,
	.precision = -1,
	.flags = SPECIAL | SMALL,
};

/* Plain decimal. */
static const struct printf_spec default_dec_spec = {
	.base = 10,
	.precision = -1,
};

/* Decimal, zero-padded to two digits. */
static const struct printf_spec default_dec02_spec = {
	.base = 10,
	.field_width = 2,
	.precision = -1,
	.flags = ZEROPAD,
};

/* Decimal, zero-padded to four digits. */
static const struct printf_spec default_dec04_spec = {
	.base = 10,
	.field_width = 4,
	.precision = -1,
	.flags = ZEROPAD,
};

/*
 * Print "start-end" using @spec for both numbers (which need not be hex;
 * the base comes from @spec), or just "start" when both values are equal.
 */
static noinline_for_stack
char *hex_range(char *buf, char *end, u64 start_val, u64 end_val,
		struct printf_spec spec)
{
	buf = number(buf, end, start_val, spec);
	if (start_val == end_val)
		return buf;

	if (buf < end)
		*buf = '-';
	++buf;
	return number(buf, end, end_val, spec);
}

/*
 * Print a struct resource as "[<type> <range or size> <flags>]".
 *
 * The text is assembled in the local sym[] buffer and then printed as a
 * string under @spec. With fmt[0] == 'R' the flags are decoded into
 * words (and an IORESOURCE_UNSET resource shows its size instead of a
 * range); otherwise the raw flags value is printed in hex. An
 * unrecognised resource type is never decoded.
 */
static noinline_for_stack
char *resource_string(char *buf, char *end, struct resource *res,
		      struct printf_spec spec, const char *fmt)
{
#ifndef IO_RSRC_PRINTK_SIZE
#define IO_RSRC_PRINTK_SIZE	6
#endif

#ifndef MEM_RSRC_PRINTK_SIZE
#define MEM_RSRC_PRINTK_SIZE	10
#endif
	static const struct printf_spec io_spec = {
		.base = 16,
		.field_width = IO_RSRC_PRINTK_SIZE,
		.precision = -1,
		.flags = SPECIAL | SMALL | ZEROPAD,
	};
	static const struct printf_spec mem_spec = {
		.base = 16,
		.field_width = MEM_RSRC_PRINTK_SIZE,
		.precision = -1,
		.flags = SPECIAL | SMALL | ZEROPAD,
	};
	static const struct printf_spec bus_spec = {
		.base = 16,
		.field_width = 2,
		.precision = -1,
		.flags = SMALL | ZEROPAD,
	};
	static const struct printf_spec str_spec = {
		.field_width = -1,
		.precision = 10,
		.flags = LEFT,
	};

	/* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8)
	 * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */
#define RSRC_BUF_SIZE		((2 * sizeof(resource_size_t)) + 4)
#define FLAG_BUF_SIZE		(2 * sizeof(res->flags))
#define DECODED_BUF_SIZE	sizeof("[mem - 64bit pref window disabled]")
#define RAW_BUF_SIZE		sizeof("[mem - flags 0x]")
	/* sized for the longer of the decoded and the raw-flags layout */
	char sym[MAX(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE,
		     2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)];

	char *p = sym, *pend = sym + sizeof(sym);
	int decode = (fmt[0] == 'R') ? 1 : 0;
	const struct printf_spec *specp;

	if (check_pointer(&buf, end, res, spec))
		return buf;

	*p++ = '[';
	/* the type tag also selects how the numbers are formatted */
	if (res->flags & IORESOURCE_IO) {
		p = string_nocheck(p, pend, "io ", str_spec);
		specp = &io_spec;
	} else if (res->flags & IORESOURCE_MEM) {
		p = string_nocheck(p, pend, "mem ", str_spec);
		specp = &mem_spec;
	} else if (res->flags & IORESOURCE_IRQ) {
		p = string_nocheck(p, pend, "irq ", str_spec);
		specp = &default_dec_spec;
	} else if (res->flags & IORESOURCE_DMA) {
		p = string_nocheck(p, pend, "dma ", str_spec);
		specp = &default_dec_spec;
	} else if (res->flags & IORESOURCE_BUS) {
		p = string_nocheck(p, pend, "bus ", str_spec);
		specp = &bus_spec;
	} else {
		p = string_nocheck(p, pend, "??? ", str_spec);
		specp = &mem_spec;
		decode = 0;
	}
	if (decode && res->flags & IORESOURCE_UNSET) {
		p = string_nocheck(p, pend, "size ", str_spec);
		p = number(p, pend, resource_size(res), *specp);
	} else {
		p = hex_range(p, pend, res->start, res->end, *specp);
	}
	if (decode) {
		if (res->flags & IORESOURCE_MEM_64)
			p = string_nocheck(p, pend, " 64bit", str_spec);
		if (res->flags & IORESOURCE_PREFETCH)
			p = string_nocheck(p, pend, " pref", str_spec);
		if (res->flags & IORESOURCE_WINDOW)
			p = string_nocheck(p, pend, " window", str_spec);
		if (res->flags & IORESOURCE_DISABLED)
			p = string_nocheck(p, pend, " disabled", str_spec);
	} else {
		p = string_nocheck(p, pend, " flags ", str_spec);
		p = number(p, pend, res->flags, default_flag_spec);
	}
	*p++ = ']';
	*p = '\0';

	return string_nocheck(buf, end, sym, spec);
}

/*
 * Print a struct range as "[range 0x<start>-0x<end>]", each bound
 * zero-padded to the full width of the field. The text is built in a
 * local buffer sized by the sample string below and then printed under
 * @spec.
 */
static noinline_for_stack
char *range_string(char *buf, char *end, const struct range *range,
		   struct printf_spec spec, const char *fmt)
{
	char sym[sizeof("[range 0x0123456789abcdef-0x0123456789abcdef]")];
	char *p = sym, *pend = sym + sizeof(sym);

	struct printf_spec range_spec = {
		.field_width = 2 + 2 * sizeof(range->start), /* 0x + 2 * 8 */
		.flags = SPECIAL | SMALL | ZEROPAD,
		.base = 16,
		.precision = -1,
	};

	if (check_pointer(&buf, end, range, spec))
		return buf;

	p = string_nocheck(p, pend, "[range ", default_str_spec);
	p = hex_range(p, pend, range->start, range->end, range_spec);
	*p++ = ']';
	*p = '\0';

	return string_nocheck(buf, end, sym, spec);
}

/*
 * Dump bytes at @addr as two hex digits each. The field width gives the
 * byte count (capped at 64, one byte when no width was given; a width of
 * 0 prints nothing). fmt[1] picks the separator: 'C' ':', 'D' '-',
 * 'N' none, anything else a space.
 */
static noinline_for_stack
char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
		 const char *fmt)
{
	int i, len = 1;		/* if we pass '%ph[CDN]', field width remains
				   negative value, fallback to the default */
	char separator;

	if (spec.field_width == 0)
		/* nothing to print */
		return buf;

	if (check_pointer(&buf, end, addr, spec))
		return buf;

	switch (fmt[1]) {
	case 'C':
		separator = ':';
		break;
	case 'D':
		separator = '-';
		break;
	case 'N':
		separator = 0;
		break;
	default:
		separator = ' ';
		break;
	}

	if (spec.field_width > 0)
		len = min_t(int, spec.field_width, 64);

	for (i = 0; i < len; ++i) {
		if (buf < end)
			*buf = hex_asc_hi(addr[i]);
		++buf;
		if (buf < end)
			*buf = hex_asc_lo(addr[i]);
		++buf;

		/* no separator after the last byte */
		if (separator && i != len - 1) {
			if (buf < end)
				*buf = separator;
			++buf;
		}
	}

	return buf;
}

/*
 * Print a bitmap as comma-separated hex chunks of up to 32 bits, highest
 * chunk first. The field width carries the number of bits; when it is not
 * a multiple of 32 the leading chunk is the short one.
 */
static noinline_for_stack
char *bitmap_string(char *buf, char *end, const unsigned long *bitmap,
		    struct printf_spec spec, const char *fmt)
{
	const int CHUNKSZ = 32;
	int nr_bits = max_t(int, spec.field_width, 0);
	int i, chunksz;
	bool first = true;

	if (check_pointer(&buf, end, bitmap, spec))
		return buf;

	/* reused to print numbers */
	spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 };

	/* size of the first (topmost) chunk; all later ones are full */
	chunksz = nr_bits & (CHUNKSZ - 1);
	if (chunksz == 0)
		chunksz = CHUNKSZ;

	i = ALIGN(nr_bits, CHUNKSZ) - CHUNKSZ;
	for (; i >= 0; i -= CHUNKSZ) {
		u32 chunkmask, val;
		int word, bit;

		chunkmask = ((1ULL << chunksz) - 1);
		word = i / BITS_PER_LONG;
		bit = i % BITS_PER_LONG;
		val = (bitmap[word] >> bit) & chunkmask;

		if (!first) {
			if (buf < end)
				*buf = ',';
			buf++;
		}
		first = false;

		/* one hex digit per (started) group of four bits */
		spec.field_width = DIV_ROUND_UP(chunksz, 4);
		buf = number(buf, end, val, spec);

		chunksz = CHUNKSZ;
	}
	return buf;
}

/*
 * Print a bitmap as a comma-separated list of set bits in decimal, with
 * runs collapsed to "first-last". The field width carries the number of
 * bits.
 */
static noinline_for_stack
char *bitmap_list_string(char *buf, char *end, const unsigned long *bitmap,
			 struct printf_spec spec, const char *fmt)
{
	int nr_bits = max_t(int, spec.field_width, 0);
	bool first = true;
	int rbot, rtop;

	if (check_pointer(&buf, end, bitmap, spec))
		return buf;

	/* as used below, rtop is one past the last set bit of the run */
	for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) {
		if (!first) {
			if (buf < end)
				*buf = ',';
			buf++;
		}
		first = false;

		buf = number(buf, end, rbot, default_dec_spec);
		/* a single bit gets no "-last" part */
		if (rtop == rbot + 1)
			continue;

		if (buf < end)
			*buf = '-';
		buf = number(++buf, end, rtop - 1, default_dec_spec);
	}
	return buf;
}

static noinline_for_stack
char *mac_address_string(char *buf, char *end, u8 *addr,
			 struct printf_spec spec, const char *fmt)
{
	char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")];
	char *p = mac_addr;
	int i;
	char separator;
	bool reversed = false;

	if (check_pointer(&buf, end, addr, spec))
return buf; switch (fmt[1]) { case 'F': separator = '-'; break; case 'R': reversed = true; fallthrough; default: separator = ':'; break; } for (i = 0; i < 6; i++) { if (reversed) p = hex_byte_pack(p, addr[5 - i]); else p = hex_byte_pack(p, addr[i]); if (fmt[0] == 'M' && i != 5) *p++ = separator; } *p = '\0'; return string_nocheck(buf, end, mac_addr, spec); } static noinline_for_stack char *ip4_string(char *p, const u8 *addr, const char *fmt) { int i; bool leading_zeros = (fmt[0] == 'i'); int index; int step; switch (fmt[2]) { case 'h': #ifdef __BIG_ENDIAN index = 0; step = 1; #else index = 3; step = -1; #endif break; case 'l': index = 3; step = -1; break; case 'n': case 'b': default: index = 0; step = 1; break; } for (i = 0; i < 4; i++) { char temp[4] __aligned(2); /* hold each IP quad in reverse order */ int digits = put_dec_trunc8(temp, addr[index]) - temp; if (leading_zeros) { if (digits < 3) *p++ = '0'; if (digits < 2) *p++ = '0'; } /* reverse the digits in the quad */ while (digits--) *p++ = temp[digits]; if (i < 3) *p++ = '.'; index += step; } *p = '\0'; return p; } static noinline_for_stack char *ip6_compressed_string(char *p, const char *addr) { int i, j, range; unsigned char zerolength[8]; int longest = 1; int colonpos = -1; u16 word; u8 hi, lo; bool needcolon = false; bool useIPv4; struct in6_addr in6; memcpy(&in6, addr, sizeof(struct in6_addr)); useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6); memset(zerolength, 0, sizeof(zerolength)); if (useIPv4) range = 6; else range = 8; /* find position of longest 0 run */ for (i = 0; i < range; i++) { for (j = i; j < range; j++) { if (in6.s6_addr16[j] != 0) break; zerolength[i]++; } } for (i = 0; i < range; i++) { if (zerolength[i] > longest) { longest = zerolength[i]; colonpos = i; } } if (longest == 1) /* don't compress a single 0 */ colonpos = -1; /* emit address */ for (i = 0; i < range; i++) { if (i == colonpos) { if (needcolon || i == 0) *p++ = ':'; *p++ = ':'; needcolon = false; i += longest 
- 1; continue; } if (needcolon) { *p++ = ':'; needcolon = false; } /* hex u16 without leading 0s */ word = ntohs(in6.s6_addr16[i]); hi = word >> 8; lo = word & 0xff; if (hi) { if (hi > 0x0f) p = hex_byte_pack(p, hi); else *p++ = hex_asc_lo(hi); p = hex_byte_pack(p, lo); } else if (lo > 0x0f) p = hex_byte_pack(p, lo); else *p++ = hex_asc_lo(lo); needcolon = true; } if (useIPv4) { if (needcolon) *p++ = ':'; p = ip4_string(p, &in6.s6_addr[12], "I4"); } *p = '\0'; return p; } static noinline_for_stack char *ip6_string(char *p, const char *addr, const char *fmt) { int i; for (i = 0; i < 8; i++) { p = hex_byte_pack(p, *addr++); p = hex_byte_pack(p, *addr++); if (fmt[0] == 'I' && i != 7) *p++ = ':'; } *p = '\0'; return p; } static noinline_for_stack char *ip6_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")]; if (fmt[0] == 'I' && fmt[2] == 'c') ip6_compressed_string(ip6_addr, addr); else ip6_string(ip6_addr, addr, fmt); return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char ip4_addr[sizeof("255.255.255.255")]; ip4_string(ip4_addr, addr, fmt); return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, struct printf_spec spec, const char *fmt) { bool have_p = false, have_s = false, have_f = false, have_c = false; char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") + sizeof(":12345") + sizeof("/123456789") + sizeof("%1234567890")]; char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr); const u8 *addr = (const u8 *) &sa->sin6_addr; char fmt6[2] = { fmt[0], '6' }; u8 off = 0; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'f': have_f = true; break; case 's': have_s = true; break; 
case 'c': have_c = true; break; } } if (have_p || have_s || have_f) { *p = '['; off = 1; } if (fmt6[0] == 'I' && have_c) p = ip6_compressed_string(ip6_addr + off, addr); else p = ip6_string(ip6_addr + off, addr, fmt6); if (have_p || have_s || have_f) *p++ = ']'; if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin6_port), spec); } if (have_f) { *p++ = '/'; p = number(p, pend, ntohl(sa->sin6_flowinfo & IPV6_FLOWINFO_MASK), spec); } if (have_s) { *p++ = '%'; p = number(p, pend, sa->sin6_scope_id, spec); } *p = '\0'; return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, struct printf_spec spec, const char *fmt) { bool have_p = false; char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")]; char *pend = ip4_addr + sizeof(ip4_addr); const u8 *addr = (const u8 *) &sa->sin_addr.s_addr; char fmt4[3] = { fmt[0], '4', 0 }; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'h': case 'l': case 'n': case 'b': fmt4[2] = *fmt; break; } } p = ip4_string(ip4_addr, addr, fmt4); if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin_port), spec); } *p = '\0'; return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip_addr_string(char *buf, char *end, const void *ptr, struct printf_spec spec, const char *fmt) { char *err_fmt_msg; if (check_pointer(&buf, end, ptr, spec)) return buf; switch (fmt[1]) { case '6': return ip6_addr_string(buf, end, ptr, spec, fmt); case '4': return ip4_addr_string(buf, end, ptr, spec, fmt); case 'S': { const union { struct sockaddr raw; struct sockaddr_in v4; struct sockaddr_in6 v6; } *sa = ptr; switch (sa->raw.sa_family) { case AF_INET: return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt); case AF_INET6: return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt); default: return error_string(buf, end, "(einval)", spec); }} } err_fmt_msg = fmt[0] == 'i' ? 
"(%pi?)" : "(%pI?)"; return error_string(buf, end, err_fmt_msg, spec); } static noinline_for_stack char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { bool found = true; int count = 1; unsigned int flags = 0; int len; if (spec.field_width == 0) return buf; /* nothing to print */ if (check_pointer(&buf, end, addr, spec)) return buf; do { switch (fmt[count++]) { case 'a': flags |= ESCAPE_ANY; break; case 'c': flags |= ESCAPE_SPECIAL; break; case 'h': flags |= ESCAPE_HEX; break; case 'n': flags |= ESCAPE_NULL; break; case 'o': flags |= ESCAPE_OCTAL; break; case 'p': flags |= ESCAPE_NP; break; case 's': flags |= ESCAPE_SPACE; break; default: found = false; break; } } while (found); if (!flags) flags = ESCAPE_ANY_NP; len = spec.field_width < 0 ? 1 : spec.field_width; /* * string_escape_mem() writes as many characters as it can to * the given buffer, and returns the total size of the output * had the buffer been big enough. */ buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL); return buf; } #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wsuggest-attribute=format" #endif static char *va_format(char *buf, char *end, struct va_format *va_fmt, struct printf_spec spec) { va_list va; if (check_pointer(&buf, end, va_fmt, spec)) return buf; va_copy(va, *va_fmt->va); buf += vsnprintf(buf, end > buf ? 
end - buf : 0, va_fmt->fmt, va); va_end(va); return buf; } #pragma GCC diagnostic pop static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; const u8 *index = uuid_index; bool uc = false; if (check_pointer(&buf, end, addr, spec)) return buf; switch (*(++fmt)) { case 'L': uc = true; fallthrough; case 'l': index = guid_index; break; case 'B': uc = true; break; } for (i = 0; i < 16; i++) { if (uc) p = hex_byte_pack_upper(p, addr[index[i]]); else p = hex_byte_pack(p, addr[index[i]]); switch (i) { case 3: case 5: case 7: case 9: *p++ = '-'; break; } } *p = 0; return string_nocheck(buf, end, uuid, spec); } static noinline_for_stack char *netdev_bits(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'F': num = *(const netdev_features_t *)addr; size = sizeof(netdev_features_t); break; default: return error_string(buf, end, "(%pN?)", spec); } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *fourcc_string(char *buf, char *end, const u32 *fourcc, struct printf_spec spec, const char *fmt) { char output[sizeof("0123 little-endian (0x01234567)")]; char *p = output; unsigned int i; bool pixel_fmt = false; u32 orig, val; if (fmt[1] != 'c') return error_string(buf, end, "(%p4?)", spec); if (check_pointer(&buf, end, fourcc, spec)) return buf; orig = get_unaligned(fourcc); switch (fmt[2]) { case 'h': if (fmt[3] == 'R') orig = swab32(orig); break; case 'l': orig = (__force u32)cpu_to_le32(orig); break; case 'b': orig = (__force u32)cpu_to_be32(orig); break; case 'c': /* Pixel formats are printed LSB-first */ pixel_fmt = true; break; default: return error_string(buf, end, "(%p4?)", spec); } val = pixel_fmt ? 
swab32(orig & ~BIT(31)) : orig; for (i = 0; i < sizeof(u32); i++) { unsigned char c = val >> ((3 - i) * 8); /* Print non-control ASCII characters as-is, dot otherwise */ *p++ = isascii(c) && isprint(c) ? c : '.'; } if (pixel_fmt) { *p++ = ' '; strcpy(p, orig & BIT(31) ? "big-endian" : "little-endian"); p += strlen(p); } *p++ = ' '; *p++ = '('; p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32)); *p++ = ')'; *p = '\0'; return string(buf, end, output, spec); } static noinline_for_stack char *address_val(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'd': num = *(const dma_addr_t *)addr; size = sizeof(dma_addr_t); break; case 'p': default: num = *(const phys_addr_t *)addr; size = sizeof(phys_addr_t); break; } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r) { int year = tm->tm_year + (r ? 0 : 1900); int mon = tm->tm_mon + (r ? 
0 : 1); buf = number(buf, end, year, default_dec04_spec); if (buf < end) *buf = '-'; buf++; buf = number(buf, end, mon, default_dec02_spec); if (buf < end) *buf = '-'; buf++; return number(buf, end, tm->tm_mday, default_dec02_spec); } static noinline_for_stack char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r) { buf = number(buf, end, tm->tm_hour, default_dec02_spec); if (buf < end) *buf = ':'; buf++; buf = number(buf, end, tm->tm_min, default_dec02_spec); if (buf < end) *buf = ':'; buf++; return number(buf, end, tm->tm_sec, default_dec02_spec); } static noinline_for_stack char *rtc_str(char *buf, char *end, const struct rtc_time *tm, struct printf_spec spec, const char *fmt) { bool have_t = true, have_d = true; bool raw = false, iso8601_separator = true; bool found = true; int count = 2; if (check_pointer(&buf, end, tm, spec)) return buf; switch (fmt[count]) { case 'd': have_t = false; count++; break; case 't': have_d = false; count++; break; } do { switch (fmt[count++]) { case 'r': raw = true; break; case 's': iso8601_separator = false; break; default: found = false; break; } } while (found); if (have_d) buf = date_str(buf, end, tm, raw); if (have_d && have_t) { if (buf < end) *buf = iso8601_separator ? 
'T' : ' '; buf++; } if (have_t) buf = time_str(buf, end, tm, raw); return buf; } static noinline_for_stack char *time64_str(char *buf, char *end, const time64_t time, struct printf_spec spec, const char *fmt) { struct rtc_time rtc_time; struct tm tm; time64_to_tm(time, 0, &tm); rtc_time.tm_sec = tm.tm_sec; rtc_time.tm_min = tm.tm_min; rtc_time.tm_hour = tm.tm_hour; rtc_time.tm_mday = tm.tm_mday; rtc_time.tm_mon = tm.tm_mon; rtc_time.tm_year = tm.tm_year; rtc_time.tm_wday = tm.tm_wday; rtc_time.tm_yday = tm.tm_yday; rtc_time.tm_isdst = 0; return rtc_str(buf, end, &rtc_time, spec, fmt); } static noinline_for_stack char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { switch (fmt[1]) { case 'R': return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt); case 'T': return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt); default: return error_string(buf, end, "(%pt?)", spec); } } static noinline_for_stack char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, const char *fmt) { if (!IS_ENABLED(CONFIG_HAVE_CLK)) return error_string(buf, end, "(%pC?)", spec); if (check_pointer(&buf, end, clk, spec)) return buf; #ifdef CONFIG_COMMON_CLK return string(buf, end, __clk_get_name(clk), spec); #else return ptr_to_id(buf, end, clk, spec); #endif } static char *format_flags(char *buf, char *end, unsigned long flags, const struct trace_print_flags *names) { unsigned long mask; for ( ; flags && names->name; names++) { mask = names->mask; if ((flags & mask) != mask) continue; buf = string(buf, end, names->name, default_str_spec); flags &= ~mask; if (flags) { if (buf < end) *buf = '|'; buf++; } } if (flags) buf = number(buf, end, flags, default_flag_spec); return buf; } struct page_flags_fields { int width; int shift; int mask; const struct printf_spec *spec; const char *name; }; static const struct page_flags_fields pff[] = { {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, &default_dec_spec, "section"}, 
{NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, &default_dec_spec, "node"}, {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, &default_dec_spec, "zone"}, {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, &default_flag_spec, "lastcpupid"}, {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, &default_flag_spec, "kasantag"}, };

/*
 * Print a page flags word as "<hex>(<names>|<field>=<value>|...)".
 *
 * The whole word is printed in hex first. Inside the parentheses come the
 * names of the bits selected by PAGEFLAGS_MASK (via format_flags()),
 * followed by one "name=value" entry for every pff[] field whose width is
 * non-zero.
 */
static
char *format_page_flags(char *buf, char *end, unsigned long flags)
{
	unsigned long main_flags = flags & PAGEFLAGS_MASK;
	bool append = false;	/* true once something was printed inside "(...)" */
	int i;

	buf = number(buf, end, flags, default_flag_spec);
	if (buf < end)
		*buf = '(';
	buf++;

	/* Page flags from the main area. */
	if (main_flags) {
		buf = format_flags(buf, end, main_flags, pageflag_names);
		append = true;
	}

	/* Page flags from the fields area */
	for (i = 0; i < ARRAY_SIZE(pff); i++) {
		/* Skip undefined fields. */
		if (!pff[i].width)
			continue;

		/* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */
		if (append) {
			if (buf < end)
				*buf = '|';
			buf++;
		}

		buf = string(buf, end, pff[i].name, default_str_spec);
		if (buf < end)
			*buf = '=';
		buf++;
		buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask,
			     *pff[i].spec);

		append = true;
	}
	if (buf < end)
		*buf = ')';
	buf++;

	return buf;
}

/*
 * Print a flags word, passed by reference, symbolically.
 *
 * fmt[1] selects the kind: 'p' page flags (unsigned long), 'v' vma flags
 * (unsigned long, names from vmaflag_names), 'g' gfp flags (gfp_t, names
 * from gfpflag_names). Anything else prints "(%pG?)".
 */
static noinline_for_stack
char *flags_string(char *buf, char *end, void *flags_ptr,
		   struct printf_spec spec, const char *fmt)
{
	unsigned long flags;
	const struct trace_print_flags *names;

	if (check_pointer(&buf, end, flags_ptr, spec))
		return buf;

	switch (fmt[1]) {
	case 'p':
		return format_page_flags(buf, end, *(unsigned long *)flags_ptr);
	case 'v':
		flags = *(unsigned long *)flags_ptr;
		names = vmaflag_names;
		break;
	case 'g':
		flags = (__force unsigned long)(*(gfp_t *)flags_ptr);
		names = gfpflag_names;
		break;
	default:
		return error_string(buf, end, "(%pG?)", spec);
	}

	return format_flags(buf, end, flags, names);
}

/*
 * Print the full name of a firmware node: for every node from the most
 * distant parent down to fwnode itself, its name prefix followed by its
 * name. Note the argument order: fwnode comes first, then buf and end.
 */
static noinline_for_stack
char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf,
			      char *end)
{
	int depth;

	/* Loop starting from the root node to the current node. */
	for (depth = fwnode_count_parents(fwnode); depth >= 0; depth--) {
		/*
		 * Only get a reference for other nodes (i.e. parent nodes).
		 * fwnode refcount may be 0 here.
		 */
		struct fwnode_handle *__fwnode = depth ?
			fwnode_get_nth_parent(fwnode, depth) : fwnode;

		buf = string(buf, end, fwnode_get_name_prefix(__fwnode),
			     default_str_spec);
		buf = string(buf, end, fwnode_get_name(__fwnode),
			     default_str_spec);

		/* drop the reference taken for a parent above */
		if (depth)
			fwnode_handle_put(__fwnode);
	}

	return buf;
}

/*
 * Print properties of a device tree node.
 *
 * fmt[0] must be 'F' (else "(%pO?)" is printed) and CONFIG_OF must be
 * enabled (else "(%pOF?)"). Each following character from "fnpPFcC"
 * selects one item; consecutive items are separated by ':'. When the
 * first following character is not one of those, the full name ('f') is
 * printed.
 *
 * The items are printed with the caller's spec minus its field width; the
 * field width is applied once to the combined output by widen_string().
 */
static noinline_for_stack
char *device_node_string(char *buf, char *end, struct device_node *dn,
			 struct printf_spec spec, const char *fmt)
{
	char tbuf[sizeof("xxxx") + 1];
	const char *p;
	int ret;
	char *buf_start = buf;	/* start of our output, for widen_string() */
	struct property *prop;
	bool has_mult, pass;

	struct printf_spec str_spec = spec;
	str_spec.field_width = -1;

	if (fmt[0] != 'F')
		return error_string(buf, end, "(%pO?)", spec);

	if (!IS_ENABLED(CONFIG_OF))
		return error_string(buf, end, "(%pOF?)", spec);

	if (check_pointer(&buf, end, dn, spec))
		return buf;

	/* simple case without any more format specifiers */
	fmt++;
	if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0)
		fmt = "f";

	/* 'pass' is false for the first item only, so ':' goes between items */
	for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) {
		int precision;
		if (pass) {
			if (buf < end)
				*buf = ':';
			buf++;
		}

		switch (*fmt) {
		case 'f':	/* full_name */
			buf = fwnode_full_name_string(of_fwnode_handle(dn), buf,
						      end);
			break;
		case 'n':	/* name */
			p = fwnode_get_name(of_fwnode_handle(dn));
			/* temporarily limit the precision to the part before '@' */
			precision = str_spec.precision;
			str_spec.precision = strchrnul(p, '@') - p;
			buf = string(buf, end, p, str_spec);
			str_spec.precision = precision;
			break;
		case 'p':	/* phandle */
			buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec);
			break;
		case 'P':	/* path-spec */
			p = fwnode_get_name(of_fwnode_handle(dn));
			/* a name of at most one character is printed as "/" */
			if (!p[1])
				p = "/";
			buf = string(buf, end, p, str_spec);
			break;
		case 'F':	/* flags */
			tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-';
			tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-';
			tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-';
			tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-';
			tbuf[4] = 0;
			buf = string_nocheck(buf, end, tbuf, str_spec);
			break;
		case 'c':	/* major compatible string */
			ret = of_property_read_string(dn, "compatible", &p);
			if (!ret)
				buf = string(buf, end, p, str_spec);
			break;
		case 'C':	/* full compatible string */
			has_mult = false;
			/* every entry is quoted; entries are separated by ',' */
			of_property_for_each_string(dn, "compatible", prop, p) {
				if (has_mult)
					buf = string_nocheck(buf, end, ",", str_spec);
				buf = string_nocheck(buf, end, "\"", str_spec);
				buf = string(buf, end, p, str_spec);
				buf = string_nocheck(buf, end, "\"", str_spec);

				has_mult = true;
			}
			break;
		default:
			break;
		}
	}

	return widen_string(buf, buf - buf_start, end, spec);
}

/*
 * Print the name of a firmware node.
 *
 * fmt[0] must be 'w', else "(%pf?)" is printed. fmt[1] == 'P' prints only
 * the node's own name; 'f' and everything else print the full name. The
 * field width of spec is applied to the combined output by widen_string().
 */
static noinline_for_stack
char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode,
		    struct printf_spec spec, const char *fmt)
{
	struct printf_spec str_spec = spec;
	char *buf_start = buf;	/* start of our output, for widen_string() */

	str_spec.field_width = -1;

	if (*fmt != 'w')
		return error_string(buf, end, "(%pf?)", spec);

	if (check_pointer(&buf, end, fwnode, spec))
		return buf;

	fmt++;

	switch (*fmt) {
	case 'P':	/* name */
		buf = string(buf, end, fwnode_get_name(fwnode), str_spec);
		break;
	case 'f':	/* full_name */
	default:
		buf = fwnode_full_name_string(fwnode, buf, end);
		break;
	}

	return widen_string(buf, buf - buf_start, end, spec);
}

/*
 * "ra" prints ptr as a struct range; every other format that reaches this
 * function prints it as a struct resource.
 */
static noinline_for_stack
char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr,
			struct printf_spec spec)
{
	if (*fmt == 'r' && fmt[1] == 'a')
		return range_string(buf, end, ptr, spec, fmt);
	return resource_string(buf, end, ptr, spec, fmt);
}

int __init no_hash_pointers_enable(char *str) { if (no_hash_pointers) return 0; no_hash_pointers = true; pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("** **\n"); pr_warn("** This system shows unhashed kernel memory addresses **\n"); pr_warn("** via the console, logs, and other interfaces. 
This **\n"); pr_warn("** might reduce the security of your system. **\n"); pr_warn("** **\n"); pr_warn("** If you see this message and you are not debugging **\n"); pr_warn("** the kernel, report this immediately to your system **\n"); pr_warn("** administrator! **\n"); pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); return 0; } early_param("no_hash_pointers", no_hash_pointers_enable); /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format * specifiers. * * Please update scripts/checkpatch.pl when adding/removing conversion * characters. (Search for "check for vsprintf extension"). * * Right now we handle: * * - 'S' For symbolic direct pointers (or function descriptors) with offset * - 's' For symbolic direct pointers (or function descriptors) without offset * - '[Ss]R' as above with __builtin_extract_return_addr() translation * - 'S[R]b' as above with module build ID (for use in backtraces) * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of * %ps and %pS. Be careful when re-using these specifiers. 
* - 'B' For backtraced symbolic direct pointers with offset * - 'Bb' as above with module build ID (for use in backtraces) * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] * - 'ra' For struct ranges, e.g., [range 0x0000000000000000-0x00000000000000ff] * - 'b[l]' For a bitmap, the number of bits is determined by the field * width which must be explicitly specified either as part of the * format string '%32b[l]' or through '%*b[l]', [l] selects * range-list format instead of hex format * - 'M' For a 6-byte MAC address, it prints the address in the * usual colon-separated hex notation * - 'm' For a 6-byte MAC address, it prints the hex address without colons * - 'MF' For a 6-byte MAC FDDI address, it prints the address * with a dash-separated hex notation * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth) * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way * IPv4 uses dot-separated decimal without leading 0's (1.2.3.4) * IPv6 uses colon separated network-order 16 bit hex with leading 0's * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - 'i' [46] for 'raw' IPv4/IPv6 addresses * IPv6 omits the colons (01020304...0f) * IPv4 uses dot-separated decimal with leading 0's (010.123.045.006) * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order * - 'I[6S]c' for IPv6 addresses printed as specified by * https://tools.ietf.org/html/rfc5952 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination * of the following flags (see string_escape_mem() for the * details): * a - ESCAPE_ANY * c - ESCAPE_SPECIAL * h - ESCAPE_HEX * n - ESCAPE_NULL * o - ESCAPE_OCTAL * p - ESCAPE_NP * s - 
ESCAPE_SPACE * By default ESCAPE_ANY_NP is used. * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form * "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" * Options for %pU are: * b big endian lower case hex (default) * B big endian UPPER case hex * l little endian lower case hex * L little endian UPPER case hex * big endian output byte order is: * [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15] * little endian output byte order is: * [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15] * - 'V' For a struct va_format which contains a format string * and va_list *, * call vsnprintf(->format, *->va_list). * Implements a "recursive vsnprintf". * Do not use this feature without some mechanism to verify the * correctness of the format string and va_list arguments. * - 'K' For a kernel pointer that should be hidden from unprivileged users. * Use only for procfs, sysfs and similar files, not printk(); please * read the documentation (path below) first. * - 'NF' For a netdev_features_t * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value. * - '4c[h[R]lb]' For generic FourCC code with raw numerical value. Both are * displayed in the big-endian format. This is the opposite of V4L2 or * DRM FourCCs. * The additional specifiers define what endianness is used to load * the stored bytes. The data might be interpreted using the host, * reversed host byte order, little-endian, or big-endian. * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with * a certain separator (' ' by default): * C colon * D dash * N no separator * The maximum supported length is 64 bytes of the input. Consider * to use print_hex_dump() for the larger input. 
* - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives * (default assumed to be phys_addr_t, passed by reference) * - 'd[234]' For a dentry name (optionally 2-4 last components) * - 'D[234]' Same as 'd' but for a struct file * - 'g' For block_device name (gendisk + partition number) * - 't[RT][dt][r][s]' For time and date as represented by: * R struct rtc_time * T time64_t * - 'C' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'G' For flags to be printed as a collection of symbolic strings that would * construct the specific value. Supported flags given by option: * p page flags (see struct page) given as pointer to unsigned long * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t * v vma flags (VM_*) given as pointer to unsigned long * - 'OF[fnpPcCF]' For a device tree object * Without any optional arguments prints the full_name * f device node full_name * n device node name * p device node phandle * P device node path spec (name + @unit) * F device node flags * c major compatible string * C full compatible string * - 'fw[fP]' For a firmware node (struct fwnode_handle) pointer * Without an option prints the full name of the node * f full name * P node name, including a possible unit address * - 'x' For printing the address unmodified. Equivalent to "%lx". * Please read the documentation (path below) before using! * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of * bpf_trace_printk() where [ku] prefix specifies either kernel (k) * or user (u) memory to probe, and: * s a string, equivalent to "%s" on direct vsnprintf() use * * ** When making changes please also update: * Documentation/core-api/printk-formats.rst * * Note: The default behaviour (unadorned %p) is to hash the address, * rendering it useful as a unique identifier. 
* * There is also a '%pA' format specifier, but it is only intended to be used * from Rust code to format core::fmt::Arguments. Do *not* use it from C. * See rust/kernel/print.rs for details. */
/*
 * Dispatch on the first extension character (*fmt) to the matching helper.
 * Characters without a case of their own end up in default_pointer().
 */
static noinline_for_stack
char *pointer(const char *fmt, char *buf, char *end, void *ptr,
	      struct printf_spec spec)
{
	switch (*fmt) {
	case 'S':
	case 's':
		/* resolve a function descriptor before the symbol lookup */
		ptr = dereference_symbol_descriptor(ptr);
		fallthrough;
	case 'B':
		return symbol_string(buf, end, ptr, spec, fmt);
	case 'R':
	case 'r':
		return resource_or_range(fmt, buf, end, ptr, spec);
	case 'h':
		return hex_string(buf, end, ptr, spec, fmt);
	case 'b':
		switch (fmt[1]) {
		case 'l':
			return bitmap_list_string(buf, end, ptr, spec, fmt);
		default:
			return bitmap_string(buf, end, ptr, spec, fmt);
		}
	case 'M':			/* Colon separated: 00:01:02:03:04:05 */
	case 'm':			/* Contiguous: 000102030405 */
					/* [mM]F (FDDI) */
					/* [mM]R (Reverse order; Bluetooth) */
		return mac_address_string(buf, end, ptr, spec, fmt);
	case 'I':			/* Formatted IP supported
					 * 4:	1.2.3.4
					 * 6:	0001:0203:...:0708
					 * 6c:	1::708 or 1::1.2.3.4
					 */
	case 'i':			/* Contiguous:
					 * 4:	001.002.003.004
					 * 6:   000102...0f
					 */
		return ip_addr_string(buf, end, ptr, spec, fmt);
	case 'E':
		return escaped_string(buf, end, ptr, spec, fmt);
	case 'U':
		return uuid_string(buf, end, ptr, spec, fmt);
	case 'V':
		return va_format(buf, end, ptr, spec);
	case 'K':
		return restricted_pointer(buf, end, ptr, spec);
	case 'N':
		return netdev_bits(buf, end, ptr, spec, fmt);
	case '4':
		return fourcc_string(buf, end, ptr, spec, fmt);
	case 'a':
		return address_val(buf, end, ptr, spec, fmt);
	case 'd':
		return dentry_name(buf, end, ptr, spec, fmt);
	case 't':
		return time_and_date(buf, end, ptr, spec, fmt);
	case 'C':
		return clock(buf, end, ptr, spec, fmt);
	case 'D':
		return file_dentry_name(buf, end, ptr, spec, fmt);
#ifdef CONFIG_BLOCK
	case 'g':
		return bdev_name(buf, end, ptr, spec, fmt);
#endif

	case 'G':
		return flags_string(buf, end, ptr, spec, fmt);
	case 'O':
		/* the helper is handed the format with the leading 'O' skipped */
		return device_node_string(buf, end, ptr, spec, fmt + 1);
	case 'f':
		/* the helper is handed the format with the leading 'f' skipped */
		return fwnode_string(buf, end, ptr, spec, fmt + 1);
	case 'A':
		if (!IS_ENABLED(CONFIG_RUST)) {
			WARN_ONCE(1, "Please remove %%pA from non-Rust code\n");
			return error_string(buf, end, "(%pA?)", spec);
		}
		return rust_fmt_argument(buf, end, ptr);
	case 'x':
		return pointer_string(buf, end, ptr, spec);
	case 'e':
		/* %pe with a non-ERR_PTR gets treated as plain %p */
		if (!IS_ERR(ptr))
			return default_pointer(buf, end, ptr, spec);
		return err_ptr(buf, end, ptr, spec);
	case 'u':
	case 'k':
		switch (fmt[1]) {
		case 's':
			return string(buf, end, ptr, spec);
		default:
			return error_string(buf, end, "(einval)", spec);
		}
	default:
		return default_pointer(buf, end, ptr, spec);
	}
}

/* Format-string cursor that format_decode() takes and returns by value. */
struct fmt {
	const char *str;
	unsigned char state;	// enum format_state
	unsigned char size;	// size of numbers
};

/* Designated initializer for the table slot of character x (index x - 32). */
#define SPEC_CHAR(x, flag) [(x)-32] = flag

/*
 * Map a flag character (' ', '#', '+', '-', '0') to its printf_spec flag
 * bit; every other character yields 0.
 */
static unsigned char spec_flag(unsigned char c)
{
	static const unsigned char spec_flag_array[] = {
		SPEC_CHAR(' ', SPACE),
		SPEC_CHAR('#', SPECIAL),
		SPEC_CHAR('+', PLUS),
		SPEC_CHAR('-', LEFT),
		SPEC_CHAR('0', ZEROPAD),
	};
	/* c is unsigned, so values below 32 wrap around and fail the bound check */
	c -= 32;
	return (c < sizeof(spec_flag_array)) ? spec_flag_array[c] : 0;
}

/* * Helper function to decode printf style format. * Each call decode a token from the format and return the * number of characters read (or likely the delta where it wants * to go on the next call). * The decoded token is returned through the parameters * * 'h', 'l', or 'L' for integer fields * 'z' support added 23/7/1999 S.H. * 'z' changed to 'Z' --davidm 1/25/99 * 'Z' changed to 'z' --adobriyan 2017-01-25 * 't' added for ptrdiff_t * * @fmt: the format string * @type of the token returned * @flags: various flags such as +, -, # tokens.. * @field_width: overwritten width * @base: base of the number (octal, hex, ...) * @precision: precision of a number * @qualifier: qualifier of a number (long, size_t, ...) 
*/ static noinline_for_stack struct fmt format_decode(struct fmt fmt, struct printf_spec *spec) { const char *start = fmt.str; char flag; /* we finished early by reading the field width */ if (unlikely(fmt.state == FORMAT_STATE_WIDTH)) { if (spec->field_width < 0) { spec->field_width = -spec->field_width; spec->flags |= LEFT; } fmt.state = FORMAT_STATE_NONE; goto precision; } /* we finished early by reading the precision */ if (unlikely(fmt.state == FORMAT_STATE_PRECISION)) { if (spec->precision < 0) spec->precision = 0; fmt.state = FORMAT_STATE_NONE; goto qualifier; } /* By default */ fmt.state = FORMAT_STATE_NONE; for (; *fmt.str ; fmt.str++) { if (*fmt.str == '%') break; } /* Return the current non-format string */ if (fmt.str != start || !*fmt.str) return fmt; /* Process flags. This also skips the first '%' */ spec->flags = 0; do { /* this also skips first '%' */ flag = spec_flag(*++fmt.str); spec->flags |= flag; } while (flag); /* get field width */ spec->field_width = -1; if (isdigit(*fmt.str)) spec->field_width = skip_atoi(&fmt.str); else if (unlikely(*fmt.str == '*')) { /* it's the next argument */ fmt.state = FORMAT_STATE_WIDTH; fmt.str++; return fmt; } precision: /* get the precision */ spec->precision = -1; if (unlikely(*fmt.str == '.')) { fmt.str++; if (isdigit(*fmt.str)) { spec->precision = skip_atoi(&fmt.str); if (spec->precision < 0) spec->precision = 0; } else if (*fmt.str == '*') { /* it's the next argument */ fmt.state = FORMAT_STATE_PRECISION; fmt.str++; return fmt; } } qualifier: /* Set up default numeric format */ spec->base = 10; fmt.state = FORMAT_STATE_NUM; fmt.size = sizeof(int); static const struct format_state { unsigned char state; unsigned char size; unsigned char flags_or_double_size; unsigned char base; } lookup_state[256] = { // Length ['l'] = { 0, sizeof(long), sizeof(long long) }, ['L'] = { 0, sizeof(long long) }, ['h'] = { 0, sizeof(short), sizeof(char) }, ['H'] = { 0, sizeof(char) }, // Questionable historical ['z'] = { 0, 
sizeof(size_t) }, ['t'] = { 0, sizeof(ptrdiff_t) }, // Non-numeric formats ['c'] = { FORMAT_STATE_CHAR }, ['s'] = { FORMAT_STATE_STR }, ['p'] = { FORMAT_STATE_PTR }, ['%'] = { FORMAT_STATE_PERCENT_CHAR }, // Numerics ['o'] = { FORMAT_STATE_NUM, 0, 0, 8 }, ['x'] = { FORMAT_STATE_NUM, 0, SMALL, 16 }, ['X'] = { FORMAT_STATE_NUM, 0, 0, 16 }, ['d'] = { FORMAT_STATE_NUM, 0, SIGN, 10 }, ['i'] = { FORMAT_STATE_NUM, 0, SIGN, 10 }, ['u'] = { FORMAT_STATE_NUM, 0, 0, 10, }, /* * Since %n poses a greater security risk than * utility, treat it as any other invalid or * unsupported format specifier. */ }; const struct format_state *p = lookup_state + (u8)*fmt.str; if (p->size) { fmt.size = p->size; if (p->flags_or_double_size && fmt.str[0] == fmt.str[1]) { fmt.size = p->flags_or_double_size; fmt.str++; } fmt.str++; p = lookup_state + *fmt.str; } if (p->state) { if (p->base) spec->base = p->base; spec->flags |= p->flags_or_double_size; fmt.state = p->state; fmt.str++; return fmt; } WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt.str); fmt.state = FORMAT_STATE_INVALID; return fmt; } static void set_field_width(struct printf_spec *spec, int width) { spec->field_width = width; if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) { spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX); } } static void set_precision(struct printf_spec *spec, int prec) { spec->precision = prec; if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) { spec->precision = clamp(prec, 0, PRECISION_MAX); } } /* * Turn a 1/2/4-byte value into a 64-bit one for printing: truncate * as necessary and deal with signedness. * * 'size' is the size of the value in bytes. 
*/
static unsigned long long convert_num_spec(unsigned int val, int size, struct printf_spec spec)
{
	/* Number of unused high bits in the 32-bit container */
	unsigned int shift = 32 - size*8;

	/* Push the value's top bit up to bit 31, discarding anything above it */
	val <<= shift;
	/* Unsigned: logical shift back, zero-extending */
	if (!(spec.flags & SIGN))
		return val >> shift;
	/* Signed: shift back as int so the sign bit is replicated */
	return (int)val >> shift;
}

/**
 * vsnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt_str: The format string to use
 * @args: Arguments for the format string
 *
 * This function generally follows C99 vsnprintf, but has some
 * extensions and a few limitations:
 *
 *  - ``%n`` is unsupported
 *  - ``%p*`` is handled by pointer()
 *
 * See pointer() or Documentation/core-api/printk-formats.rst for more
 * extensive description.
 *
 * **Please update the documentation in both places when making changes**
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 *
 * If @size is larger than INT_MAX, a warning is emitted once and 0 is
 * returned without writing anything to @buf.
 *
 * If you're not already dealing with a va_list consider using snprintf().
 */
int vsnprintf(char *buf, size_t size, const char *fmt_str, va_list args)
{
	char *str, *end;
	struct printf_spec spec = {0};
	struct fmt fmt = {
		.str = fmt_str,
		.state = FORMAT_STATE_NONE,
	};

	/* Reject out-of-range values early.  Large positive sizes are
	   used for unknown buffer sizes. */
	if (WARN_ON_ONCE(size > INT_MAX))
		return 0;

	/*
	 * str is the would-be write position and may run past end; every
	 * store below is guarded by "str < end", so the final str - buf is
	 * the untruncated length.
	 */
	str = buf;
	end = buf + size;

	/* Make sure end is always >= buf */
	if (end < buf) {
		end = ((void *)-1);
		size = end - buf;
	}

	while (*fmt.str) {
		const char *old_fmt = fmt.str;

		fmt = format_decode(fmt, &spec);

		switch (fmt.state) {
		case FORMAT_STATE_NONE: {
			/* Literal text between conversions: copy what fits */
			int read = fmt.str - old_fmt;
			if (str < end) {
				int copy = read;
				if (copy > end - str)
					copy = end - str;
				memcpy(str, old_fmt, copy);
			}
			str += read;
			continue;
		}

		case FORMAT_STATE_NUM: {
			unsigned long long num;
			/* Anything up to int size is fetched as int, then narrowed */
			if (fmt.size <= sizeof(int))
				num = convert_num_spec(va_arg(args, int), fmt.size, spec);
			else
				num = va_arg(args, long long);
			str = number(str, end, num, spec);
			continue;
		}

		/* '*' width/precision: fetch it, then decode the rest next round */
		case FORMAT_STATE_WIDTH:
			set_field_width(&spec, va_arg(args, int));
			continue;

		case FORMAT_STATE_PRECISION:
			set_precision(&spec, va_arg(args, int));
			continue;

		case FORMAT_STATE_CHAR: {
			char c;

			/* Right-justified: pad before the character */
			if (!(spec.flags & LEFT)) {
				while (--spec.field_width > 0) {
					if (str < end)
						*str = ' ';
					++str;

				}
			}
			c = (unsigned char) va_arg(args, int);
			if (str < end)
				*str = c;
			++str;
			/* Left-justified: whatever width is left pads after it */
			while (--spec.field_width > 0) {
				if (str < end)
					*str = ' ';
				++str;
			}
			continue;
		}

		case FORMAT_STATE_STR:
			str = string(str, end, va_arg(args, char *), spec);
			continue;

		case FORMAT_STATE_PTR:
			str = pointer(fmt.str, str, end, va_arg(args, void *),
				      spec);
			/* Skip the alphanumeric modifiers that followed "%p" */
			while (isalnum(*fmt.str))
				fmt.str++;
			continue;

		case FORMAT_STATE_PERCENT_CHAR:
			if (str < end)
				*str = '%';
			++str;
			continue;

		default:
			/*
			 * Presumably the arguments passed gcc's type
			 * checking, but there is no safe or sane way
			 * for us to continue parsing the format and
			 * fetching from the va_list; the remaining
			 * specifiers and arguments would be out of
			 * sync.
			 */
			goto out;
		}
	}

out:
	/* Always terminate, overwriting the last byte if the output was cut */
	if (size > 0) {
		if (str < end)
			*str = '\0';
		else
			end[-1] = '\0';
	}

	/* the trailing null byte doesn't count towards the total */
	return str-buf;

}
EXPORT_SYMBOL(vsnprintf);

/**
 * vscnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * The return value is the number of characters which have been written into
 * the @buf not including the trailing '\0'. If @size is == 0 the function
 * returns 0.
 *
 * If you're not already dealing with a va_list consider using scnprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
	int i;

	if (unlikely(!size))
		return 0;

	i = vsnprintf(buf, size, fmt, args);

	/* Everything fitted: the would-be length is the written length */
	if (likely(i < size))
		return i;

	/* Truncated: size - 1 characters plus the terminator were stored */
	return size - 1;
}
EXPORT_SYMBOL(vscnprintf);

/**
 * snprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * The return value is the number of characters which would be
 * generated for the given input, excluding the trailing null,
 * as per ISO C99.  If the return is greater than or equal to
 * @size, the resulting string is truncated.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int snprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int i;

	va_start(args, fmt);
	i = vsnprintf(buf, size, fmt, args);
	va_end(args);

	return i;
}
EXPORT_SYMBOL(snprintf);

/**
 * scnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * The return value is the number of characters written into @buf not including
 * the trailing '\0'. If @size is == 0 the function returns 0.
 */

int scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int i;

	va_start(args, fmt);
	i = vscnprintf(buf, size, fmt, args);
	va_end(args);

	return i;
}
EXPORT_SYMBOL(scnprintf);

/**
 * vsprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * The function returns the number of characters written
 * into @buf. Use vsnprintf() or vscnprintf() in order to avoid
 * buffer overflows.
 *
 * If you're not already dealing with a va_list consider using sprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vsprintf(char *buf, const char *fmt, va_list args)
{
	/* The buffer size is unknown here, so the largest accepted size is used */
	return vsnprintf(buf, INT_MAX, fmt, args);
}
EXPORT_SYMBOL(vsprintf);

/**
 * sprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * The function returns the number of characters written
 * into @buf. Use snprintf() or scnprintf() in order to avoid
 * buffer overflows.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int sprintf(char *buf, const char *fmt, ...)
{
	va_list args;
	int i;

	va_start(args, fmt);
	i = vsnprintf(buf, INT_MAX, fmt, args);
	va_end(args);

	return i;
}
EXPORT_SYMBOL(sprintf);

#ifdef CONFIG_BINARY_PRINTF
/*
 * bprintf service:
 * vbin_printf() - VA arguments to binary data
 * bstr_printf() - Binary data to text string
 */

/**
 * vbin_printf - Parse a format string and place args' binary value in a buffer
 * @bin_buf: The buffer to place args' binary value
 * @size: The size of the buffer(by words(32bits), not characters)
 * @fmt_str: The format string to use
 * @args: Arguments for the format string
 *
 * The format follows C99 vsnprintf, except that %n is unsupported: like any
 * other invalid conversion it ends the processing of the format string.
 *
 * The return value is the number of words(32bits) which would be generated for
 * the given input.
 *
 * NOTE:
 * If the return value is greater than @size, the resulting bin_buf is NOT
 * valid for bstr_printf().
 */
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt_str, va_list args)
{
	struct fmt fmt = {
		.str = fmt_str,
		.state = FORMAT_STATE_NONE,
	};
	struct printf_spec spec = {0};
	char *str, *end;
	int width;

	str = (char *)bin_buf;
	end = (char *)(bin_buf + size);

/*
 * Fetch the next argument and append it to the binary buffer.
 * 8-byte types are aligned to 4 bytes only and stored as two u32 halves;
 * smaller types are fetched as int and stored at their natural alignment.
 * str always advances, the store only happens when the value fits before
 * end. Evaluates to the fetched value.
 */
#define save_arg(type)							\
({									\
	unsigned long long value;					\
	if (sizeof(type) == 8) {					\
		unsigned long long val8;				\
		str = PTR_ALIGN(str, sizeof(u32));			\
		val8 = va_arg(args, unsigned long long);		\
		if (str + sizeof(type) <= end) {			\
			*(u32 *)str = *(u32 *)&val8;			\
			*(u32 *)(str + 4) = *((u32 *)&val8 + 1);	\
		}							\
		value = val8;						\
	} else {							\
		unsigned int val4;					\
		str = PTR_ALIGN(str, sizeof(type));			\
		val4 = va_arg(args, int);				\
		if (str + sizeof(type) <= end)				\
			*(typeof(type) *)str = (type)(long)val4;	\
		value = (unsigned long long)val4;			\
	}								\
	str += sizeof(type);						\
	value;								\
})

	while (*fmt.str) {
		fmt = format_decode(fmt, &spec);

		switch (fmt.state) {
		/* Literal text and "%%" consume no argument */
		case FORMAT_STATE_NONE:
		case FORMAT_STATE_PERCENT_CHAR:
			break;
		case FORMAT_STATE_INVALID:
			goto out;

		case FORMAT_STATE_WIDTH:
		case FORMAT_STATE_PRECISION:
			width = (int)save_arg(int);
			/* Pointers may require the width */
			if (*fmt.str == 'p')
				set_field_width(&spec, width);
			break;

		case FORMAT_STATE_CHAR:
			save_arg(char);
			break;

		case FORMAT_STATE_STR: {
			const char *save_str = va_arg(args, char *);
			const char *err_msg;
			size_t len;

			/* A bad pointer is replaced by the message describing it */
			err_msg = check_pointer_msg(save_str);
			if (err_msg)
				save_str = err_msg;

			/* The string is stored inline, terminator included */
			len = strlen(save_str) + 1;
			if (str + len < end)
				memcpy(str, save_str, len);
			str += len;
			break;
		}

		case FORMAT_STATE_PTR:
			/* Dereferenced pointers must be done now */
			switch (*fmt.str) {
			/* Dereference of functions is still OK */
			case 'S':
			case 's':
			case 'x':
			case 'K':
			case 'e':
				save_arg(void *);
				break;
			default:
				/* Plain "%p": store the raw pointer value */
				if (!isalnum(*fmt.str)) {
					save_arg(void *);
					break;
				}
				/*
				 * Any other extension is rendered to text right
				 * here and stored as a NUL-terminated string.
				 */
				str = pointer(fmt.str, str, end, va_arg(args, void *),
					      spec);
				if (str + 1 < end)
					*str++ = '\0';
				else
					end[-1] = '\0'; /* Must be nul terminated */
			}
			/* skip all alphanumeric pointer suffixes */
			while (isalnum(*fmt.str))
				fmt.str++;
			break;

		case FORMAT_STATE_NUM:
			if (fmt.size > sizeof(int)) {
				save_arg(long long);
			} else {
				save_arg(int);
			}
		}
	}

out:
	/* Round the byte position up to whole 32-bit words */
	return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
#undef save_arg
}
EXPORT_SYMBOL_GPL(vbin_printf);

/**
 * bstr_printf - Format a string from binary arguments and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt_str: The format string to use
 * @bin_buf: Binary arguments for the format string
 *
 * This function is like C99 vsnprintf, but the difference is that vsnprintf
 * gets arguments from the stack, and bstr_printf gets arguments from @bin_buf
 * which is a binary buffer that was generated by vbin_printf.
 *
 * The format follows C99 vsnprintf, but has some extensions:
 *  see vsnprintf comment for details.
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf().
If the * return is greater than or equal to @size, the resulting * string is truncated. */ int bstr_printf(char *buf, size_t size, const char *fmt_str, const u32 *bin_buf) { struct fmt fmt = { .str = fmt_str, .state = FORMAT_STATE_NONE, }; struct printf_spec spec = {0}; char *str, *end; const char *args = (const char *)bin_buf; if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; end = buf + size; #define get_arg(type) \ ({ \ typeof(type) value; \ if (sizeof(type) == 8) { \ args = PTR_ALIGN(args, sizeof(u32)); \ *(u32 *)&value = *(u32 *)args; \ *((u32 *)&value + 1) = *(u32 *)(args + 4); \ } else { \ args = PTR_ALIGN(args, sizeof(type)); \ value = *(typeof(type) *)args; \ } \ args += sizeof(type); \ value; \ }) /* Make sure end is always >= buf */ if (end < buf) { end = ((void *)-1); size = end - buf; } while (*fmt.str) { const char *old_fmt = fmt.str; unsigned long long num; fmt = format_decode(fmt, &spec); switch (fmt.state) { case FORMAT_STATE_NONE: { int read = fmt.str - old_fmt; if (str < end) { int copy = read; if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy); } str += read; continue; } case FORMAT_STATE_WIDTH: set_field_width(&spec, get_arg(int)); continue; case FORMAT_STATE_PRECISION: set_precision(&spec, get_arg(int)); continue; case FORMAT_STATE_CHAR: { char c; if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } } c = (unsigned char) get_arg(char); if (str < end) *str = c; ++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } continue; } case FORMAT_STATE_STR: { const char *str_arg = args; args += strlen(str_arg) + 1; str = string(str, end, (char *)str_arg, spec); continue; } case FORMAT_STATE_PTR: { bool process = false; int copy, len; /* Non function dereferences were already done */ switch (*fmt.str) { case 'S': case 's': case 'x': case 'K': case 'e': process = true; break; default: if (!isalnum(*fmt.str)) { process = true; break; } /* Pointer dereference was already 
processed */ if (str < end) { len = copy = strlen(args); if (copy > end - str) copy = end - str; memcpy(str, args, copy); str += len; args += len + 1; } } if (process) str = pointer(fmt.str, str, end, get_arg(void *), spec); while (isalnum(*fmt.str)) fmt.str++; continue; } case FORMAT_STATE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; continue; case FORMAT_STATE_INVALID: goto out; case FORMAT_STATE_NUM: if (fmt.size > sizeof(int)) { num = get_arg(long long); } else { num = convert_num_spec(get_arg(int), fmt.size, spec); } str = number(str, end, num, spec); continue; } } /* while(*fmt.str) */ out: if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } #undef get_arg /* the trailing null byte doesn't count towards the total */ return str - buf; } EXPORT_SYMBOL_GPL(bstr_printf); #endif /* CONFIG_BINARY_PRINTF */ /** * vsscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: format of buffer * @args: arguments */ int vsscanf(const char *buf, const char *fmt, va_list args) { const char *str = buf; char *next; char digit; int num = 0; u8 qualifier; unsigned int base; union { long long s; unsigned long long u; } val; s16 field_width; bool is_sign; while (*fmt) { /* skip any white space in format */ /* white space in format matches any amount of * white space, including none, in the input. */ if (isspace(*fmt)) { fmt = skip_spaces(++fmt); str = skip_spaces(str); } /* anything that is not a conversion must match exactly */ if (*fmt != '%' && *fmt) { if (*fmt++ != *str++) break; continue; } if (!*fmt) break; ++fmt; /* skip this conversion. 
* advance both strings to next white space */ if (*fmt == '*') { if (!*str) break; while (!isspace(*fmt) && *fmt != '%' && *fmt) { /* '%*[' not yet supported, invalid format */ if (*fmt == '[') return num; fmt++; } while (!isspace(*str) && *str) str++; continue; } /* get field width */ field_width = -1; if (isdigit(*fmt)) { field_width = skip_atoi(&fmt); if (field_width <= 0) break; } /* get conversion qualifier */ qualifier = -1; if (*fmt == 'h' || _tolower(*fmt) == 'l' || *fmt == 'z') { qualifier = *fmt++; if (unlikely(qualifier == *fmt)) { if (qualifier == 'h') { qualifier = 'H'; fmt++; } else if (qualifier == 'l') { qualifier = 'L'; fmt++; } } } if (!*fmt) break; if (*fmt == 'n') { /* return number of characters read so far */ *va_arg(args, int *) = str - buf; ++fmt; continue; } if (!*str) break; base = 10; is_sign = false; switch (*fmt++) { case 'c': { char *s = (char *)va_arg(args, char*); if (field_width == -1) field_width = 1; do { *s++ = *str++; } while (--field_width > 0 && *str); num++; } continue; case 's': { char *s = (char *)va_arg(args, char *); if (field_width == -1) field_width = SHRT_MAX; /* first, skip leading white space in buffer */ str = skip_spaces(str); /* now copy until next white space */ while (*str && !isspace(*str) && field_width--) *s++ = *str++; *s = '\0'; num++; } continue; /* * Warning: This implementation of the '[' conversion specifier * deviates from its glibc counterpart in the following ways: * (1) It does NOT support ranges i.e. '-' is NOT a special * character * (2) It cannot match the closing bracket ']' itself * (3) A field width is required * (4) '%*[' (discard matching input) is currently not supported * * Example usage: * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]", * buf1, buf2, buf3); * if (ret < 3) * // etc.. 
*/ case '[': { char *s = (char *)va_arg(args, char *); DECLARE_BITMAP(set, 256) = {0}; unsigned int len = 0; bool negate = (*fmt == '^'); /* field width is required */ if (field_width == -1) return num; if (negate) ++fmt; for ( ; *fmt && *fmt != ']'; ++fmt, ++len) __set_bit((u8)*fmt, set); /* no ']' or no character set found */ if (!*fmt || !len) return num; ++fmt; if (negate) { bitmap_complement(set, set, 256); /* exclude null '\0' byte */ __clear_bit(0, set); } /* match must be non-empty */ if (!test_bit((u8)*str, set)) return num; while (test_bit((u8)*str, set) && field_width--) *s++ = *str++; *s = '\0'; ++num; } continue; case 'o': base = 8; break; case 'x': case 'X': base = 16; break; case 'i': base = 0; fallthrough; case 'd': is_sign = true; fallthrough; case 'u': break; case '%': /* looking for '%' in str */ if (*str++ != '%') return num; continue; default: /* invalid format; stop here */ return num; } /* have some sort of integer conversion. * first, skip white space in buffer. */ str = skip_spaces(str); digit = *str; if (is_sign && digit == '-') { if (field_width == 1) break; digit = *(str + 1); } if (!digit || (base == 16 && !isxdigit(digit)) || (base == 10 && !isdigit(digit)) || (base == 8 && !isodigit(digit)) || (base == 0 && !isdigit(digit))) break; if (is_sign) val.s = simple_strntoll(str, &next, base, field_width >= 0 ? field_width : INT_MAX); else val.u = simple_strntoull(str, &next, base, field_width >= 0 ? 
field_width : INT_MAX); switch (qualifier) { case 'H': /* that's 'hh' in format */ if (is_sign) *va_arg(args, signed char *) = val.s; else *va_arg(args, unsigned char *) = val.u; break; case 'h': if (is_sign) *va_arg(args, short *) = val.s; else *va_arg(args, unsigned short *) = val.u; break; case 'l': if (is_sign) *va_arg(args, long *) = val.s; else *va_arg(args, unsigned long *) = val.u; break; case 'L': if (is_sign) *va_arg(args, long long *) = val.s; else *va_arg(args, unsigned long long *) = val.u; break; case 'z': *va_arg(args, size_t *) = val.u; break; default: if (is_sign) *va_arg(args, int *) = val.s; else *va_arg(args, unsigned int *) = val.u; break; } num++; if (!next) break; str = next; } return num; } EXPORT_SYMBOL(vsscanf); /** * sscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: formatting of buffer * @...: resulting arguments */ int sscanf(const char *buf, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsscanf(buf, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(sscanf);
35 35 498 498 498 498 498 498 498 551 548 22 551 498 551 15 9 11 11 11 11 9 5 6 5 498 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_est.c: simple rate estimator for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: Hans Schillstrom 
<hans.schillstrom@ericsson.com>
 *              Network name space (netns) aware.
 *              Global data moved to netns, i.e. struct netns_ipvs
 *              Affected data: est_list and est_lock.
 *              estimation_timer() runs with timer per netns.
 *              get_stats() does the per-cpu summing.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <linux/rcupdate_wait.h>

#include <net/ip_vs.h>

/*
  This code is to estimate rate in a shorter interval (such as 8
  seconds) for virtual services and real servers. To measure rate over a
  long interval, it is easy to implement a user level daemon which
  periodically reads those statistical counters and measures the rate.

  We measure rate during the last 8 seconds every 2 seconds:

    avgrate = avgrate*(1-W) + rate*W

    where W = 2^(-2)

  NOTES.

  * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.

  * Netlink users can see 64-bit values but sockopt users are restricted
    to 32-bit values for conns, packets, bps, cps and pps.

* A lot of code is taken from net/core/gen_estimator.c

  KEY POINTS:
  - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
  - kthreads read the cpustats to update the estimators (svcs, dests, total)
  - the states of estimators can be read (get stats) or modified (zero stats)
    from processes

  KTHREADS:
  - estimators are added initially to est_temp_list and later kthread 0
    distributes them to one or many kthreads for estimation
  - kthread contexts are created and attached to array
  - the kthread tasks are started when first service is added, before that
    the total stats are not estimated
  - when configuration (cpulist/nice) is changed, the tasks are restarted
    by work (est_reload_work)
  - kthread tasks are stopped while the cpulist is empty
  - the kthread context holds lists with estimators (chains) which are
    processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them to multiple chains which are estimated at different time
  - on start, kthread 0 enters calculation phase to determine the chain limits
    and the limit of estimators per kthread
  - est_add_ktid: ktid where to add new ests, can point to empty slot where
    we should add kt data
 */

static struct lock_class_key __ipvs_est_key;

static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);

/*
 * Update every estimator linked on @chain: sum the per-cpu counters of the
 * owning stats, publish the totals in s->kstats and fold the difference
 * since the previous run into the averaged rates. The walk is abandoned as
 * soon as the current kthread is asked to stop.
 */
static void ip_vs_chain_estimation(struct hlist_head *chain)
{
	struct ip_vs_estimator *e;
	struct ip_vs_cpu_stats *c;
	struct ip_vs_stats *s;
	u64 rate;

	hlist_for_each_entry_rcu(e, chain, list) {
		u64 conns, inpkts, outpkts, inbytes, outbytes;
		u64 kconns = 0, kinpkts = 0, koutpkts = 0;
		u64 kinbytes = 0, koutbytes = 0;
		unsigned int start;
		int i;

		if (kthread_should_stop())
			break;

		s = container_of(e, struct ip_vs_stats, est);
		for_each_possible_cpu(i) {
			c = per_cpu_ptr(s->cpustats, i);
			/* re-read the counters until the fetch is not retried */
			do {
				start = u64_stats_fetch_begin(&c->syncp);
				conns = u64_stats_read(&c->cnt.conns);
				inpkts = u64_stats_read(&c->cnt.inpkts);
				outpkts = u64_stats_read(&c->cnt.outpkts);
				inbytes = u64_stats_read(&c->cnt.inbytes);
				outbytes = u64_stats_read(&c->cnt.outbytes);
			} while (u64_stats_fetch_retry(&c->syncp, start));
			kconns += conns;
			kinpkts += inpkts;
			koutpkts += outpkts;
			kinbytes += inbytes;
			koutbytes += outbytes;
		}

		/* totals and rates are updated under the stats lock */
		spin_lock(&s->lock);

		s->kstats.conns = kconns;
		s->kstats.inpkts = kinpkts;
		s->kstats.outpkts = koutpkts;
		s->kstats.inbytes = kinbytes;
		s->kstats.outbytes = koutbytes;

		/*
		 * Each rate below moves a quarter of the way (>> 2) from its
		 * old value towards the newly measured one.
		 */
		/* scaled by 2^10, but divided 2 seconds */
		rate = (s->kstats.conns - e->last_conns) << 9;
		e->last_conns = s->kstats.conns;
		e->cps += ((s64)rate - (s64)e->cps) >> 2;

		rate = (s->kstats.inpkts - e->last_inpkts) << 9;
		e->last_inpkts = s->kstats.inpkts;
		e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

		rate = (s->kstats.outpkts - e->last_outpkts) << 9;
		e->last_outpkts = s->kstats.outpkts;
		e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

		/* scaled by 2^5, but divided 2 seconds */
		rate = (s->kstats.inbytes - e->last_inbytes) << 4;
		e->last_inbytes = s->kstats.inbytes;
		e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

		rate = (s->kstats.outbytes - e->last_outbytes) << 4;
		e->last_outbytes = s->kstats.outbytes;
		e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
		spin_unlock(&s->lock);
	}
}

/*
 * Run the estimation for all chains marked present in tick @row of @kd.
 * Done under rcu_read_lock(); ends early if the tick data is gone or the
 * kthread should stop.
 */
static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
	struct ip_vs_est_tick_data *td;
	int cid;

	rcu_read_lock();
	td = rcu_dereference(kd->ticks[row]);
	if (!td)
		goto out;
	for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
		if (kthread_should_stop())
			break;
		ip_vs_chain_estimation(&td->chains[cid]);
		cond_resched_rcu();
		/* fetch the tick data again after the reschedule point */
		td = rcu_dereference(kd->ticks[row]);
		if (!td)
			break;
	}

out:
	rcu_read_unlock();
}

/*
 * Main function of an estimation kthread; @data is its
 * struct ip_vs_est_kt_data. Kthread 0 runs the calculation phase first
 * when it is needed and drains est_temp_list on each iteration; other
 * kthreads return at once while est_chain_max is still 0. Afterwards one
 * tick row is processed per IPVS_EST_TICK until the kthread is stopped.
 */
static int ip_vs_estimation_kthread(void *data)
{
	struct ip_vs_est_kt_data *kd = data;
	struct netns_ipvs *ipvs = kd->ipvs;
	int row = kd->est_row;
	unsigned long now;
	int id = kd->id;
	long gap;

	if (id > 0) {
		if (!ipvs->est_chain_max)
			return 0;
	} else {
		if (!ipvs->est_chain_max) {
			ipvs->est_calc_phase = 1;
			/* commit est_calc_phase before reading est_genid */
			smp_mb();
		}

		/* kthread 0 will handle the calc phase */
		if (ipvs->est_calc_phase)
			ip_vs_est_calc_phase(ipvs);
	}

	while (1) {
		if (!id && !hlist_empty(&ipvs->est_temp_list))
			ip_vs_est_drain_temp_list(ipvs);
		/* the task state is set before the stop check */
		set_current_state(TASK_IDLE);
		if (kthread_should_stop())
			break;

		/* before estimation, check if we should sleep */
		now = jiffies;
		gap = kd->est_timer - now;
		if (gap > 0) {
			/* never sleep for more than one tick */
			if (gap > IPVS_EST_TICK) {
				kd->est_timer = now - IPVS_EST_TICK;
				gap = IPVS_EST_TICK;
			}
			schedule_timeout(gap);
		} else {
			__set_current_state(TASK_RUNNING);
			/* more than 8 ticks behind: restart the timer from now */
			if (gap < -8 * IPVS_EST_TICK)
				kd->est_timer = now;
		}

		if (kd->tick_len[row])
			ip_vs_tick_estimation(kd, row);

		/* advance to the next row, wrapping after IPVS_EST_NTICKS */
		row++;
		if (row >= IPVS_EST_NTICKS)
			row = 0;
		WRITE_ONCE(kd->est_row, row);
		kd->est_timer += IPVS_EST_TICK;
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
	/* Ignore reloads before first service is added */
	if (!ipvs->enable)
		return;
	ip_vs_est_stopped_recalc(ipvs);
	/* Bump the kthread configuration genid */
	atomic_inc(&ipvs->est_genid);
	queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}

/*
 * Start kthread task with current configuration.
 * Must be called with ipvs->est_mutex held. Returns 0 if the task was
 * started or is already running, otherwise the error from
 * kthread_create().
 */
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
			    struct ip_vs_est_kt_data *kd)
{
	unsigned long now;
	int ret = 0;
	long gap;

	lockdep_assert_held(&ipvs->est_mutex);

	if (kd->task)
		goto out;
	now = jiffies;
	gap = kd->est_timer - now;
	/* Sync est_timer if task is starting later */
	if (abs(gap) > 4 * IPVS_EST_TICK)
		kd->est_timer = now;
	kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
				  ipvs->gen, kd->id);
	if (IS_ERR(kd->task)) {
		ret = PTR_ERR(kd->task);
		kd->task = NULL;
		goto out;
	}

	/* apply the configured nice value and cpulist before the first run */
	set_user_nice(kd->task, sysctl_est_nice(ipvs));
	set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));

	pr_info("starting estimator thread %d...\n", kd->id);
	wake_up_process(kd->task);

out:
	return ret;
}

/* Stop the kthread task of @kd, if there is one, and forget it */
void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
{
	if (kd->task) {
		pr_info("stopping estimator thread %d...\n", kd->id);
		kthread_stop(kd->task);
		kd->task = NULL;
	}
}

/* Apply parameters to kthread */
static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
				 struct ip_vs_est_kt_data *kd)
{
	kd->chain_max = ipvs->est_chain_max;
	/* We are using single chain on RCU preemption */
	if (IPVS_EST_TICK_CHAINS == 1)
		kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
	/* the per-tick and per-kthread limits follow from the chain limit */
	kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
	kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
}

/*
 * Create and start estimation kthread in a free or new array slot.
 * Returns 0 on success, -EINVAL when the thread limit is reached, -ENOMEM
 * on allocation failure, or the error from starting the task.
 */
static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
{
	struct ip_vs_est_kt_data *kd = NULL;
	int id = ipvs->est_kt_count;
	int ret = -ENOMEM;
	void *arr = NULL;
	int i;

	if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
	    ipvs->enable && ipvs->est_max_threads)
		return -EINVAL;

	mutex_lock(&ipvs->est_mutex);

	/* look for an unused slot in the existing array */
	for (i = 0; i < id; i++) {
		if (!ipvs->est_kt_arr[i])
			break;
	}
	if (i >= id) {
		/* none free: grow the array by one entry */
		arr = krealloc_array(ipvs->est_kt_arr, id + 1,
				     sizeof(struct ip_vs_est_kt_data *),
				     GFP_KERNEL);
		if (!arr)
			goto out;
		ipvs->est_kt_arr = arr;
	} else {
		id = i;
	}

	kd = kzalloc(sizeof(*kd), GFP_KERNEL);
	if (!kd)
		goto out;
	kd->ipvs = ipvs;
	bitmap_fill(kd->avail, IPVS_EST_NTICKS);
	kd->est_timer = jiffies;
	kd->id = id;
	ip_vs_est_set_params(ipvs, kd);

	/* Pre-allocate stats used in calc phase */
	if (!id && !kd->calc_stats) {
		kd->calc_stats = ip_vs_stats_alloc();
		if (!kd->calc_stats)
			goto out;
	}

	/* Start kthread tasks only when services are present */
	if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
		ret = ip_vs_est_kthread_start(ipvs, kd);
		if (ret < 0)
			goto out;
	}

	/* the count only grows when a new slot was appended */
	if (arr)
		ipvs->est_kt_count++;
	ipvs->est_kt_arr[id] = kd;
	/* ownership moved to the array: nothing to free below */
	kd = NULL;
	/* Use most recent kthread for new ests */
	ipvs->est_add_ktid = id;
	ret = 0;

out:
	mutex_unlock(&ipvs->est_mutex);
	if (kd) {
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}

	return ret;
}

/* Select ktid where to add new ests: available, unused or new slot */
static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
{
	int ktid, best
= ipvs->est_kt_count;
	struct ip_vs_est_kt_data *kd;

	/* The first kthread with free capacity wins; otherwise remember the
	 * lowest empty slot. best stays at est_kt_count (a new slot) when
	 * neither exists.
	 */
	for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
		kd = ipvs->est_kt_arr[ktid];
		if (kd) {
			if (kd->est_count < kd->est_max_count) {
				best = ktid;
				break;
			}
		} else if (ktid < best) {
			best = ktid;
		}
	}
	ipvs->est_add_ktid = best;
}

/* Add estimator to current kthread (est_add_ktid)
 *
 * Uses the kthread context in slot est_add_ktid, creating one with
 * ip_vs_est_add_kthread() if that slot is beyond est_kt_count or empty.
 * On entry est->ktrow holds the wanted delay in ticks; on success est->ktid,
 * est->ktrow and est->ktcid are set to the chosen kthread, row and chain.
 *
 * Returns 0 on success, -ENOMEM if the tick data cannot be allocated, or
 * the error from ip_vs_est_add_kthread().
 */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
				   struct ip_vs_estimator *est)
{
	struct ip_vs_est_kt_data *kd = NULL;
	struct ip_vs_est_tick_data *td;
	int ktid, row, crow, cid, ret;
	int delay = est->ktrow;

	BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
			 "Too many chains for ktcid");

	if (ipvs->est_add_ktid < ipvs->est_kt_count) {
		kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
		if (kd)
			goto add_est;
	}

	ret = ip_vs_est_add_kthread(ipvs);
	if (ret < 0)
		goto out;
	/* On success ip_vs_est_add_kthread() stored the slot in est_add_ktid */
	kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
	ktid = kd->id;
	/* For small number of estimators prefer to use few ticks,
	 * otherwise try to add into the last estimated row.
	 * est_row and add_row point after the row we should use
	 */
	if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
		crow = READ_ONCE(kd->est_row);
	else
		crow = kd->add_row;
	crow += delay;
	if (crow >= IPVS_EST_NTICKS)
		crow -= IPVS_EST_NTICKS;
	/* Assume initial delay ? */
	if (delay >= IPVS_EST_NTICKS - 1) {
		/* Preserve initial delay or decrease it if no space in tick */
		row = crow;
		if (crow < IPVS_EST_NTICKS - 1) {
			crow++;
			row = find_last_bit(kd->avail, crow);
		}
		/* Nothing available below crow: take the last available row */
		if (row >= crow)
			row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
	} else {
		/* Preserve delay or increase it if no space in tick */
		row = IPVS_EST_NTICKS;
		if (crow > 0)
			row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
		/* Nothing available from crow on: wrap to the first one */
		if (row >= IPVS_EST_NTICKS)
			row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
	}

	/* Tick data is allocated lazily, when a row gets its first est */
	td = rcu_dereference_protected(kd->ticks[row], 1);
	if (!td) {
		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (!td) {
			ret = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(kd->ticks[row], td);
	}

	/* First chain of this tick that is not marked full */
	cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

	kd->est_count++;
	kd->tick_len[row]++;
	if (!td->chain_len[cid])
		__set_bit(cid, td->present);
	td->chain_len[cid]++;
	est->ktid = ktid;
	est->ktrow = row;
	est->ktcid = cid;
	hlist_add_head_rcu(&est->list, &td->chains[cid]);

	/* Mark the chain full, and the row unavailable once it hit tick_max */
	if (td->chain_len[cid] >= kd->chain_max) {
		__set_bit(cid, td->full);
		if (kd->tick_len[row] >= kd->tick_max)
			__clear_bit(row, kd->avail);
	}

	/* Update est_add_ktid to point to first available/empty kt slot */
	if (kd->est_count == kd->est_max_count)
		ip_vs_est_update_ktid(ipvs);

	ret = 0;

out:
	return ret;
}

/* Start estimation for stats
 *
 * Marks the estimator as not assigned to any kthread (ktid -1) with the
 * initial delay, makes sure the context for kthread 0 exists, and queues
 * the estimator on ipvs->est_temp_list.
 *
 * Returns 0, or a negative error from ip_vs_est_add_kthread(); in the error
 * case est->list is only initialized, so it stays unhashed.
 */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	int ret;

	if (!ipvs->est_max_threads && ipvs->enable)
		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

	est->ktid = -1;
	est->ktrow = IPVS_EST_NTICKS - 1;	/* Initial delay */

	/* We prefer this code to be short, kthread 0 will requeue the
	 * estimator to available chain. If tasks are disabled, we
	 * will not allocate much memory, just for kt 0.
	 */
	ret = 0;
	if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
		ret = ip_vs_est_add_kthread(ipvs);
	if (ret >= 0)
		hlist_add_head(&est->list, &ipvs->est_temp_list);
	else
		INIT_HLIST_NODE(&est->list);
	return ret;
}

/* Stop the task of @kd (if one exists) and free @kd together with its
 * calc_stats. A NULL @kd is ignored.
 */
static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
	if (kd) {
		if (kd->task) {
			pr_info("stop unused estimator thread %d...\n",
				kd->id);
			kthread_stop(kd->task);
		}
		ip_vs_stats_free(kd->calc_stats);
		kfree(kd);
	}
}

/* Unlink estimator from chain
 *
 * Removes the estimator from est_temp_list or from its kthread chain and
 * updates the chain, tick and kthread counters. When this was the last
 * estimator of a kthread with id > 0, that kthread context is destroyed,
 * i.e. the kd for est->ktid is freed before this function returns.
 */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *est = &stats->est;
	struct ip_vs_est_tick_data *td;
	struct ip_vs_est_kt_data *kd;
	int ktid = est->ktid;
	int row = est->ktrow;
	int cid = est->ktcid;

	/* Failed to add to chain ? */
	if (hlist_unhashed(&est->list))
		return;

	/* On return, estimator can be freed, dequeue it now */

	/* In est_temp_list ? */
	if (ktid < 0) {
		hlist_del(&est->list);
		goto end_kt0;
	}

	hlist_del_rcu(&est->list);
	kd = ipvs->est_kt_arr[ktid];
	td = rcu_dereference_protected(kd->ticks[row], 1);
	/* The chain and the row have room again */
	__clear_bit(cid, td->full);
	td->chain_len[cid]--;
	if (!td->chain_len[cid])
		__clear_bit(cid, td->present);
	kd->tick_len[row]--;
	__set_bit(row, kd->avail);
	if (!kd->tick_len[row]) {
		/* Row is empty: unpublish its tick data, free after RCU */
		RCU_INIT_POINTER(kd->ticks[row], NULL);
		kfree_rcu(td, rcu_head);
	}
	kd->est_count--;
	if (kd->est_count) {
		/* This kt slot can become available just now, prefer it */
		if (ktid < ipvs->est_add_ktid)
			ipvs->est_add_ktid = ktid;
		return;
	}

	if (ktid > 0) {
		mutex_lock(&ipvs->est_mutex);
		ip_vs_est_kthread_destroy(kd);
		ipvs->est_kt_arr[ktid] = NULL;
		if (ktid == ipvs->est_kt_count - 1) {
			/* Trim trailing empty slots, but always keep slot 0 */
			ipvs->est_kt_count--;
			while (ipvs->est_kt_count > 1 &&
			       !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
				ipvs->est_kt_count--;
		}
		mutex_unlock(&ipvs->est_mutex);

		/* This slot is now empty, prefer another available kt slot */
		if (ktid == ipvs->est_add_ktid)
			ip_vs_est_update_ktid(ipvs);
	}

end_kt0:
	/* kt 0 is freed after all other kthreads and chains are empty */
	if (ipvs->est_kt_count == 1 &&
hlist_empty(&ipvs->est_temp_list)) {
		kd = ipvs->est_kt_arr[0];
		if (!kd || !kd->est_count) {
			mutex_lock(&ipvs->est_mutex);
			if (kd) {
				ip_vs_est_kthread_destroy(kd);
				ipvs->est_kt_arr[0] = NULL;
			}
			/* Count drops from 1 to 0: no kthread slots remain */
			ipvs->est_kt_count--;
			mutex_unlock(&ipvs->est_mutex);
			ipvs->est_add_ktid = 0;
		}
	}
}

/* Register all ests from est_temp_list to kthreads
 *
 * Works in batches of at most 16 estimators per hold of __ip_vs_mutex and
 * calls cond_resched() between batches. Returns when the list is empty,
 * when kthread_should_stop() is true, or when an estimator cannot be
 * enqueued; in the last case it is put back at the head of est_temp_list.
 */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
	struct ip_vs_estimator *est;

	while (1) {
		int max = 16;

		mutex_lock(&__ip_vs_mutex);

		while (max-- > 0) {
			est = hlist_entry_safe(ipvs->est_temp_list.first,
					       struct ip_vs_estimator, list);
			if (est) {
				if (kthread_should_stop())
					goto unlock;
				hlist_del_init(&est->list);
				if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
					continue;
				/* Enqueue failed, return est to the temp list */
				est->ktid = -1;
				hlist_add_head(&est->list,
					       &ipvs->est_temp_list);
				/* Abort, some entries will not be estimated
				 * until next attempt
				 */
			}
			/* Reached when the list is empty or enqueue failed */
			goto unlock;
		}
		mutex_unlock(&__ip_vs_mutex);
		cond_resched();
	}

unlock:
	mutex_unlock(&__ip_vs_mutex);
}

/* Calculate limits for all kthreads
 *
 * Times ip_vs_chain_estimation() on a private chain that holds only the
 * calc_stats of kthread 0 and stores the resulting estimators-per-chain
 * limit in *chain_max.
 *
 * Returns 1 when *chain_max should be used, 0 when the measurement was
 * interrupted because ipvs->enable was cleared or the kthread should stop.
 */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
	struct ip_vs_est_kt_data *kd;
	struct hlist_head chain;
	struct ip_vs_stats *s;
	int cache_factor = 4;
	int i, loops, ntest;
	s32 min_est = 0;
	ktime_t t1, t2;
	int max = 8;
	int ret = 1;
	s64 diff;
	u64 val;

	INIT_HLIST_HEAD(&chain);
	mutex_lock(&__ip_vs_mutex);
	kd = ipvs->est_kt_arr[0];
	mutex_unlock(&__ip_vs_mutex);
	s = kd ?
kd->calc_stats : NULL; if (!s) goto out; hlist_add_head(&s->est.list, &chain); loops = 1; /* Get best result from many tests */ for (ntest = 0; ntest < 12; ntest++) { if (!(ntest & 3)) { /* Wait for cpufreq frequency transition */ wait_event_idle_timeout(wq, kthread_should_stop(), HZ / 50); if (!ipvs->enable || kthread_should_stop()) goto stop; } local_bh_disable(); rcu_read_lock(); /* Put stats in cache */ ip_vs_chain_estimation(&chain); t1 = ktime_get(); for (i = loops * cache_factor; i > 0; i--) ip_vs_chain_estimation(&chain); t2 = ktime_get(); rcu_read_unlock(); local_bh_enable(); if (!ipvs->enable || kthread_should_stop()) goto stop; cond_resched(); diff = ktime_to_ns(ktime_sub(t2, t1)); if (diff <= 1 * NSEC_PER_USEC) { /* Do more loops on low time resolution */ loops *= 2; continue; } if (diff >= NSEC_PER_SEC) continue; val = diff; do_div(val, loops); if (!min_est || val < min_est) { min_est = val; /* goal: 95usec per chain */ val = 95 * NSEC_PER_USEC; if (val >= min_est) { do_div(val, min_est); max = (int)val; } else { max = 1; } } } out: if (s) hlist_del_init(&s->est.list); *chain_max = max; return ret; stop: ret = 0; goto out; } /* Calculate the parameters and apply them in context of kt #0 * ECP: est_calc_phase * ECM: est_chain_max * ECP ECM Insert Chain enable Description * --------------------------------------------------------------------------- * 0 0 est_temp_list 0 create kt #0 context * 0 0 est_temp_list 0->1 service added, start kthread #0 task * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase * 1 0 est_temp_list 1 kt #0: determine est_chain_max, * stop tasks, move ests to est_temp_list * and free kd for kthreads 1..last * 1->0 0->N kt chains 1 ests can go to kthreads * 0 N kt chains 1 drain est_temp_list, create new kthread * contexts, start tasks, estimate */ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) { int genid = atomic_read(&ipvs->est_genid); struct ip_vs_est_tick_data *td; struct ip_vs_est_kt_data *kd; struct 
ip_vs_estimator *est; struct ip_vs_stats *stats; int id, row, cid, delay; bool last, last_td; int chain_max; int step; if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; mutex_lock(&__ip_vs_mutex); /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period */ mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ if (!ipvs->enable) goto unlock2; kd = ipvs->est_kt_arr[id]; if (!kd) continue; ip_vs_est_kthread_stop(kd); } mutex_unlock(&ipvs->est_mutex); /* Move all estimators to est_temp_list but carefully, * all estimators and kthread data can be released while * we reschedule. Even for kthread 0. */ step = 0; /* Order entries in est_temp_list in ascending delay, so now * walk delay(desc), id(desc), cid(asc) */ delay = IPVS_EST_NTICKS; next_delay: delay--; if (delay < 0) goto end_dequeue; last_kt: /* Destroy contexts backwards */ id = ipvs->est_kt_count; next_kt: if (!ipvs->enable || kthread_should_stop()) goto unlock; id--; if (id < 0) goto next_delay; kd = ipvs->est_kt_arr[id]; if (!kd) goto next_kt; /* kt 0 can exist with empty chains */ if (!id && kd->est_count <= 1) goto next_delay; row = kd->est_row + delay; if (row >= IPVS_EST_NTICKS) row -= IPVS_EST_NTICKS; td = rcu_dereference_protected(kd->ticks[row], 1); if (!td) goto next_kt; cid = 0; walk_chain: if (kthread_should_stop()) goto unlock; step++; if (!(step & 63)) { /* Give chance estimators to be added (to est_temp_list) * and deleted (releasing kthread contexts) */ mutex_unlock(&__ip_vs_mutex); cond_resched(); mutex_lock(&__ip_vs_mutex); /* Current kt released ? */ if (id >= ipvs->est_kt_count) goto last_kt; if (kd != ipvs->est_kt_arr[id]) goto next_kt; /* Current td released ? 
*/ if (td != rcu_dereference_protected(kd->ticks[row], 1)) goto next_kt; /* No fatal changes on the current kd and td */ } est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, list); if (!est) { cid++; if (cid >= IPVS_EST_TICK_CHAINS) goto next_kt; goto walk_chain; } /* We can cheat and increase est_count to protect kt 0 context * from release but we prefer to keep the last estimator */ last = kd->est_count <= 1; /* Do not free kt #0 data */ if (!id && last) goto next_delay; last_td = kd->tick_len[row] <= 1; stats = container_of(est, struct ip_vs_stats, est); ip_vs_stop_estimator(ipvs, stats); /* Tasks are stopped, move without RCU grace period */ est->ktid = -1; est->ktrow = row - kd->est_row; if (est->ktrow < 0) est->ktrow += IPVS_EST_NTICKS; hlist_add_head(&est->list, &ipvs->est_temp_list); /* kd freed ? */ if (last) goto next_kt; /* td freed ? */ if (last_td) goto next_kt; goto walk_chain; end_dequeue: /* All estimators removed while calculating ? */ if (!ipvs->est_kt_count) goto unlock; kd = ipvs->est_kt_arr[0]; if (!kd) goto unlock; kd->add_row = kd->est_row; ipvs->est_chain_max = chain_max; ip_vs_est_set_params(ipvs, kd); pr_info("using max %d ests per chain, %d per kthread\n", kd->chain_max, kd->est_max_count); /* Try to keep tot_stats in kt0, enqueue it early */ if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && ipvs->tot_stats->s.est.ktid == -1) { hlist_del(&ipvs->tot_stats->s.est.list); hlist_add_head(&ipvs->tot_stats->s.est.list, &ipvs->est_temp_list); } mutex_lock(&ipvs->est_mutex); /* We completed the calc phase, new calc phase not requested */ if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; unlock2: mutex_unlock(&ipvs->est_mutex); unlock: mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold the stats->lock lock */ est->last_inbytes = 
k->inbytes;
	est->last_outbytes = k->outbytes;
	est->last_conns = k->conns;
	est->last_inpkts = k->inpkts;
	est->last_outpkts = k->outpkts;
	/* Clear the computed rates */
	est->cps = 0;
	est->inpps = 0;
	est->outpps = 0;
	est->inbps = 0;
	est->outbps = 0;
}

/* Get decoded rates
 *
 * Converts the scaled rates kept in stats->est into plain values in @dst.
 * cps, inpps and outpps get 0x1FF added and are shifted right by 10;
 * inbps and outbps get 0xF added and are shifted right by 5.
 */
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
	struct ip_vs_estimator *e = &stats->est;

	dst->cps = (e->cps + 0x1FF) >> 10;
	dst->inpps = (e->inpps + 0x1FF) >> 10;
	dst->outpps = (e->outpps + 0x1FF) >> 10;
	dst->inbps = (e->inbps + 0xF) >> 5;
	dst->outbps = (e->outbps + 0xF) >> 5;
}

/* Per-netns init of the estimator state: empty temp list, no kthread array,
 * zeroed counters, limits and generation ids, and an initialized est_mutex.
 * Always returns 0.
 */
int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
	INIT_HLIST_HEAD(&ipvs->est_temp_list);
	ipvs->est_kt_arr = NULL;
	ipvs->est_max_threads = 0;
	ipvs->est_calc_phase = 0;
	ipvs->est_chain_max = 0;
	ipvs->est_kt_count = 0;
	ipvs->est_add_ktid = 0;
	atomic_set(&ipvs->est_genid, 0);
	atomic_set(&ipvs->est_genid_done, 0);
	__mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
	return 0;
}

/* Per-netns cleanup: destroy every kthread context still in the array
 * (NULL slots are ignored by the destroy helper), then free the array and
 * destroy est_mutex.
 */
void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
	int i;

	for (i = 0; i < ipvs->est_kt_count; i++)
		ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
	kfree(ipvs->est_kt_arr);
	mutex_destroy(&ipvs->est_mutex);
}
56 56 29 29 26 20 26 20 1 6 32 32 2 2 1 1 1 25 32 12 12 12 12 12 12 12 12 12 3 1 1 1 3 1 3 3 3 3 3 3 2 2 2 1 3 3 3 3 3 3 3 1 1 5 5 1 1 1 1 2 1 3 3 10 11 9 12 5 12 9 1 4 1 1 3 5 4 1 5 9 5 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 
478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 
978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 // SPDX-License-Identifier: GPL-2.0-or-later /* * PPP async serial channel driver for Linux. * * Copyright 1999 Paul Mackerras. * * This driver provides the encapsulation and framing for sending * and receiving PPP frames over async serial lines. It relies on * the generic PPP layer to give it frames to send and to process * received frames. It implements the PPP line discipline. * * Part of the code in this driver was inspired by the old async-only * PPP driver, written by Michael Callahan and Al Longyear, and * subsequently hacked by Paul Mackerras. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/tty.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/crc-ccitt.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/ppp_channel.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/unaligned.h> #include <linux/uaccess.h> #include <asm/string.h> #define PPP_VERSION "2.4.2" #define OBUFSIZE 4096 /* Structure for storing local state. 
*/ struct asyncppp { struct tty_struct *tty; unsigned int flags; unsigned int state; unsigned int rbits; int mru; spinlock_t xmit_lock; spinlock_t recv_lock; unsigned long xmit_flags; u32 xaccm[8]; u32 raccm; unsigned int bytes_sent; unsigned int bytes_rcvd; struct sk_buff *tpkt; int tpkt_pos; u16 tfcs; unsigned char *optr; unsigned char *olim; unsigned long last_xmit; struct sk_buff *rpkt; int lcp_fcs; struct sk_buff_head rqueue; struct tasklet_struct tsk; refcount_t refcnt; struct completion dead; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; /* Bit numbers in xmit_flags */ #define XMIT_WAKEUP 0 #define XMIT_FULL 1 #define XMIT_BUSY 2 /* State bits */ #define SC_TOSS 1 #define SC_ESCAPE 2 #define SC_PREV_ERROR 4 /* Bits in rbits */ #define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP) static int flag_time = HZ; module_param(flag_time, int, 0); MODULE_PARM_DESC(flag_time, "ppp_async: interval between flagged packets (in clock ticks)"); MODULE_DESCRIPTION("PPP async serial channel module"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_PPP); /* * Prototypes. */ static int ppp_async_encode(struct asyncppp *ap); static int ppp_async_send(struct ppp_channel *chan, struct sk_buff *skb); static int ppp_async_push(struct asyncppp *ap); static void ppp_async_flush_output(struct asyncppp *ap); static void ppp_async_input(struct asyncppp *ap, const unsigned char *buf, const u8 *flags, int count); static int ppp_async_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg); static void ppp_async_process(struct tasklet_struct *t); static void async_lcp_peek(struct asyncppp *ap, unsigned char *data, int len, int inbound); static const struct ppp_channel_ops async_ops = { .start_xmit = ppp_async_send, .ioctl = ppp_async_ioctl, }; /* * Routines implementing the PPP line discipline. 
*/ /* * We have a potential race on dereferencing tty->disc_data, * because the tty layer provides no locking at all - thus one * cpu could be running ppp_asynctty_receive while another * calls ppp_asynctty_close, which zeroes tty->disc_data and * frees the memory that ppp_asynctty_receive is using. The best * way to fix this is to use a rwlock in the tty struct, but for now * we use a single global rwlock for all ttys in ppp line discipline. * * FIXME: this is no longer true. The _close path for the ldisc is * now guaranteed to be sane. */ static DEFINE_RWLOCK(disc_data_lock); static struct asyncppp *ap_get(struct tty_struct *tty) { struct asyncppp *ap; read_lock(&disc_data_lock); ap = tty->disc_data; if (ap != NULL) refcount_inc(&ap->refcnt); read_unlock(&disc_data_lock); return ap; } static void ap_put(struct asyncppp *ap) { if (refcount_dec_and_test(&ap->refcnt)) complete(&ap->dead); } /* * Called when a tty is put into PPP line discipline. Called in process * context. */ static int ppp_asynctty_open(struct tty_struct *tty) { struct asyncppp *ap; int err; int speed; if (tty->ops->write == NULL) return -EOPNOTSUPP; err = -ENOMEM; ap = kzalloc(sizeof(*ap), GFP_KERNEL); if (!ap) goto out; /* initialize the asyncppp structure */ ap->tty = tty; ap->mru = PPP_MRU; spin_lock_init(&ap->xmit_lock); spin_lock_init(&ap->recv_lock); ap->xaccm[0] = ~0U; ap->xaccm[3] = 0x60000000U; ap->raccm = ~0U; ap->optr = ap->obuf; ap->olim = ap->obuf; ap->lcp_fcs = -1; skb_queue_head_init(&ap->rqueue); tasklet_setup(&ap->tsk, ppp_async_process); refcount_set(&ap->refcnt, 1); init_completion(&ap->dead); ap->chan.private = ap; ap->chan.ops = &async_ops; ap->chan.mtu = PPP_MRU; speed = tty_get_baud_rate(tty); ap->chan.speed = speed; err = ppp_register_channel(&ap->chan); if (err) goto out_free; tty->disc_data = ap; tty->receive_room = 65536; return 0; out_free: kfree(ap); out: return err; } /* * Called when the tty is put into another line discipline * or it hangs up. 
We have to wait for any cpu currently * executing in any of the other ppp_asynctty_* routines to * finish before we can call ppp_unregister_channel and free * the asyncppp struct. This routine must be called from * process context, not interrupt or softirq context. */ static void ppp_asynctty_close(struct tty_struct *tty) { struct asyncppp *ap; write_lock_irq(&disc_data_lock); ap = tty->disc_data; tty->disc_data = NULL; write_unlock_irq(&disc_data_lock); if (!ap) return; /* * We have now ensured that nobody can start using ap from now * on, but we have to wait for all existing users to finish. * Note that ppp_unregister_channel ensures that no calls to * our channel ops (i.e. ppp_async_send/ioctl) are in progress * by the time it returns. */ if (!refcount_dec_and_test(&ap->refcnt)) wait_for_completion(&ap->dead); tasklet_kill(&ap->tsk); ppp_unregister_channel(&ap->chan); kfree_skb(ap->rpkt); skb_queue_purge(&ap->rqueue); kfree_skb(ap->tpkt); kfree(ap); } /* * Called on tty hangup in process context. * * Wait for I/O to driver to complete and unregister PPP channel. * This is already done by the close routine, so just call that. */ static void ppp_asynctty_hangup(struct tty_struct *tty) { ppp_asynctty_close(tty); } /* * Read does nothing - no data is ever available this way. * Pppd reads and writes packets via /dev/ppp instead. */ static ssize_t ppp_asynctty_read(struct tty_struct *tty, struct file *file, u8 *buf, size_t count, void **cookie, unsigned long offset) { return -EAGAIN; } /* * Write on the tty does nothing, the packets all come in * from the ppp generic stuff. */ static ssize_t ppp_asynctty_write(struct tty_struct *tty, struct file *file, const u8 *buf, size_t count) { return -EAGAIN; } /* * Called in process context only. May be re-entered by multiple * ioctl calling threads. 
*/ static int ppp_asynctty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct asyncppp *ap = ap_get(tty); int err, val; int __user *p = (int __user *)arg; if (!ap) return -ENXIO; err = -EFAULT; switch (cmd) { case PPPIOCGCHAN: err = -EFAULT; if (put_user(ppp_channel_index(&ap->chan), p)) break; err = 0; break; case PPPIOCGUNIT: err = -EFAULT; if (put_user(ppp_unit_number(&ap->chan), p)) break; err = 0; break; case TCFLSH: /* flush our buffers and the serial port's buffer */ if (arg == TCIOFLUSH || arg == TCOFLUSH) ppp_async_flush_output(ap); err = n_tty_ioctl_helper(tty, cmd, arg); break; case FIONREAD: val = 0; if (put_user(val, p)) break; err = 0; break; default: /* Try the various mode ioctls */ err = tty_mode_ioctl(tty, cmd, arg); } ap_put(ap); return err; } /* May sleep, don't call from interrupt level or with interrupts disabled */ static void ppp_asynctty_receive(struct tty_struct *tty, const u8 *buf, const u8 *cflags, size_t count) { struct asyncppp *ap = ap_get(tty); unsigned long flags; if (!ap) return; spin_lock_irqsave(&ap->recv_lock, flags); ppp_async_input(ap, buf, cflags, count); spin_unlock_irqrestore(&ap->recv_lock, flags); if (!skb_queue_empty(&ap->rqueue)) tasklet_schedule(&ap->tsk); ap_put(ap); tty_unthrottle(tty); } static void ppp_asynctty_wakeup(struct tty_struct *tty) { struct asyncppp *ap = ap_get(tty); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); if (!ap) return; set_bit(XMIT_WAKEUP, &ap->xmit_flags); tasklet_schedule(&ap->tsk); ap_put(ap); } static struct tty_ldisc_ops ppp_ldisc = { .owner = THIS_MODULE, .num = N_PPP, .name = "ppp", .open = ppp_asynctty_open, .close = ppp_asynctty_close, .hangup = ppp_asynctty_hangup, .read = ppp_asynctty_read, .write = ppp_asynctty_write, .ioctl = ppp_asynctty_ioctl, .receive_buf = ppp_asynctty_receive, .write_wakeup = ppp_asynctty_wakeup, }; static int __init ppp_async_init(void) { int err; err = tty_register_ldisc(&ppp_ldisc); if (err != 0) printk(KERN_ERR "PPP_async: error %d 
registering line disc.\n", err); return err; } /* * The following routines provide the PPP channel interface. */ static int ppp_async_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) { struct asyncppp *ap = chan->private; void __user *argp = (void __user *)arg; int __user *p = argp; int err, val; u32 accm[8]; err = -EFAULT; switch (cmd) { case PPPIOCGFLAGS: val = ap->flags | ap->rbits; if (put_user(val, p)) break; err = 0; break; case PPPIOCSFLAGS: if (get_user(val, p)) break; ap->flags = val & ~SC_RCV_BITS; spin_lock_irq(&ap->recv_lock); ap->rbits = val & SC_RCV_BITS; spin_unlock_irq(&ap->recv_lock); err = 0; break; case PPPIOCGASYNCMAP: if (put_user(ap->xaccm[0], (u32 __user *)argp)) break; err = 0; break; case PPPIOCSASYNCMAP: if (get_user(ap->xaccm[0], (u32 __user *)argp)) break; err = 0; break; case PPPIOCGRASYNCMAP: if (put_user(ap->raccm, (u32 __user *)argp)) break; err = 0; break; case PPPIOCSRASYNCMAP: if (get_user(ap->raccm, (u32 __user *)argp)) break; err = 0; break; case PPPIOCGXASYNCMAP: if (copy_to_user(argp, ap->xaccm, sizeof(ap->xaccm))) break; err = 0; break; case PPPIOCSXASYNCMAP: if (copy_from_user(accm, argp, sizeof(accm))) break; accm[2] &= ~0x40000000U; /* can't escape 0x5e */ accm[3] |= 0x60000000U; /* must escape 0x7d, 0x7e */ memcpy(ap->xaccm, accm, sizeof(ap->xaccm)); err = 0; break; case PPPIOCGMRU: if (put_user(ap->mru, p)) break; err = 0; break; case PPPIOCSMRU: if (get_user(val, p)) break; if (val > U16_MAX) { err = -EINVAL; break; } if (val < PPP_MRU) val = PPP_MRU; ap->mru = val; err = 0; break; default: err = -ENOTTY; } return err; } /* * This is called at softirq level to deliver received packets * to the ppp_generic code, and to tell the ppp_generic code * if we can accept more output now. 
*/ static void ppp_async_process(struct tasklet_struct *t) { struct asyncppp *ap = from_tasklet(ap, t, tsk); struct sk_buff *skb; /* process received packets */ while ((skb = skb_dequeue(&ap->rqueue)) != NULL) { if (skb->cb[0]) ppp_input_error(&ap->chan, 0); ppp_input(&ap->chan, skb); } /* try to push more stuff out */ if (test_bit(XMIT_WAKEUP, &ap->xmit_flags) && ppp_async_push(ap)) ppp_output_wakeup(&ap->chan); } /* * Procedures for encapsulation and framing. */ /* * Procedure to encode the data for async serial transmission. * Does octet stuffing (escaping), puts the address/control bytes * on if A/C compression is disabled, and does protocol compression. * Assumes ap->tpkt != 0 on entry. * Returns 1 if we finished the current frame, 0 otherwise. */ #define PUT_BYTE(ap, buf, c, islcp) do { \ if ((islcp && c < 0x20) || (ap->xaccm[c >> 5] & (1 << (c & 0x1f)))) {\ *buf++ = PPP_ESCAPE; \ *buf++ = c ^ PPP_TRANS; \ } else \ *buf++ = c; \ } while (0) static int ppp_async_encode(struct asyncppp *ap) { int fcs, i, count, c, proto; unsigned char *buf, *buflim; unsigned char *data; int islcp; buf = ap->obuf; ap->olim = buf; ap->optr = buf; i = ap->tpkt_pos; data = ap->tpkt->data; count = ap->tpkt->len; fcs = ap->tfcs; proto = get_unaligned_be16(data); /* * LCP packets with code values between 1 (configure-request) * and 7 (code-reject) must be sent as though no options * had been negotiated. */ islcp = proto == PPP_LCP && count >= 3 && 1 <= data[2] && data[2] <= 7; if (i == 0) { if (islcp) async_lcp_peek(ap, data, count, 0); /* * Start of a new packet - insert the leading FLAG * character if necessary. 
*/ if (islcp || flag_time == 0 || time_after_eq(jiffies, ap->last_xmit + flag_time)) *buf++ = PPP_FLAG; ap->last_xmit = jiffies; fcs = PPP_INITFCS; /* * Put in the address/control bytes if necessary */ if ((ap->flags & SC_COMP_AC) == 0 || islcp) { PUT_BYTE(ap, buf, 0xff, islcp); fcs = PPP_FCS(fcs, 0xff); PUT_BYTE(ap, buf, 0x03, islcp); fcs = PPP_FCS(fcs, 0x03); } } /* * Once we put in the last byte, we need to put in the FCS * and closing flag, so make sure there is at least 7 bytes * of free space in the output buffer. */ buflim = ap->obuf + OBUFSIZE - 6; while (i < count && buf < buflim) { c = data[i++]; if (i == 1 && c == 0 && (ap->flags & SC_COMP_PROT)) continue; /* compress protocol field */ fcs = PPP_FCS(fcs, c); PUT_BYTE(ap, buf, c, islcp); } if (i < count) { /* * Remember where we are up to in this packet. */ ap->olim = buf; ap->tpkt_pos = i; ap->tfcs = fcs; return 0; } /* * We have finished the packet. Add the FCS and flag. */ fcs = ~fcs; c = fcs & 0xff; PUT_BYTE(ap, buf, c, islcp); c = (fcs >> 8) & 0xff; PUT_BYTE(ap, buf, c, islcp); *buf++ = PPP_FLAG; ap->olim = buf; consume_skb(ap->tpkt); ap->tpkt = NULL; return 1; } /* * Transmit-side routines. */ /* * Send a packet to the peer over an async tty line. * Returns 1 iff the packet was accepted. * If the packet was not accepted, we will call ppp_output_wakeup * at some later time. */ static int ppp_async_send(struct ppp_channel *chan, struct sk_buff *skb) { struct asyncppp *ap = chan->private; ppp_async_push(ap); if (test_and_set_bit(XMIT_FULL, &ap->xmit_flags)) return 0; /* already full */ ap->tpkt = skb; ap->tpkt_pos = 0; ppp_async_push(ap); return 1; } /* * Push as much data as possible out to the tty. */ static int ppp_async_push(struct asyncppp *ap) { int avail, sent, done = 0; struct tty_struct *tty = ap->tty; int tty_stuffed = 0; /* * We can get called recursively here if the tty write * function calls our wakeup function. 
This can happen * for example on a pty with both the master and slave * set to PPP line discipline. * We use the XMIT_BUSY bit to detect this and get out, * leaving the XMIT_WAKEUP bit set to tell the other * instance that it may now be able to write more now. */ if (test_and_set_bit(XMIT_BUSY, &ap->xmit_flags)) return 0; spin_lock_bh(&ap->xmit_lock); for (;;) { if (test_and_clear_bit(XMIT_WAKEUP, &ap->xmit_flags)) tty_stuffed = 0; if (!tty_stuffed && ap->optr < ap->olim) { avail = ap->olim - ap->optr; set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); sent = tty->ops->write(tty, ap->optr, avail); if (sent < 0) goto flush; /* error, e.g. loss of CD */ ap->optr += sent; if (sent < avail) tty_stuffed = 1; continue; } if (ap->optr >= ap->olim && ap->tpkt) { if (ppp_async_encode(ap)) { /* finished processing ap->tpkt */ clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } continue; } /* * We haven't made any progress this time around. * Clear XMIT_BUSY to let other callers in, but * after doing so we have to check if anyone set * XMIT_WAKEUP since we last checked it. If they * did, we should try again to set XMIT_BUSY and go * around again in case XMIT_BUSY was still set when * the other caller tried. */ clear_bit(XMIT_BUSY, &ap->xmit_flags); /* any more work to do? if not, exit the loop */ if (!(test_bit(XMIT_WAKEUP, &ap->xmit_flags) || (!tty_stuffed && ap->tpkt))) break; /* more work to do, see if we can do it now */ if (test_and_set_bit(XMIT_BUSY, &ap->xmit_flags)) break; } spin_unlock_bh(&ap->xmit_lock); return done; flush: clear_bit(XMIT_BUSY, &ap->xmit_flags); if (ap->tpkt) { kfree_skb(ap->tpkt); ap->tpkt = NULL; clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } ap->optr = ap->olim; spin_unlock_bh(&ap->xmit_lock); return done; } /* * Flush output from our internal buffers. * Called for the TCFLSH ioctl. Can be entered in parallel * but this is covered by the xmit_lock. 
*/ static void ppp_async_flush_output(struct asyncppp *ap) { int done = 0; spin_lock_bh(&ap->xmit_lock); ap->optr = ap->olim; if (ap->tpkt != NULL) { kfree_skb(ap->tpkt); ap->tpkt = NULL; clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } spin_unlock_bh(&ap->xmit_lock); if (done) ppp_output_wakeup(&ap->chan); } /* * Receive-side routines. */ /* see how many ordinary chars there are at the start of buf */ static inline int scan_ordinary(struct asyncppp *ap, const unsigned char *buf, int count) { int i, c; for (i = 0; i < count; ++i) { c = buf[i]; if (c == PPP_ESCAPE || c == PPP_FLAG || (c < 0x20 && (ap->raccm & (1 << c)) != 0)) break; } return i; } /* called when a flag is seen - do end-of-packet processing */ static void process_input_packet(struct asyncppp *ap) { struct sk_buff *skb; unsigned char *p; unsigned int len, fcs; skb = ap->rpkt; if (ap->state & (SC_TOSS | SC_ESCAPE)) goto err; if (skb == NULL) return; /* 0-length packet */ /* check the FCS */ p = skb->data; len = skb->len; if (len < 3) goto err; /* too short */ fcs = PPP_INITFCS; for (; len > 0; --len) fcs = PPP_FCS(fcs, *p++); if (fcs != PPP_GOODFCS) goto err; /* bad FCS */ skb_trim(skb, skb->len - 2); /* check for address/control and protocol compression */ p = skb->data; if (p[0] == PPP_ALLSTATIONS) { /* chop off address/control */ if (p[1] != PPP_UI || skb->len < 3) goto err; p = skb_pull(skb, 2); } /* If protocol field is not compressed, it can be LCP packet */ if (!(p[0] & 0x01)) { unsigned int proto; if (skb->len < 2) goto err; proto = (p[0] << 8) + p[1]; if (proto == PPP_LCP) async_lcp_peek(ap, p, skb->len, 1); } /* queue the frame to be processed */ skb->cb[0] = ap->state; skb_queue_tail(&ap->rqueue, skb); ap->rpkt = NULL; ap->state = 0; return; err: /* frame had an error, remember that, reset SC_TOSS & SC_ESCAPE */ ap->state = SC_PREV_ERROR; if (skb) { /* make skb appear as freshly allocated */ skb_trim(skb, 0); skb_reserve(skb, - skb_headroom(skb)); } } /* Called when the tty driver has data 
for us. Runs parallel with the other ldisc functions but will not be re-entered */ static void ppp_async_input(struct asyncppp *ap, const u8 *buf, const u8 *flags, int count) { struct sk_buff *skb; int c, i, j, n, s, f; unsigned char *sp; /* update bits used for 8-bit cleanness detection */ if (~ap->rbits & SC_RCV_BITS) { s = 0; for (i = 0; i < count; ++i) { c = buf[i]; if (flags && flags[i] != 0) continue; s |= (c & 0x80)? SC_RCV_B7_1: SC_RCV_B7_0; c = ((c >> 4) ^ c) & 0xf; s |= (0x6996 & (1 << c))? SC_RCV_ODDP: SC_RCV_EVNP; } ap->rbits |= s; } while (count > 0) { /* scan through and see how many chars we can do in bulk */ if ((ap->state & SC_ESCAPE) && buf[0] == PPP_ESCAPE) n = 1; else n = scan_ordinary(ap, buf, count); f = 0; if (flags && (ap->state & SC_TOSS) == 0) { /* check the flags to see if any char had an error */ for (j = 0; j < n; ++j) if ((f = flags[j]) != 0) break; } if (f != 0) { /* start tossing */ ap->state |= SC_TOSS; } else if (n > 0 && (ap->state & SC_TOSS) == 0) { /* stuff the chars in the skb */ skb = ap->rpkt; if (!skb) { skb = dev_alloc_skb(ap->mru + PPP_HDRLEN + 2); if (!skb) goto nomem; ap->rpkt = skb; } if (skb->len == 0) { /* Try to get the payload 4-byte aligned. * This should match the * PPP_ALLSTATIONS/PPP_UI/compressed tests in * process_input_packet, but we do not have * enough chars here to test buf[1] and buf[2]. 
*/ if (buf[0] != PPP_ALLSTATIONS) skb_reserve(skb, 2 + (buf[0] & 1)); } if (n > skb_tailroom(skb)) { /* packet overflowed MRU */ ap->state |= SC_TOSS; } else { sp = skb_put_data(skb, buf, n); if (ap->state & SC_ESCAPE) { sp[0] ^= PPP_TRANS; ap->state &= ~SC_ESCAPE; } } } if (n >= count) break; c = buf[n]; if (flags != NULL && flags[n] != 0) { ap->state |= SC_TOSS; } else if (c == PPP_FLAG) { process_input_packet(ap); } else if (c == PPP_ESCAPE) { ap->state |= SC_ESCAPE; } else if (I_IXON(ap->tty)) { if (c == START_CHAR(ap->tty)) start_tty(ap->tty); else if (c == STOP_CHAR(ap->tty)) stop_tty(ap->tty); } /* otherwise it's a char in the recv ACCM */ ++n; buf += n; if (flags) flags += n; count -= n; } return; nomem: printk(KERN_ERR "PPPasync: no memory (input pkt)\n"); ap->state |= SC_TOSS; } /* * We look at LCP frames going past so that we can notice * and react to the LCP configure-ack from the peer. * In the situation where the peer has been sent a configure-ack * already, LCP is up once it has sent its configure-ack * so the immediately following packet can be sent with the * configured LCP options. This allows us to process the following * packet correctly without pppd needing to respond quickly. * * We only respond to the received configure-ack if we have just * sent a configure-request, and the configure-ack contains the * same data (this is checked using a 16-bit crc of the data). */ #define CONFREQ 1 /* LCP code field values */ #define CONFACK 2 #define LCP_MRU 1 /* LCP option numbers */ #define LCP_ASYNCMAP 2 static void async_lcp_peek(struct asyncppp *ap, unsigned char *data, int len, int inbound) { int dlen, fcs, i, code; u32 val; data += 2; /* skip protocol bytes */ len -= 2; if (len < 4) /* 4 = code, ID, length */ return; code = data[0]; if (code != CONFACK && code != CONFREQ) return; dlen = get_unaligned_be16(data + 2); if (len < dlen) return; /* packet got truncated or length is bogus */ if (code == (inbound? 
CONFACK: CONFREQ)) { /* * sent confreq or received confack: * calculate the crc of the data from the ID field on. */ fcs = PPP_INITFCS; for (i = 1; i < dlen; ++i) fcs = PPP_FCS(fcs, data[i]); if (!inbound) { /* outbound confreq - remember the crc for later */ ap->lcp_fcs = fcs; return; } /* received confack, check the crc */ fcs ^= ap->lcp_fcs; ap->lcp_fcs = -1; if (fcs != 0) return; } else if (inbound) return; /* not interested in received confreq */ /* process the options in the confack */ data += 4; dlen -= 4; /* data[0] is code, data[1] is length */ while (dlen >= 2 && dlen >= data[1] && data[1] >= 2) { switch (data[0]) { case LCP_MRU: val = get_unaligned_be16(data + 2); if (inbound) ap->mru = val; else ap->chan.mtu = val; break; case LCP_ASYNCMAP: val = get_unaligned_be32(data + 2); if (inbound) ap->raccm = val; else ap->xaccm[0] = val; break; } dlen -= data[1]; data += data[1]; } } static void __exit ppp_async_cleanup(void) { tty_unregister_ldisc(&ppp_ldisc); } module_init(ppp_async_init); module_exit(ppp_async_cleanup);
498 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pkt_sched.h> #include <net/caif/caif_layer.h> #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> #define SRVL_CTRL_PKT_SIZE 1 #define SRVL_FLOW_OFF 0x81 #define SRVL_FLOW_ON 0x80 #define SRVL_SET_PIN 0x82 #define container_obj(layr) container_of(layr, struct cfsrvl, layer) static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfsrvl *service = container_obj(layr); if (layr->up == NULL || layr->up->ctrlcmd == NULL) return; switch (ctrl) { case CAIF_CTRLCMD_INIT_RSP: service->open = true; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case CAIF_CTRLCMD_DEINIT_RSP: case CAIF_CTRLCMD_INIT_FAIL_RSP: service->open = false; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: if (phyid != service->dev_info.id) break; if (service->modem_flow_on) layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); 
service->phy_flow_on = false; break; case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: if (phyid != service->dev_info.id) return; if (service->modem_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->phy_flow_on = true; break; case CAIF_CTRLCMD_FLOW_OFF_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); } service->modem_flow_on = false; break; case CAIF_CTRLCMD_FLOW_ON_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->modem_flow_on = true; break; case _CAIF_CTRLCMD_PHYIF_DOWN_IND: /* In case interface is down, let's fake a remove shutdown */ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: layr->up->ctrlcmd(layr->up, ctrl, phyid); break; default: pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl); /* We have both modem and phy flow on, send flow on */ layr->up->ctrlcmd(layr->up, ctrl, phyid); service->phy_flow_on = true; break; } } static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) { struct cfsrvl *service = container_obj(layr); caif_assert(layr != NULL); caif_assert(layr->dn != NULL); caif_assert(layr->dn->transmit != NULL); if (!service->supports_flowctrl) return 0; switch (ctrl) { case CAIF_MODEMCMD_FLOW_ON_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_on = SRVL_FLOW_ON; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } case CAIF_MODEMCMD_FLOW_OFF_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_off = SRVL_FLOW_OFF; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if 
(cfpkt_add_head(pkt, &flow_off, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } default: break; } return -EINVAL; } static void cfsrvl_release(struct cflayer *layer) { struct cfsrvl *service = container_of(layer, struct cfsrvl, layer); kfree(service); } void cfsrvl_init(struct cfsrvl *service, u8 channel_id, struct dev_info *dev_info, bool supports_flowctrl) { caif_assert(offsetof(struct cfsrvl, layer) == 0); service->open = false; service->modem_flow_on = true; service->phy_flow_on = true; service->layer.id = channel_id; service->layer.ctrlcmd = cfservl_ctrlcmd; service->layer.modemcmd = cfservl_modemcmd; service->dev_info = *dev_info; service->supports_flowctrl = supports_flowctrl; service->release = cfsrvl_release; } bool cfsrvl_ready(struct cfsrvl *service, int *err) { if (!service->open) { *err = -ENOTCONN; return false; } return true; } bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id == phyid; } void caif_free_client(struct cflayer *adap_layer) { struct cfsrvl *servl; if (adap_layer == NULL || adap_layer->dn == NULL) return; servl = container_obj(adap_layer->dn); servl->release(&servl->layer); } EXPORT_SYMBOL(caif_free_client); void caif_client_register_refcnt(struct cflayer *adapt_layer, void (*hold)(struct cflayer *lyr), void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL)) return; service = container_of(adapt_layer->dn, struct cfsrvl, layer); service->hold = hold; service->put = put; } EXPORT_SYMBOL(caif_client_register_refcnt);
77 5 77 77 387 181 44 549 549 384 138 142 548 328 329 328 4 8 12 4 2 1 1 162 189 1 1 36 246 1 245 246 1 245 8 190 246 1 141 104 245 1 111 137 563 558 7 5 2 17 10 16 3 54 30 30 77 77 296 202 215 21 60 29 36 35 2 37 121 1 110 1 76 35 51 57 393 33 330 45 316 329 64 303 40 230 11 262 37 273 330 1 329 2 331 330 1 1 17 2 15 7 6 346 36 295 1 99 134 134 21 269 269 269 63 48 99 99 5 94 68 29 345 344 210 135 346 71 38 5 48 63 823 239 726 108 107 108 100 10 108 108 108 48 63 107 48 63 108 133 51 108 108 181 181 89 111 181 181 156 156 142 63 63 156 111 181 181 48 51 111 2 331 2 329 328 331 99 79 24 269 332 5 2 331 25 22 5 315 21 21 25 257 12 78 4 74 314 4 18 18 18 158 159 159 1 1 154 2 38 1 37 12 1 10 1 3 1 1 1 172 173 2 2 20 2 19 17 16 1 1 1 2 3 1 1 2 1 1 2 1 1 4 2 2 1014 258 1267 1269 1150 1 4 2 173 137 2 1 1 1 1 2 2 19 4 8 2 17 20 768 11 27 197 547 1 225 345 346 258 211 47 331 331 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 
306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 
806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 
1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 
1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 
2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 
2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 
2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 
3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 
3644 3645 3646 3647 3648 3649 3650 3651 3652 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles * or rs-channels. It also implements echoing, cooked mode etc. * * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. * * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the * tty_struct and tty_queue structures. Previously there was an array * of 256 tty_struct's which was statically allocated, and the * tty_queue structures were allocated at boot time. Both are now * dynamically allocated only when the tty is open. * * Also restructured routines so that there is more of a separation * between the high-level tty routines (tty_io.c and tty_ioctl.c) and * the low-level tty routines (serial.c, pty.c, console.c). This * makes for cleaner and more compact code. -TYT, 9/17/92 * * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines * which can be dynamically activated and de-activated by the line * discipline handling modules (like SLIP). * * NOTE: pay no attention to the line discipline code (yet); its * interface is still subject to change in this version... * -- TYT, 1/31/92 * * Added functionality to the OPOST tty handling. No delays, but all * other bits should be there. * -- Nick Holloway <alfie@dcs.warwick.ac.uk>, 27th May 1993. * * Rewrote canonical mode and added more termios flags. * -- julian@uhunix.uhcc.hawaii.edu (J. Cowley), 13Jan94 * * Reorganized FASYNC support so mouse code can share it. * -- ctm@ardi.com, 9Sep95 * * New TIOCLINUX variants added. * -- mj@k332.feld.cvut.cz, 19-Nov-95 * * Restrict vt switching via ioctl() * -- grif@cs.ucr.edu, 5-Dec-95 * * Move console and virtual terminal code to more appropriate files, * implement CONFIG_VT and generalize console device interface. * -- Marko Kohtala <Marko.Kohtala@hut.fi>, March 97 * * Rewrote tty_init_dev and tty_release_dev to eliminate races. 
* -- Bill Hawes <whawes@star.net>, June 97
 *
 * Added devfs support.
 * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 13-Jan-1998
 *
 * Added support for a Unix98-style ptmx device.
 * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998
 *
 * Reduced memory usage for older ARM systems
 * -- Russell King <rmk@arm.linux.org.uk>
 *
 * Move do_SAK() into process context. Less stack use in devfs functions.
 * alloc_tty_struct() always uses kmalloc()
 * -- Andrew Morton <andrewm@uow.edu.eu> 17Mar01
 */

#include <linux/types.h>
#include <linux/major.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/fcntl.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/interrupt.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/tty_flip.h>
#include <linux/devpts_fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/console.h>
#include <linux/timer.h>
#include <linux/ctype.h>
#include <linux/kd.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/ppp-ioctl.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/seq_file.h>
#include <linux/serial.h>
#include <linux/ratelimit.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/termios_internal.h>
#include <linux/fs.h>

#include <linux/kbd_kern.h>
#include <linux/vt_kern.h>
#include <linux/selection.h>

#include <linux/kmod.h>
#include <linux/nsproxy.h>
#include "tty.h"

/*
 * Hangup tracing. TTY_DEBUG_HANGUP is explicitly undefined here, so
 * tty_debug_hangup() expands to an empty statement; defining it instead
 * forwards the arguments to tty_debug().
 */
#undef TTY_DEBUG_HANGUP
#ifdef TTY_DEBUG_HANGUP
# define tty_debug_hangup(tty, f, args...)	tty_debug(tty, f, ##args)
#else
# define tty_debug_hangup(tty, f, args...)	do { } while (0)
#endif

/*
 * Compile-time switches for the sanity checks in this file:
 * TTY_PARANOIA_CHECK enables the NULL-tty warning in tty_paranoia_check(),
 * CHECK_TTY_COUNT enables the open-count audit in check_tty_count().
 */
#define TTY_PARANOIA_CHECK 1
#define CHECK_TTY_COUNT 1

/* Default termios settings, exported for use by tty drivers. */
struct ktermios tty_std_termios = {	/* for the benefit of tty drivers  */
	.c_iflag = ICRNL | IXON,
	.c_oflag = OPOST | ONLCR,
	.c_cflag = B38400 | CS8 | CREAD | HUPCL,
	.c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK |
		   ECHOCTL | ECHOKE | IEXTEN,
	.c_cc = INIT_C_CC,
	.c_ispeed = 38400,
	.c_ospeed = 38400,
	/* .c_line = N_TTY, */
};
EXPORT_SYMBOL(tty_std_termios);

/* This list gets poked at by procfs and various bits of boot up code. This
 * could do with some rationalisation such as pulling the tty proc function
 * into this file.
 */

LIST_HEAD(tty_drivers);			/* linked list of tty drivers */

/* Mutex to protect creating and releasing a tty */
DEFINE_MUTEX(tty_mutex);

/*
 * Forward declarations for handlers that are referenced (for example by the
 * file_operations tables) before their definitions appear.
 */
static ssize_t tty_read(struct kiocb *, struct iov_iter *);
static ssize_t tty_write(struct kiocb *, struct iov_iter *);
static __poll_t tty_poll(struct file *, poll_table *);
static int tty_open(struct inode *, struct file *);
/* Without CONFIG_COMPAT the compat ioctl slot is simply left NULL. */
#ifdef CONFIG_COMPAT
static long tty_compat_ioctl(struct file *file, unsigned int cmd,
				unsigned long arg);
#else
#define tty_compat_ioctl NULL
#endif
static int __tty_fasync(int fd, struct file *filp, int on);
static int tty_fasync(int fd, struct file *filp, int on);
static void release_tty(struct tty_struct *tty, int idx);

/**
 * free_tty_struct - free a disused tty
 * @tty: tty struct to free
 *
 * Free the write buffers, tty queue and tty memory itself.
 *
 * Locking: none.
Must be called after tty is definitely unused */ static void free_tty_struct(struct tty_struct *tty) { tty_ldisc_deinit(tty); put_device(tty->dev); kvfree(tty->write_buf); kfree(tty); } static inline struct tty_struct *file_tty(struct file *file) { return ((struct tty_file_private *)file->private_data)->tty; } int tty_alloc_file(struct file *file) { struct tty_file_private *priv; priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; file->private_data = priv; return 0; } /* Associate a new file with the tty structure */ void tty_add_file(struct tty_struct *tty, struct file *file) { struct tty_file_private *priv = file->private_data; priv->tty = tty; priv->file = file; spin_lock(&tty->files_lock); list_add(&priv->list, &tty->tty_files); spin_unlock(&tty->files_lock); } /** * tty_free_file - free file->private_data * @file: to free private_data of * * This shall be used only for fail path handling when tty_add_file was not * called yet. */ void tty_free_file(struct file *file) { struct tty_file_private *priv = file->private_data; file->private_data = NULL; kfree(priv); } /* Delete file from its tty */ static void tty_del_file(struct file *file) { struct tty_file_private *priv = file->private_data; struct tty_struct *tty = priv->tty; spin_lock(&tty->files_lock); list_del(&priv->list); spin_unlock(&tty->files_lock); tty_free_file(file); } /** * tty_name - return tty naming * @tty: tty structure * * Convert a tty structure into a name. The name reflects the kernel naming * policy and if udev is in use may not reflect user space * * Locking: none */ const char *tty_name(const struct tty_struct *tty) { if (!tty) /* Hmm. NULL pointer. That's fun. 
*/ return "NULL tty"; return tty->name; } EXPORT_SYMBOL(tty_name); const char *tty_driver_name(const struct tty_struct *tty) { if (!tty || !tty->driver) return ""; return tty->driver->name; } static int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine) { #ifdef TTY_PARANOIA_CHECK if (!tty) { pr_warn("(%d:%d): %s: NULL tty\n", imajor(inode), iminor(inode), routine); return 1; } #endif return 0; } /* Caller must hold tty_lock */ static void check_tty_count(struct tty_struct *tty, const char *routine) { #ifdef CHECK_TTY_COUNT struct list_head *p; int count = 0, kopen_count = 0; scoped_guard(spinlock, &tty->files_lock) list_for_each(p, &tty->tty_files) count++; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) count++; if (tty_port_kopened(tty->port)) kopen_count++; if (tty->count != (count + kopen_count)) { tty_warn(tty, "%s: tty->count(%d) != (#fd's(%d) + #kopen's(%d))\n", routine, tty->count, count, kopen_count); } #endif } /** * get_tty_driver - find device of a tty * @device: device identifier * @index: returns the index of the tty * * This routine returns a tty driver structure, given a device number and also * passes back the index number. * * Locking: caller must hold tty_mutex */ static struct tty_driver *get_tty_driver(dev_t device, int *index) { struct tty_driver *p; list_for_each_entry(p, &tty_drivers, tty_drivers) { dev_t base = MKDEV(p->major, p->minor_start); if (device < base || device >= base + p->num) continue; *index = device - base; return tty_driver_kref_get(p); } return NULL; } /** * tty_dev_name_to_number - return dev_t for device name * @name: user space name of device under /dev * @number: pointer to dev_t that this function will populate * * This function converts device names like ttyS0 or ttyUSB1 into dev_t like * (4, 64) or (188, 1). If no corresponding driver is registered then the * function returns -%ENODEV. 
* * Locking: this acquires tty_mutex to protect the tty_drivers list from * being modified while we are traversing it, and makes sure to * release it before exiting. */ int tty_dev_name_to_number(const char *name, dev_t *number) { struct tty_driver *p; int ret; int index, prefix_length = 0; const char *str; for (str = name; *str && !isdigit(*str); str++) ; if (!*str) return -EINVAL; ret = kstrtoint(str, 10, &index); if (ret) return ret; prefix_length = str - name; guard(mutex)(&tty_mutex); list_for_each_entry(p, &tty_drivers, tty_drivers) if (prefix_length == strlen(p->name) && strncmp(name, p->name, prefix_length) == 0) { if (index < p->num) { *number = MKDEV(p->major, p->minor_start + index); return 0; } } return -ENODEV; } EXPORT_SYMBOL_GPL(tty_dev_name_to_number); #ifdef CONFIG_CONSOLE_POLL /** * tty_find_polling_driver - find device of a polled tty * @name: name string to match * @line: pointer to resulting tty line nr * * This routine returns a tty driver structure, given a name and the condition * that the tty driver is capable of polled operation. 
*/ struct tty_driver *tty_find_polling_driver(char *name, int *line) { struct tty_driver *p; int tty_line = 0; int len; char *str, *stp; for (str = name; *str; str++) if ((*str >= '0' && *str <= '9') || *str == ',') break; if (!*str) return NULL; len = str - name; tty_line = simple_strtoul(str, &str, 10); guard(mutex)(&tty_mutex); /* Search through the tty devices to look for a match */ list_for_each_entry(p, &tty_drivers, tty_drivers) { if (!len || strncmp(name, p->name, len) != 0) continue; stp = str; if (*stp == ',') stp++; if (*stp == '\0') stp = NULL; if (tty_line >= 0 && tty_line < p->num && p->ops && p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) { *line = tty_line; return tty_driver_kref_get(p); } } return NULL; } EXPORT_SYMBOL_GPL(tty_find_polling_driver); #endif static ssize_t hung_up_tty_read(struct kiocb *iocb, struct iov_iter *to) { return 0; } static ssize_t hung_up_tty_write(struct kiocb *iocb, struct iov_iter *from) { return -EIO; } /* No kernel lock held - none needed ;) */ static __poll_t hung_up_tty_poll(struct file *filp, poll_table *wait) { return EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | EPOLLWRNORM; } static long hung_up_tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static long hung_up_tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? 
-ENOTTY : -EIO; } static int hung_up_tty_fasync(int fd, struct file *file, int on) { return -ENOTTY; } static void tty_show_fdinfo(struct seq_file *m, struct file *file) { struct tty_struct *tty = file_tty(file); if (tty && tty->ops && tty->ops->show_fdinfo) tty->ops->show_fdinfo(tty, m); } static const struct file_operations tty_fops = { .read_iter = tty_read, .write_iter = tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, .show_fdinfo = tty_show_fdinfo, }; static const struct file_operations console_fops = { .read_iter = tty_read, .write_iter = redirected_tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, }; static const struct file_operations hung_up_tty_fops = { .read_iter = hung_up_tty_read, .write_iter = hung_up_tty_write, .poll = hung_up_tty_poll, .unlocked_ioctl = hung_up_tty_ioctl, .compat_ioctl = hung_up_tty_compat_ioctl, .release = tty_release, .fasync = hung_up_tty_fasync, }; static DEFINE_SPINLOCK(redirect_lock); static struct file *redirect; /** * tty_wakeup - request more data * @tty: terminal * * Internal and external helper for wakeups of tty. This function informs the * line discipline if present that the driver is ready to receive more output * data. 
*/ void tty_wakeup(struct tty_struct *tty) { struct tty_ldisc *ld; if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { ld = tty_ldisc_ref(tty); if (ld) { if (ld->ops->write_wakeup) ld->ops->write_wakeup(tty); tty_ldisc_deref(ld); } } wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } EXPORT_SYMBOL_GPL(tty_wakeup); /** * tty_release_redirect - Release a redirect on a pty if present * @tty: tty device * * This is available to the pty code so if the master closes, if the slave is a * redirect it can release the redirect. */ static struct file *tty_release_redirect(struct tty_struct *tty) { guard(spinlock)(&redirect_lock); if (redirect && file_tty(redirect) == tty) { struct file *f = redirect; redirect = NULL; return f; } return NULL; } /** * __tty_hangup - actual handler for hangup events * @tty: tty device * @exit_session: if non-zero, signal all foreground group processes * * This can be called by a "kworker" kernel thread. That is process synchronous * but doesn't hold any locks, so we need to make sure we have the appropriate * locks for what we're doing. * * The hangup event clears any pending redirections onto the hung up device. It * ensures future writes will error and it does the needed line discipline * hangup and signal delivery. The tty object itself remains intact. 
* * Locking: * * BTM * * * redirect lock for undoing redirection * * file list lock for manipulating list of ttys * * tty_ldiscs_lock from called functions * * termios_rwsem resetting termios data * * tasklist_lock to walk task list for hangup event * * * ->siglock to protect ->signal/->sighand * */ static void __tty_hangup(struct tty_struct *tty, int exit_session) { struct file *cons_filp = NULL; struct file *filp, *f; struct tty_file_private *priv; int closecount = 0, n; int refs; if (!tty) return; f = tty_release_redirect(tty); tty_lock(tty); if (test_bit(TTY_HUPPED, &tty->flags)) { tty_unlock(tty); return; } /* * Some console devices aren't actually hung up for technical and * historical reasons, which can lead to indefinite interruptible * sleep in n_tty_read(). The following explicitly tells * n_tty_read() to abort readers. */ set_bit(TTY_HUPPING, &tty->flags); /* inuse_filps is protected by the single tty lock, * this really needs to change if we want to flush the * workqueue with the lock held. */ check_tty_count(tty, "tty_hangup"); spin_lock(&tty->files_lock); /* This breaks for file handles being sent over AF_UNIX sockets ? 
*/ list_for_each_entry(priv, &tty->tty_files, list) { filp = priv->file; if (filp->f_op->write_iter == redirected_tty_write) cons_filp = filp; if (filp->f_op->write_iter != tty_write) continue; closecount++; __tty_fasync(-1, filp, 0); /* can't block */ filp->f_op = &hung_up_tty_fops; } spin_unlock(&tty->files_lock); refs = tty_signal_session_leader(tty, exit_session); /* Account for the p->signal references we killed */ while (refs--) tty_kref_put(tty); tty_ldisc_hangup(tty, cons_filp != NULL); spin_lock_irq(&tty->ctrl.lock); clear_bit(TTY_THROTTLED, &tty->flags); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); put_pid(tty->ctrl.session); put_pid(tty->ctrl.pgrp); tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; tty->ctrl.pktstatus = 0; spin_unlock_irq(&tty->ctrl.lock); /* * If one of the devices matches a console pointer, we * cannot just call hangup() because that will cause * tty->count and state->count to go out of sync. * So we just call close() the right number of times. */ if (cons_filp) { if (tty->ops->close) for (n = 0; n < closecount; n++) tty->ops->close(tty, cons_filp); } else if (tty->ops->hangup) tty->ops->hangup(tty); /* * We don't want to have driver/ldisc interactions beyond the ones * we did here. The driver layer expects no calls after ->hangup() * from the ldisc side, which is now guaranteed. */ set_bit(TTY_HUPPED, &tty->flags); clear_bit(TTY_HUPPING, &tty->flags); tty_unlock(tty); if (f) fput(f); } static void do_tty_hangup(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); __tty_hangup(tty, 0); } /** * tty_hangup - trigger a hangup event * @tty: tty to hangup * * A carrier loss (virtual or otherwise) has occurred on @tty. Schedule a * hangup sequence to run after this event. 
*/ void tty_hangup(struct tty_struct *tty) { tty_debug_hangup(tty, "hangup\n"); schedule_work(&tty->hangup_work); } EXPORT_SYMBOL(tty_hangup); /** * tty_vhangup - process vhangup * @tty: tty to hangup * * The user has asked via system call for the terminal to be hung up. We do * this synchronously so that when the syscall returns the process is complete. * That guarantee is necessary for security reasons. */ void tty_vhangup(struct tty_struct *tty) { tty_debug_hangup(tty, "vhangup\n"); __tty_hangup(tty, 0); } EXPORT_SYMBOL(tty_vhangup); /** * tty_vhangup_self - process vhangup for own ctty * * Perform a vhangup on the current controlling tty */ void tty_vhangup_self(void) { struct tty_struct *tty; tty = get_current_tty(); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } } /** * tty_vhangup_session - hangup session leader exit * @tty: tty to hangup * * The session leader is exiting and hanging up its controlling terminal. * Every process in the foreground process group is signalled %SIGHUP. * * We do this synchronously so that when the syscall returns the process is * complete. That guarantee is necessary for security reasons. */ void tty_vhangup_session(struct tty_struct *tty) { tty_debug_hangup(tty, "session hangup\n"); __tty_hangup(tty, 1); } /** * tty_hung_up_p - was tty hung up * @filp: file pointer of tty * * Return: true if the tty has been subject to a vhangup or a carrier loss */ int tty_hung_up_p(struct file *filp) { return (filp && filp->f_op == &hung_up_tty_fops); } EXPORT_SYMBOL(tty_hung_up_p); void __stop_tty(struct tty_struct *tty) { if (tty->flow.stopped) return; tty->flow.stopped = true; if (tty->ops->stop) tty->ops->stop(tty); } /** * stop_tty - propagate flow control * @tty: tty to stop * * Perform flow control to the driver. May be called on an already stopped * device and will not re-call the &tty_driver->stop() method. * * This functionality is used by both the line disciplines for halting incoming * flow and by the driver. 
It may therefore be called from any context, may be * under the tty %atomic_write_lock but not always. * * Locking: * flow.lock */ void stop_tty(struct tty_struct *tty) { guard(spinlock_irqsave)(&tty->flow.lock); __stop_tty(tty); } EXPORT_SYMBOL(stop_tty); void __start_tty(struct tty_struct *tty) { if (!tty->flow.stopped || tty->flow.tco_stopped) return; tty->flow.stopped = false; if (tty->ops->start) tty->ops->start(tty); tty_wakeup(tty); } /** * start_tty - propagate flow control * @tty: tty to start * * Start a tty that has been stopped if at all possible. If @tty was previously * stopped and is now being started, the &tty_driver->start() method is invoked * and the line discipline woken. * * Locking: * flow.lock */ void start_tty(struct tty_struct *tty) { guard(spinlock_irqsave)(&tty->flow.lock); __start_tty(tty); } EXPORT_SYMBOL(start_tty); static void tty_update_time(struct tty_struct *tty, bool mtime) { time64_t sec = ktime_get_real_seconds(); struct tty_file_private *priv; guard(spinlock)(&tty->files_lock); list_for_each_entry(priv, &tty->tty_files, list) { struct inode *inode = file_inode(priv->file); struct timespec64 time = mtime ? inode_get_mtime(inode) : inode_get_atime(inode); /* * We only care if the two values differ in anything other than the * lower three bits (i.e every 8 seconds). If so, then we can update * the time of the tty device, otherwise it could be construded as a * security leak to let userspace know the exact timing of the tty. */ if ((sec ^ time.tv_sec) & ~7) { if (mtime) inode_set_mtime(inode, sec, 0); else inode_set_atime(inode, sec, 0); } } } /* * Iterate on the ldisc ->read() function until we've gotten all * the data the ldisc has for us. * * The "cookie" is something that the ldisc read function can fill * in to let us know that there is more data to be had. * * We promise to continue to call the ldisc until it stops returning * data or clears the cookie. 
The cookie may be something that the
 * ldisc maintains state for and needs to free.
 */
static ssize_t iterate_tty_read(struct tty_ldisc *ld, struct tty_struct *tty,
				struct file *file, struct iov_iter *to)
{
	void *cookie = NULL;
	unsigned long offset = 0;	/* bytes copied to @to so far */
	ssize_t retval = 0;		/* first error seen, if any */
	size_t copied, count = iov_iter_count(to);
	u8 kernel_buf[64];		/* bounce buffer, wiped before return */

	do {
		/* Never ask for more than the bounce buffer can hold. */
		ssize_t size = min(count, sizeof(kernel_buf));

		size = ld->ops->read(tty, file, kernel_buf, size, &cookie, offset);
		if (!size)
			break;

		if (size < 0) {
			/* Did we have an earlier error (ie -EFAULT)? */
			if (retval)
				break;
			retval = size;

			/*
			 * -EOVERFLOW means we didn't have enough space
			 * for a whole packet, and we shouldn't return
			 * a partial result.
			 */
			if (retval == -EOVERFLOW)
				offset = 0;
			break;
		}

		copied = copy_to_iter(kernel_buf, size, to);
		offset += copied;
		count -= copied;

		/*
		 * If the user copy failed, we still need to do another ->read()
		 * call if we had a cookie to let the ldisc clear up.
		 *
		 * But make sure size is zeroed.
		 */
		if (unlikely(copied != size)) {
			count = 0;
			retval = -EFAULT;
		}
	} while (cookie);

	/* We always clear tty buffer in case they contained passwords */
	memzero_explicit(kernel_buf, sizeof(kernel_buf));
	/* Bytes delivered take precedence over a recorded error. */
	return offset ? offset : retval;
}


/**
 * tty_read - read method for tty device files
 * @iocb: kernel I/O control block
 * @to: destination for the data read
 *
 * Perform the read system call function on this terminal device. Checks
 * for hung up devices before calling the line discipline method.
 *
 * Return: the result of iterate_tty_read(); -EIO if the tty is missing, in
 * an I/O error state, or its line discipline has no read method; the result
 * of hung_up_tty_read() if no line discipline reference could be obtained.
 *
 * Locking:
 *	Locks the line discipline internally while needed. Multiple read calls
 *	may be outstanding in parallel.
 */
static ssize_t tty_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct tty_struct *tty = file_tty(file);
	struct tty_ldisc *ld;
	ssize_t ret;

	if (tty_paranoia_check(tty, inode, "tty_read"))
		return -EIO;
	if (!tty || tty_io_error(tty))
		return -EIO;

	/* We want to wait for the line discipline to sort out in this
	 * situation.
	 */
	ld = tty_ldisc_ref_wait(tty);
	if (!ld)
		return hung_up_tty_read(iocb, to);
	ret = -EIO;
	if (ld->ops->read)
		ret = iterate_tty_read(ld, tty, file, to);
	tty_ldisc_deref(ld);

	/* Only a read that returned data refreshes the access time. */
	if (ret > 0)
		tty_update_time(tty, false);

	return ret;
}

/* Release atomic_write_lock and wake anyone waiting on write_wait. */
void tty_write_unlock(struct tty_struct *tty)
{
	mutex_unlock(&tty->atomic_write_lock);
	wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT);
}

/*
 * Take atomic_write_lock. If the lock is contended, return -EAGAIN when
 * @ndelay is set, otherwise sleep interruptibly and return -ERESTARTSYS if
 * that sleep is interrupted. Returns 0 once the lock is held.
 */
int tty_write_lock(struct tty_struct *tty, bool ndelay)
{
	if (!mutex_trylock(&tty->atomic_write_lock)) {
		if (ndelay)
			return -EAGAIN;
		if (mutex_lock_interruptible(&tty->atomic_write_lock))
			return -ERESTARTSYS;
	}
	return 0;
}

/*
 * Split writes up in sane blocksizes to avoid
 * denial-of-service type attacks
 *
 * Returns the number of bytes the line discipline accepted if any were
 * written, otherwise the last error/result code. Takes the write lock via
 * tty_write_lock() and releases it with tty_write_unlock() before returning.
 */
static ssize_t iterate_tty_write(struct tty_ldisc *ld, struct tty_struct *tty,
				 struct file *file, struct iov_iter *from)
{
	size_t chunk, count = iov_iter_count(from);
	ssize_t ret, written = 0;

	ret = tty_write_lock(tty, file->f_flags & O_NDELAY);
	if (ret < 0)
		return ret;

	/*
	 * We chunk up writes into a temporary buffer. This
	 * simplifies low-level drivers immensely, since they
	 * don't have locking issues and user mode accesses.
	 *
	 * But if TTY_NO_WRITE_SPLIT is set, we should use a
	 * big chunk-size..
	 *
	 * The default chunk-size is 2kB, because the NTTY
	 * layer has problems with bigger chunks. It will
	 * claim to be able to handle more characters than
	 * it actually does.
	 */
	chunk = 2048;
	if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags))
		chunk = 65536;
	if (count < chunk)
		chunk = count;

	/* write_buf/write_cnt is protected by the atomic_write_lock mutex */
	if (tty->write_cnt < chunk) {
		u8 *buf_chunk;

		/* Allocate at least 1024 bytes when the buffer must grow. */
		if (chunk < 1024)
			chunk = 1024;

		buf_chunk = kvmalloc(chunk, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
		if (!buf_chunk) {
			ret = -ENOMEM;
			goto out;
		}
		/* Swap in the new buffer only after the allocation succeeded. */
		kvfree(tty->write_buf);
		tty->write_cnt = chunk;
		tty->write_buf = buf_chunk;
	}

	/* Do the write .. */
	for (;;) {
		size_t size = min(chunk, count);

		ret = -EFAULT;
		if (copy_from_iter(tty->write_buf, size, from) != size)
			break;

		ret = ld->ops->write(tty, file, tty->write_buf, size);
		if (ret <= 0)
			break;

		written += ret;
		if (ret > size)
			break;

		/* FIXME! Have Al check this! */
		/* Short write: hand the unconsumed bytes back to the iterator. */
		if (ret != size)
			iov_iter_revert(from, size-ret);

		count -= ret;
		if (!count)
			break;
		ret = -ERESTARTSYS;
		if (signal_pending(current))
			break;
		cond_resched();
	}
	/* Any progress is reported as a byte count, masking a later error. */
	if (written) {
		tty_update_time(tty, true);
		ret = written;
	}
out:
	tty_write_unlock(tty);
	return ret;
}

#ifdef CONFIG_PRINT_QUOTA_WARNING
/**
 * tty_write_message - write a message to a certain tty, not just the console.
 * @tty: the destination tty_struct
 * @msg: the message to write
 *
 * This is used for messages that need to be redirected to a specific tty. We
 * don't put it into the syslog queue right now maybe in the future if really
 * needed.
 *
 * We must still hold the BTM and test the CLOSING flag for the moment.
 *
 * This function is DEPRECATED, do not use in new code.
*/ void tty_write_message(struct tty_struct *tty, char *msg) { if (tty) { mutex_lock(&tty->atomic_write_lock); tty_lock(tty); if (tty->ops->write && tty->count > 0) tty->ops->write(tty, msg, strlen(msg)); tty_unlock(tty); tty_write_unlock(tty); } } #endif static ssize_t file_tty_write(struct file *file, struct kiocb *iocb, struct iov_iter *from) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; ssize_t ret; if (tty_paranoia_check(tty, file_inode(file), "tty_write")) return -EIO; if (!tty || !tty->ops->write || tty_io_error(tty)) return -EIO; /* Short term debug to catch buggy drivers */ if (tty->ops->write_room == NULL) tty_err(tty, "missing write_room method\n"); ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_write(iocb, from); if (!ld->ops->write) ret = -EIO; else ret = iterate_tty_write(ld, tty, file, from); tty_ldisc_deref(ld); return ret; } /** * tty_write - write method for tty device file * @iocb: kernel I/O control block * @from: iov_iter with data to write * * Write data to a tty device via the line discipline. * * Locking: * Locks the line discipline as required * Writes to the tty driver are serialized by the atomic_write_lock * and are then processed in chunks to the device. The line * discipline write method will not be invoked in parallel for * each device. */ static ssize_t tty_write(struct kiocb *iocb, struct iov_iter *from) { return file_tty_write(iocb->ki_filp, iocb, from); } ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *p = NULL; spin_lock(&redirect_lock); if (redirect) p = get_file(redirect); spin_unlock(&redirect_lock); /* * We know the redirected tty is just another tty, we can * call file_tty_write() directly with that file pointer. 
*/ if (p) { ssize_t res; res = file_tty_write(p, iocb, iter); fput(p); return res; } return tty_write(iocb, iter); } /** * tty_send_xchar - send priority character * @tty: the tty to send to * @ch: xchar to send * * Send a high priority character to the tty even if stopped. * * Locking: none for xchar method, write ordering for write method. */ int tty_send_xchar(struct tty_struct *tty, u8 ch) { bool was_stopped = tty->flow.stopped; if (tty->ops->send_xchar) { down_read(&tty->termios_rwsem); tty->ops->send_xchar(tty, ch); up_read(&tty->termios_rwsem); return 0; } if (tty_write_lock(tty, false) < 0) return -ERESTARTSYS; down_read(&tty->termios_rwsem); if (was_stopped) start_tty(tty); tty->ops->write(tty, &ch, 1); if (was_stopped) stop_tty(tty); up_read(&tty->termios_rwsem); tty_write_unlock(tty); return 0; } /** * pty_line_name - generate name for a pty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 6 bytes * * Generate a name from a @driver reference and write it to the output buffer * @p. * * Locking: None */ static void pty_line_name(struct tty_driver *driver, int index, char *p) { static const char ptychar[] = "pqrstuvwxyzabcde"; int i = index + driver->name_base; /* ->name is initialized to "ttyp", but "tty" is expected */ sprintf(p, "%s%c%x", driver->subtype == PTY_TYPE_SLAVE ? "tty" : driver->name, ptychar[i >> 4 & 0xf], i & 0xf); } /** * tty_line_name - generate name for a tty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 7 bytes * * Generate a name from a @driver reference and write it to the output buffer * @p. 
* * Locking: None */ static ssize_t tty_line_name(struct tty_driver *driver, int index, char *p) { if (driver->flags & TTY_DRIVER_UNNUMBERED_NODE) return sprintf(p, "%s", driver->name); else return sprintf(p, "%s%d", driver->name, index + driver->name_base); } /** * tty_driver_lookup_tty() - find an existing tty, if any * @driver: the driver for the tty * @file: file object * @idx: the minor number * * Return: the tty, if found. If not found, return %NULL or ERR_PTR() if the * driver lookup() method returns an error. * * Locking: tty_mutex must be held. If the tty is found, bump the tty kref. */ static struct tty_struct *tty_driver_lookup_tty(struct tty_driver *driver, struct file *file, int idx) { struct tty_struct *tty; if (driver->ops->lookup) { if (!file) tty = ERR_PTR(-EIO); else tty = driver->ops->lookup(driver, file, idx); } else { if (idx >= driver->num) return ERR_PTR(-EINVAL); tty = driver->ttys[idx]; } if (!IS_ERR(tty)) tty_kref_get(tty); return tty; } /** * tty_init_termios - helper for termios setup * @tty: the tty to set up * * Initialise the termios structure for this tty. This runs under the * %tty_mutex currently so we can be relaxed about ordering. 
*/ void tty_init_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) tty->termios = tty->driver->init_termios; else { /* Check for lazy saved data */ tp = tty->driver->termios[idx]; if (tp != NULL) { tty->termios = *tp; tty->termios.c_line = tty->driver->init_termios.c_line; } else tty->termios = tty->driver->init_termios; } /* Compatibility until drivers always set this */ tty->termios.c_ispeed = tty_termios_input_baud_rate(&tty->termios); tty->termios.c_ospeed = tty_termios_baud_rate(&tty->termios); } EXPORT_SYMBOL_GPL(tty_init_termios); /** * tty_standard_install - usual tty->ops->install * @driver: the driver for the tty * @tty: the tty * * If the @driver overrides @tty->ops->install, it still can call this function * to perform the standard install operations. */ int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty) { tty_init_termios(tty); tty_driver_kref_get(driver); tty->count++; driver->ttys[tty->index] = tty; return 0; } EXPORT_SYMBOL_GPL(tty_standard_install); /** * tty_driver_install_tty() - install a tty entry in the driver * @driver: the driver for the tty * @tty: the tty * * Install a tty object into the driver tables. The @tty->index field will be * set by the time this is called. This method is responsible for ensuring any * need additional structures are allocated and configured. * * Locking: tty_mutex for now */ static int tty_driver_install_tty(struct tty_driver *driver, struct tty_struct *tty) { return driver->ops->install ? driver->ops->install(driver, tty) : tty_standard_install(driver, tty); } /** * tty_driver_remove_tty() - remove a tty from the driver tables * @driver: the driver for the tty * @tty: tty to remove * * Remove a tty object from the driver tables. The tty->index field will be set * by the time this is called. 
* * Locking: tty_mutex for now */ static void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct *tty) { if (driver->ops->remove) driver->ops->remove(driver, tty); else driver->ttys[tty->index] = NULL; } /** * tty_reopen() - fast re-open of an open tty * @tty: the tty to open * * Re-opens on master ptys are not allowed and return -%EIO. * * Locking: Caller must hold tty_lock * Return: 0 on success, -errno on error. */ static int tty_reopen(struct tty_struct *tty) { struct tty_driver *driver = tty->driver; struct tty_ldisc *ld; int retval = 0; if (driver->type == TTY_DRIVER_TYPE_PTY && driver->subtype == PTY_TYPE_MASTER) return -EIO; if (!tty->count) return -EAGAIN; if (test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN)) return -EBUSY; ld = tty_ldisc_ref_wait(tty); if (ld) { tty_ldisc_deref(ld); } else { retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) return retval; if (!tty->ldisc) retval = tty_ldisc_reinit(tty, tty->termios.c_line); tty_ldisc_unlock(tty); } if (retval == 0) tty->count++; return retval; } /** * tty_init_dev - initialise a tty device * @driver: tty driver we are opening a device on * @idx: device index * * Prepare a tty device. This may not be a "new" clean device but could also be * an active device. The pty drivers require special handling because of this. * * Locking: * The function is called under the tty_mutex, which protects us from the * tty struct or driver itself going away. * * On exit the tty device has the line discipline attached and a reference * count of 1. If a pair was created for pty/tty use and the other was a pty * master then it too has a reference count of 1. * * WSH 06/09/97: Rewritten to remove races and properly clean up after a failed * open. The new code protects the open with a mutex, so it's really quite * straightforward. The mutex locking can probably be relaxed for the (most * common) case of reopening a tty. 
*
 * Return: new tty structure
 */
struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx)
{
	struct tty_struct *tty;
	int retval;

	/*
	 * First time open is complex, especially for PTY devices.
	 * This code guarantees that either everything succeeds and the
	 * TTY is ready for operation, or else the table slots are vacated
	 * and the allocated memory released.  (Except that the termios
	 * may be retained.)
	 */

	/* Pin the driver module; dropped again on the early error paths */
	if (!try_module_get(driver->owner))
		return ERR_PTR(-ENODEV);

	tty = alloc_tty_struct(driver, idx);
	if (!tty) {
		retval = -ENOMEM;
		goto err_module_put;
	}

	tty_lock(tty);
	retval = tty_driver_install_tty(driver, tty);
	if (retval < 0)
		goto err_free_tty;

	/* Fall back to the driver's port table if install left port unset */
	if (!tty->port)
		tty->port = driver->ports[idx];

	if (WARN_RATELIMIT(!tty->port,
			"%s: %s driver does not set tty->port. This would crash the kernel. Fix the driver!\n",
			__func__, tty->driver->name)) {
		retval = -EINVAL;
		goto err_release_lock;
	}

	retval = tty_ldisc_lock(tty, 5 * HZ);
	if (retval)
		goto err_release_lock;
	tty->port->itty = tty;

	/*
	 * Structures all installed ... call the ldisc open routines.
	 * If we fail here just call release_tty to clean up.  No need
	 * to decrement the use counts, as release_tty doesn't care.
	 */
	retval = tty_ldisc_setup(tty, tty->link);
	if (retval)
		goto err_release_tty;
	tty_ldisc_unlock(tty);
	/* Return the tty locked so that it cannot vanish under the caller */
	return tty;

	/* Install failed: nothing is in the driver tables yet, free directly */
err_free_tty:
	tty_unlock(tty);
	free_tty_struct(tty);
err_module_put:
	module_put(driver->owner);
	return ERR_PTR(retval);

	/* call the tty release_tty routine to clean out this slot */
err_release_tty:
	tty_ldisc_unlock(tty);
	tty_info_ratelimited(tty, "ldisc open failed (%d), clearing slot %d\n",
			     retval, idx);
err_release_lock:
	tty_unlock(tty);
	release_tty(tty, idx);
	return ERR_PTR(retval);
}

/**
 * tty_save_termios() - save tty termios data in driver table
 * @tty: tty whose termios data to save
 *
 * Locking: Caller guarantees serialisation with tty_init_termios().
*/ void tty_save_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; /* If the port is going to reset then it has no termios to save */ if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) return; /* Stash the termios data */ tp = tty->driver->termios[idx]; if (tp == NULL) { tp = kmalloc(sizeof(*tp), GFP_KERNEL); if (tp == NULL) return; tty->driver->termios[idx] = tp; } *tp = tty->termios; } EXPORT_SYMBOL_GPL(tty_save_termios); /** * tty_flush_works - flush all works of a tty/pty pair * @tty: tty device to flush works for (or either end of a pty pair) * * Sync flush all works belonging to @tty (and the 'other' tty). */ static void tty_flush_works(struct tty_struct *tty) { flush_work(&tty->SAK_work); flush_work(&tty->hangup_work); if (tty->link) { flush_work(&tty->link->SAK_work); flush_work(&tty->link->hangup_work); } } /** * release_one_tty - release tty structure memory * @work: work of tty we are obliterating * * Releases memory associated with a tty structure, and clears out the * driver table slots. This function is called when a device is no longer * in use. It also gets called when setup of a device fails. * * Locking: * takes the file list lock internally when working on the list of ttys * that the driver keeps. 
* * This method gets called from a work queue so that the driver private * cleanup ops can sleep (needed for USB at least) */ static void release_one_tty(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); struct tty_driver *driver = tty->driver; struct module *owner = driver->owner; if (tty->ops->cleanup) tty->ops->cleanup(tty); tty_driver_kref_put(driver); module_put(owner); spin_lock(&tty->files_lock); list_del_init(&tty->tty_files); spin_unlock(&tty->files_lock); put_pid(tty->ctrl.pgrp); put_pid(tty->ctrl.session); free_tty_struct(tty); } static void queue_release_one_tty(struct kref *kref) { struct tty_struct *tty = container_of(kref, struct tty_struct, kref); /* The hangup queue is now free so we can reuse it rather than * waste a chunk of memory for each port. */ INIT_WORK(&tty->hangup_work, release_one_tty); schedule_work(&tty->hangup_work); } /** * tty_kref_put - release a tty kref * @tty: tty device * * Release a reference to the @tty device and if need be let the kref layer * destruct the object for us. */ void tty_kref_put(struct tty_struct *tty) { if (tty) kref_put(&tty->kref, queue_release_one_tty); } EXPORT_SYMBOL(tty_kref_put); /** * release_tty - release tty structure memory * @tty: tty device release * @idx: index of the tty device release * * Release both @tty and a possible linked partner (think pty pair), * and decrement the refcount of the backing module. * * Locking: * tty_mutex * takes the file list lock internally when working on the list of ttys * that the driver keeps. 
*/
static void release_tty(struct tty_struct *tty, int idx)
{
	/* This should always be true but check for the moment */
	WARN_ON(tty->index != idx);
	WARN_ON(!mutex_is_locked(&tty_mutex));
	/* Give the driver a chance to shut the device down first */
	if (tty->ops->shutdown)
		tty->ops->shutdown(tty);
	/* Preserve the termios, then vacate the driver table slot */
	tty_save_termios(tty);
	tty_driver_remove_tty(tty->driver, tty);
	/* Detach the port(s) from the tty(s) before cancelling buffer work */
	if (tty->port)
		tty->port->itty = NULL;
	if (tty->link)
		tty->link->port->itty = NULL;
	if (tty->port)
		tty_buffer_cancel_work(tty->port);
	if (tty->link)
		tty_buffer_cancel_work(tty->link->port);

	/* Drop the references; tty_kref_put() accepts a NULL link */
	tty_kref_put(tty->link);
	tty_kref_put(tty);
}

/**
 * tty_release_checks - check a tty before real release
 * @tty: tty to check
 * @idx: index of the tty
 *
 * Performs some paranoid checking before true release of the @tty. This is a
 * no-op unless %TTY_PARANOIA_CHECK is defined.
 *
 * Return: 0 when the checks pass (or are compiled out), -1 when @idx is out
 * of range or the driver tables / pty link do not match @tty.
 */
static int tty_release_checks(struct tty_struct *tty, int idx)
{
#ifdef TTY_PARANOIA_CHECK
	if (idx < 0 || idx >= tty->driver->num) {
		tty_debug(tty, "bad idx %d\n", idx);
		return -1;
	}

	/* not much to check for devpts */
	if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM)
		return 0;

	if (tty != tty->driver->ttys[idx]) {
		tty_debug(tty, "bad driver table[%d] = %p\n",
			  idx, tty->driver->ttys[idx]);
		return -1;
	}
	/* For paired drivers, the other side's table and link must agree */
	if (tty->driver->other) {
		struct tty_struct *o_tty = tty->link;

		if (o_tty != tty->driver->other->ttys[idx]) {
			tty_debug(tty, "bad other table[%d] = %p\n",
				  idx, tty->driver->other->ttys[idx]);
			return -1;
		}
		if (o_tty->link != tty) {
			tty_debug(tty, "bad link = %p\n", o_tty->link);
			return -1;
		}
	}
#endif
	return 0;
}

/**
 * tty_kclose - closes tty opened by tty_kopen
 * @tty: tty device
 *
 * Performs the final steps to release and free a tty device. It is the same as
 * tty_release_struct() except that it also resets %TTY_PORT_KOPENED flag on
 * @tty->port.
*/
void tty_kclose(struct tty_struct *tty)
{
	/*
	 * Ask the line discipline code to release its structures
	 */
	tty_ldisc_release(tty);

	/* Wait for pending work before tty destruction commences */
	tty_flush_works(tty);

	tty_debug_hangup(tty, "freeing structure\n");
	/*
	 * The release_tty function takes care of the details of clearing
	 * the slots and preserving the termios structure.
	 */
	mutex_lock(&tty_mutex);
	/* Clear the kopened state on the port before releasing the tty */
	tty_port_set_kopened(tty->port, 0);
	release_tty(tty, tty->index);
	mutex_unlock(&tty_mutex);
}
EXPORT_SYMBOL_GPL(tty_kclose);

/**
 * tty_release_struct - release a tty struct
 * @tty: tty device
 * @idx: index of the tty
 *
 * Performs the final steps to release and free a tty device. It is roughly the
 * reverse of tty_init_dev(). The line discipline is released and pending work
 * flushed first; release_tty() is then called with %tty_mutex held.
 */
void tty_release_struct(struct tty_struct *tty, int idx)
{
	/*
	 * Ask the line discipline code to release its structures
	 */
	tty_ldisc_release(tty);

	/* Wait for pending work before tty destruction commences */
	tty_flush_works(tty);

	tty_debug_hangup(tty, "freeing structure\n");
	/*
	 * The release_tty function takes care of the details of clearing
	 * the slots and preserving the termios structure.
	 */
	mutex_lock(&tty_mutex);
	release_tty(tty, idx);
	mutex_unlock(&tty_mutex);
}
EXPORT_SYMBOL_GPL(tty_release_struct);

/**
 * tty_release - vfs callback for close
 * @inode: inode of tty
 * @filp: file pointer for handle to tty
 *
 * Called the last time each file handle is closed that references this tty.
 * There may however be several such references.
 *
 * Locking:
 *	Takes BKL. See tty_release_dev().
 *
 * Even releasing the tty structures is a tricky business. We have to be very
 * careful that the structures are all released at the same time, as interrupts
 * might otherwise get the wrong pointers.
 *
 * WSH 09/09/97: rewritten to avoid some nasty race conditions that could
 * lead to double frees or releasing memory still in use.
*/
int tty_release(struct inode *inode, struct file *filp)
{
	struct tty_struct *tty = file_tty(filp);
	struct tty_struct *o_tty = NULL;	/* slave side, only for a pty master */
	int	do_sleep, final;
	int	idx;
	long	timeout = 0;
	int	once = 1;	/* limits the wait-queue warning to one print */

	if (tty_paranoia_check(tty, inode, __func__))
		return 0;

	tty_lock(tty);
	check_tty_count(tty, __func__);

	/* Remove this file from the tty's fasync list */
	__tty_fasync(-1, filp, 0);

	idx = tty->index;
	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
	    tty->driver->subtype == PTY_TYPE_MASTER)
		o_tty = tty->link;

	if (tty_release_checks(tty, idx)) {
		tty_unlock(tty);
		return 0;
	}

	tty_debug_hangup(tty, "releasing (count=%d)\n", tty->count);

	if (tty->ops->close)
		tty->ops->close(tty, filp);

	/* If tty is pty master, lock the slave pty (stable lock order) */
	tty_lock_slave(o_tty);

	/*
	 * Sanity check: if tty->count is going to zero, there shouldn't be
	 * any waiters on tty->read_wait or tty->write_wait.  We test the
	 * wait queues and kick everyone out _before_ actually starting to
	 * close.  This ensures that we won't block while releasing the tty
	 * structure.
	 *
	 * The test for the o_tty closing is necessary, since the master and
	 * slave sides may close in any order.  If the slave side closes out
	 * first, its count will be one, since the master side holds an open.
	 * Thus this test wouldn't be triggered at the time the slave closed,
	 * so we do it now.
	 */
	while (1) {
		do_sleep = 0;

		if (tty->count <= 1) {
			if (waitqueue_active(&tty->read_wait)) {
				wake_up_poll(&tty->read_wait, EPOLLIN);
				do_sleep++;
			}
			if (waitqueue_active(&tty->write_wait)) {
				wake_up_poll(&tty->write_wait, EPOLLOUT);
				do_sleep++;
			}
		}
		if (o_tty && o_tty->count <= 1) {
			if (waitqueue_active(&o_tty->read_wait)) {
				wake_up_poll(&o_tty->read_wait, EPOLLIN);
				do_sleep++;
			}
			if (waitqueue_active(&o_tty->write_wait)) {
				wake_up_poll(&o_tty->write_wait, EPOLLOUT);
				do_sleep++;
			}
		}
		if (!do_sleep)
			break;

		if (once) {
			once = 0;
			tty_warn(tty, "read/write wait queue active!\n");
		}
		schedule_timeout_killable(timeout);
		/* Back off: 0, 1, 3, 7, ... jiffies, then sleep without limit */
		if (timeout < 120 * HZ)
			timeout = 2 * timeout + 1;
		else
			timeout = MAX_SCHEDULE_TIMEOUT;
	}

	/* Drop the open counts, clamping at zero if they were already bad */
	if (o_tty) {
		if (--o_tty->count < 0) {
			tty_warn(tty, "bad slave count (%d)\n", o_tty->count);
			o_tty->count = 0;
		}
	}
	if (--tty->count < 0) {
		tty_warn(tty, "bad tty->count (%d)\n", tty->count);
		tty->count = 0;
	}

	/*
	 * We've decremented tty->count, so we need to remove this file
	 * descriptor off the tty->tty_files list; this serves two
	 * purposes:
	 *  - check_tty_count sees the correct number of file descriptors
	 *    associated with this tty.
	 *  - do_tty_hangup no longer sees this file descriptor as
	 *    something that needs to be handled for hangups.
	 */
	tty_del_file(filp);

	/*
	 * Perform some housekeeping before deciding whether to return.
	 *
	 * If _either_ side is closing, make sure there aren't any
	 * processes that still think tty or o_tty is their controlling
	 * tty.
	 */
	if (!tty->count) {
		read_lock(&tasklist_lock);
		session_clear_tty(tty->ctrl.session);
		if (o_tty)
			session_clear_tty(o_tty->ctrl.session);
		read_unlock(&tasklist_lock);
	}

	/* check whether both sides are closing ... */
	final = !tty->count && !(o_tty && o_tty->count);

	tty_unlock_slave(o_tty);
	tty_unlock(tty);

	/* At this point, the tty->count == 0 should ensure a dead tty
	 * cannot be re-opened by a racing opener.
	 */

	if (!final)
		return 0;

	tty_debug_hangup(tty, "final close\n");

	tty_release_struct(tty, idx);
	return 0;
}

/**
 * tty_open_current_tty - get locked tty of current task
 * @device: device number
 * @filp: file pointer to tty
 *
 * Performs a re-open of the current task's controlling tty.
 *
 * We cannot return driver and index like for the other nodes because devpts
 * will not work then. It expects inodes to be from devpts FS.
 *
 * Return: locked tty of the current task iff @device is /dev/tty; %NULL when
 * @device is some other node; ERR_PTR(-%ENXIO) when the task has no tty, or
 * the ERR_PTR()-wrapped error of tty_reopen(), in which case the tty lock has
 * already been dropped.
 */
static struct tty_struct *tty_open_current_tty(dev_t device, struct file *filp)
{
	struct tty_struct *tty;
	int retval;

	if (device != MKDEV(TTYAUX_MAJOR, 0))
		return NULL;

	tty = get_current_tty();
	if (!tty)
		return ERR_PTR(-ENXIO);

	filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
	/* noctty = 1; */
	tty_lock(tty);
	tty_kref_put(tty);	/* safe to drop the kref now */

	retval = tty_reopen(tty);
	if (retval < 0) {
		tty_unlock(tty);
		tty = ERR_PTR(retval);
	}
	return tty;
}

/**
 * tty_lookup_driver - lookup a tty driver for a given device file
 * @device: device number
 * @filp: file pointer to tty
 * @index: index for the device in the @return driver
 *
 * If returned value is not erroneous, the caller is responsible to decrement
 * the refcount by tty_driver_kref_put().
*
 * Locking: %tty_mutex protects get_tty_driver()
 *
 * Return: driver for this inode (with increased refcount)
 */
static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp,
		int *index)
{
	struct tty_driver *driver = NULL;

	switch (device) {
#ifdef CONFIG_VT
	/* (TTY_MAJOR, 0) resolves to the console driver at fg_console */
	case MKDEV(TTY_MAJOR, 0): {
		extern struct tty_driver *console_driver;

		driver = tty_driver_kref_get(console_driver);
		*index = fg_console;
		break;
	}
#endif
	case MKDEV(TTYAUX_MAJOR, 1): {
		struct tty_driver *console_driver = console_device(index);

		if (console_driver) {
			driver = tty_driver_kref_get(console_driver);
			if (driver && filp) {
				/* Don't let /dev/console block */
				filp->f_flags |= O_NONBLOCK;
				break;
			}
		}
		/* No usable console (or no file): undo the ref and fail */
		if (driver)
			tty_driver_kref_put(driver);
		return ERR_PTR(-ENODEV);
	}
	default:
		driver = get_tty_driver(device, index);
		if (!driver)
			return ERR_PTR(-ENODEV);
		break;
	}
	return driver;
}

/*
 * tty_kopen - common helper behind tty_kopen_exclusive()/tty_kopen_shared()
 * @device: dev_t of device to open
 * @shared: non-zero to return whatever the driver lookup found, as is
 *
 * Looks up the driver and any existing tty under tty_mutex. In shared mode
 * the lookup result is returned directly (it may be NULL, and a found tty
 * keeps the kref taken by tty_driver_lookup_tty()). In exclusive mode an
 * existing tty yields ERR_PTR(-EBUSY); otherwise a new tty is created with
 * tty_init_dev() and its port is marked kopened. The driver reference taken
 * by the lookup is dropped before returning.
 */
static struct tty_struct *tty_kopen(dev_t device, int shared)
{
	struct tty_struct *tty;
	struct tty_driver *driver;
	int index = -1;

	mutex_lock(&tty_mutex);
	driver = tty_lookup_driver(device, NULL, &index);
	if (IS_ERR(driver)) {
		mutex_unlock(&tty_mutex);
		return ERR_CAST(driver);
	}

	/* check whether we're reopening an existing tty */
	tty = tty_driver_lookup_tty(driver, NULL, index);
	if (IS_ERR(tty) || shared)
		goto out;

	if (tty) {
		/* drop kref from tty_driver_lookup_tty() */
		tty_kref_put(tty);
		tty = ERR_PTR(-EBUSY);
	} else { /* tty_init_dev returns tty with the tty_lock held */
		tty = tty_init_dev(driver, index);
		if (IS_ERR(tty))
			goto out;
		tty_port_set_kopened(tty->port, 1);
	}
out:
	mutex_unlock(&tty_mutex);
	tty_driver_kref_put(driver);
	return tty;
}

/**
 * tty_kopen_exclusive - open a tty device for kernel
 * @device: dev_t of device to open
 *
 * Opens tty exclusively for kernel. Performs the driver lookup, makes sure
 * it's not already opened and performs the first-time tty initialization.
*
 * Claims the global %tty_mutex to serialize:
 *  * concurrent first-time tty initialization
 *  * concurrent tty driver removal w/ lookup
 *  * concurrent tty removal from driver table
 *
 * Return: the locked initialized &tty_struct
 */
struct tty_struct *tty_kopen_exclusive(dev_t device)
{
	return tty_kopen(device, 0);
}
EXPORT_SYMBOL_GPL(tty_kopen_exclusive);

/**
 * tty_kopen_shared - open a tty device for shared in-kernel use
 * @device: dev_t of device to open
 *
 * Opens an already existing tty for in-kernel use. Compared to
 * tty_kopen_exclusive() above it doesn't ensure to be the only user.
 *
 * Locking: identical to tty_kopen() above.
 *
 * Return: the result of tty_kopen() in shared mode.
 */
struct tty_struct *tty_kopen_shared(dev_t device)
{
	return tty_kopen(device, 1);
}
EXPORT_SYMBOL_GPL(tty_kopen_shared);

/**
 * tty_open_by_driver - open a tty device
 * @device: dev_t of device to open
 * @filp: file pointer to tty
 *
 * Performs the driver lookup, checks for a reopen, or otherwise performs the
 * first-time tty initialization.
 *
 *
 * Claims the global tty_mutex to serialize:
 *  * concurrent first-time tty initialization
 *  * concurrent tty driver removal w/ lookup
 *  * concurrent tty removal from driver table
 *
 * Return: the locked initialized or re-opened &tty_struct, or an ERR_PTR():
 * -%EBUSY when the port is kopened, -%ERESTARTSYS when taking the tty lock
 * was interrupted, otherwise the error from the lookup, tty_reopen() or
 * tty_init_dev().
 */
static struct tty_struct *tty_open_by_driver(dev_t device,
					     struct file *filp)
{
	struct tty_struct *tty;
	struct tty_driver *driver = NULL;
	int index = -1;
	int retval;

	mutex_lock(&tty_mutex);
	driver = tty_lookup_driver(device, filp, &index);
	if (IS_ERR(driver)) {
		mutex_unlock(&tty_mutex);
		return ERR_CAST(driver);
	}

	/* check whether we're reopening an existing tty */
	tty = tty_driver_lookup_tty(driver, filp, index);
	if (IS_ERR(tty)) {
		mutex_unlock(&tty_mutex);
		goto out;
	}

	if (tty) {
		/* A tty claimed through tty_kopen() cannot be opened here */
		if (tty_port_kopened(tty->port)) {
			tty_kref_put(tty);
			mutex_unlock(&tty_mutex);
			tty = ERR_PTR(-EBUSY);
			goto out;
		}
		/* tty_mutex is released before sleeping on the tty lock */
		mutex_unlock(&tty_mutex);
		retval = tty_lock_interruptible(tty);
		tty_kref_put(tty);  /* drop kref from tty_driver_lookup_tty() */
		if (retval) {
			if (retval == -EINTR)
				retval = -ERESTARTSYS;
			tty = ERR_PTR(retval);
			goto out;
		}
		retval = tty_reopen(tty);
		if (retval < 0) {
			tty_unlock(tty);
			tty = ERR_PTR(retval);
		}
	} else { /* Returns with the tty_lock held for now */
		tty = tty_init_dev(driver, index);
		mutex_unlock(&tty_mutex);
	}
out:
	/* Drop the driver reference taken by tty_lookup_driver() */
	tty_driver_kref_put(driver);
	return tty;
}

/**
 * tty_open - open a tty device
 * @inode: inode of device file
 * @filp: file pointer to tty
 *
 * tty_open() and tty_release() keep up the tty count that contains the number
 * of opens done on a tty. We cannot use the inode-count, as different inodes
 * might point to the same tty.
 *
 * Open-counting is needed for pty masters, as well as for keeping track of
 * serial lines: DTR is dropped when the last close happens.
 * (This is not done solely through tty->count, now.  - Ted 1/27/92)
 *
 * The termios state of a pty is reset on the first open so that settings don't
 * persist across reuse.
 *
 * Locking:
 *  * %tty_mutex protects tty, tty_lookup_driver() and tty_init_dev().
 *  * @tty->count should protect the rest.
*  * ->siglock protects ->signal/->sighand
 *
 * Note: the tty_unlock/lock cases without a ref are only safe due to %tty_mutex
 */
static int tty_open(struct inode *inode, struct file *filp)
{
	struct tty_struct *tty;
	int noctty, retval;
	dev_t device = inode->i_rdev;
	/* The open helpers may set O_NONBLOCK; restored after ->open() */
	unsigned saved_flags = filp->f_flags;

	nonseekable_open(inode, filp);

retry_open:
	retval = tty_alloc_file(filp);
	if (retval)
		return -ENOMEM;

	/* Try /dev/tty first; NULL means @device is some other node */
	tty = tty_open_current_tty(device, filp);
	if (!tty)
		tty = tty_open_by_driver(device, filp);

	if (IS_ERR(tty)) {
		tty_free_file(filp);
		retval = PTR_ERR(tty);
		/* Only -EAGAIN without a pending signal is retried */
		if (retval != -EAGAIN || signal_pending(current))
			return retval;
		schedule();
		goto retry_open;
	}

	tty_add_file(tty, filp);

	check_tty_count(tty, __func__);
	tty_debug_hangup(tty, "opening (count=%d)\n", tty->count);

	if (tty->ops->open)
		retval = tty->ops->open(tty, filp);
	else
		retval = -ENODEV;
	filp->f_flags = saved_flags;

	if (retval) {
		tty_debug_hangup(tty, "open error %d, releasing\n", retval);

		tty_unlock(tty); /* need to call tty_release without BTM */
		tty_release(inode, filp);
		if (retval != -ERESTARTSYS)
			return retval;

		if (signal_pending(current))
			return retval;

		schedule();
		/*
		 * Need to reset f_op in case a hangup happened.
		 */
		if (tty_hung_up_p(filp))
			filp->f_op = &tty_fops;
		goto retry_open;
	}
	clear_bit(TTY_HUPPED, &tty->flags);

	/*
	 * No controlling-tty assignment for O_NOCTTY opens, for the
	 * (TTY_MAJOR, 0) and (TTYAUX_MAJOR, 1) nodes, or for pty masters.
	 */
	noctty = (filp->f_flags & O_NOCTTY) ||
		 (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) ||
		 device == MKDEV(TTYAUX_MAJOR, 1) ||
		 (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
		  tty->driver->subtype == PTY_TYPE_MASTER);
	if (!noctty)
		tty_open_proc_set_tty(filp, tty);
	tty_unlock(tty);
	return 0;
}

/**
 * tty_poll - check tty status
 * @filp: file being polled
 * @wait: poll wait structures to update
 *
 * Call the line discipline polling method to obtain the poll status of the
 * device.
 *
 * Locking: locks called line discipline but ldisc poll method may be
 * re-entered freely by other callers.
*/
static __poll_t tty_poll(struct file *filp, poll_table *wait)
{
	struct tty_struct *tty = file_tty(filp);
	struct tty_ldisc *ld;
	__poll_t ret = 0;

	if (tty_paranoia_check(tty, file_inode(filp), "tty_poll"))
		return 0;

	/* No ldisc reference available: defer to the hung-up poll handler */
	ld = tty_ldisc_ref_wait(tty);
	if (!ld)
		return hung_up_tty_poll(filp, wait);
	if (ld->ops->poll)
		ret = ld->ops->poll(tty, filp, wait);
	tty_ldisc_deref(ld);
	return ret;
}

/*
 * __tty_fasync - update the tty's fasync list for @filp
 * @fd: descriptor handed through to fasync_helper()
 * @filp: file whose async notification state changes
 * @on: non-zero to enable, zero to disable
 *
 * When enabling and fasync_helper() returns a positive value, the file owner
 * is set to the tty's foreground process group if it has one, otherwise to
 * the calling task.
 *
 * Return: 0 or a negative errno.
 */
static int __tty_fasync(int fd, struct file *filp, int on)
{
	struct tty_struct *tty = file_tty(filp);
	unsigned long flags;
	int retval = 0;

	if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync"))
		goto out;

	if (on) {
		retval = file_f_owner_allocate(filp);
		if (retval)
			goto out;
	}

	/* Zero or negative: nothing more to do, return it as is */
	retval = fasync_helper(fd, filp, on, &tty->fasync);
	if (retval <= 0)
		goto out;

	if (on) {
		enum pid_type type;
		struct pid *pid;

		/* Pick the owner and pin it while ctrl.lock is held */
		spin_lock_irqsave(&tty->ctrl.lock, flags);
		if (tty->ctrl.pgrp) {
			pid = tty->ctrl.pgrp;
			type = PIDTYPE_PGID;
		} else {
			pid = task_pid(current);
			type = PIDTYPE_TGID;
		}
		get_pid(pid);
		spin_unlock_irqrestore(&tty->ctrl.lock, flags);
		__f_setown(filp, pid, type, 0);
		put_pid(pid);
		retval = 0;
	}
out:
	return retval;
}

/*
 * tty_fasync - fasync entry point; runs __tty_fasync() under the tty lock.
 *
 * Return: -ENOTTY if @filp has been hung up, otherwise the result of
 * __tty_fasync().
 */
static int tty_fasync(int fd, struct file *filp, int on)
{
	struct tty_struct *tty = file_tty(filp);
	int retval = -ENOTTY;

	tty_lock(tty);
	if (!tty_hung_up_p(filp))
		retval = __tty_fasync(fd, filp, on);
	tty_unlock(tty);

	return retval;
}

/* Defaults to CONFIG_LEGACY_TIOCSTI; consulted by tiocsti() */
static bool tty_legacy_tiocsti __read_mostly = IS_ENABLED(CONFIG_LEGACY_TIOCSTI);
/**
 * tiocsti - fake input character
 * @tty: tty to fake input into
 * @p: pointer to character
 *
 * Fake input to a tty device. Does the necessary locking and input management.
 *
 * FIXME: does not honour flow control ??
* * Locking: * * Called functions take tty_ldiscs_lock * * current->signal->tty check is safe without locks */ static int tiocsti(struct tty_struct *tty, u8 __user *p) { struct tty_ldisc *ld; u8 ch; if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN)) return -EIO; if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, p)) return -EFAULT; tty_audit_tiocsti(tty, ch); ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; tty_buffer_lock_exclusive(tty->port); if (ld->ops->receive_buf) ld->ops->receive_buf(tty, &ch, NULL, 1); tty_buffer_unlock_exclusive(tty->port); tty_ldisc_deref(ld); return 0; } /** * tiocgwinsz - implement window query ioctl * @tty: tty * @arg: user buffer for result * * Copies the kernel idea of the window size into the user buffer. * * Locking: @tty->winsize_mutex is taken to ensure the winsize data is * consistent. */ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user *arg) { guard(mutex)(&tty->winsize_mutex); if (copy_to_user(arg, &tty->winsize, sizeof(*arg))) return -EFAULT; return 0; } /** * tty_do_resize - resize event * @tty: tty being resized * @ws: new dimensions * * Update the termios variables and send the necessary signals to peform a * terminal resize correctly. */ int tty_do_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp; guard(mutex)(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) return 0; /* Signal the foreground process group */ pgrp = tty_get_pgrp(tty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); put_pid(pgrp); tty->winsize = *ws; return 0; } EXPORT_SYMBOL(tty_do_resize); /** * tiocswinsz - implement window size set ioctl * @tty: tty side of tty * @arg: user buffer for result * * Copies the user idea of the window size to the kernel. Traditionally this is * just advisory information but for the Linux console it actually has driver * level meaning and triggers a VC resize. * * Locking: * Driver dependent. 
The default do_resize method takes the tty termios * mutex and ctrl.lock. The console takes its own lock then calls into the * default method. */ static int tiocswinsz(struct tty_struct *tty, struct winsize __user *arg) { struct winsize tmp_ws; if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) return -EFAULT; if (tty->ops->resize) return tty->ops->resize(tty, &tmp_ws); else return tty_do_resize(tty, &tmp_ws); } /** * tioccons - allow admin to move logical console * @file: the file to become console * * Allow the administrator to move the redirected console device. * * Locking: uses redirect_lock to guard the redirect information */ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (file->f_op->write_iter == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); f = redirect; redirect = NULL; spin_unlock(&redirect_lock); if (f) fput(f); return 0; } if (file->f_op->write_iter != tty_write) return -ENOTTY; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; guard(spinlock)(&redirect_lock); if (redirect) return -EBUSY; redirect = get_file(file); return 0; } /** * tiocsetd - set line discipline * @tty: tty device * @p: pointer to user data * * Set the line discipline according to user request. * * Locking: see tty_set_ldisc(), this function is just a helper */ static int tiocsetd(struct tty_struct *tty, int __user *p) { int disc; int ret; if (get_user(disc, p)) return -EFAULT; ret = tty_set_ldisc(tty, disc); return ret; } /** * tiocgetd - get line discipline * @tty: tty device * @p: pointer to user data * * Retrieves the line discipline id directly from the ldisc. 
* * Locking: waits for ldisc reference (in case the line discipline is changing * or the @tty is being hungup) */ static int tiocgetd(struct tty_struct *tty, int __user *p) { struct tty_ldisc *ld; int ret; ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; ret = put_user(ld->ops->num, p); tty_ldisc_deref(ld); return ret; } /** * send_break - performed time break * @tty: device to break on * @duration: timeout in mS * * Perform a timed break on hardware that lacks its own driver level timed * break functionality. * * Locking: * @tty->atomic_write_lock serializes */ static int send_break(struct tty_struct *tty, unsigned int duration) { int retval; if (tty->ops->break_ctl == NULL) return 0; if (tty->driver->flags & TTY_DRIVER_HARDWARE_BREAK) return tty->ops->break_ctl(tty, duration); /* Do the work ourselves */ if (tty_write_lock(tty, false) < 0) return -EINTR; retval = tty->ops->break_ctl(tty, -1); if (!retval) { msleep_interruptible(duration); retval = tty->ops->break_ctl(tty, 0); } else if (retval == -EOPNOTSUPP) { /* some drivers can tell only dynamically */ retval = 0; } tty_write_unlock(tty); if (signal_pending(current)) retval = -EINTR; return retval; } /** * tty_get_tiocm - get tiocm status register * @tty: tty device * * Obtain the modem status bits from the tty driver if the feature * is supported. */ int tty_get_tiocm(struct tty_struct *tty) { int retval = -ENOTTY; if (tty->ops->tiocmget) retval = tty->ops->tiocmget(tty); return retval; } EXPORT_SYMBOL_GPL(tty_get_tiocm); /** * tty_tiocmget - get modem status * @tty: tty device * @p: pointer to result * * Obtain the modem status bits from the tty driver if the feature is * supported. Return -%ENOTTY if it is not available. 
*
 * Locking: none (up to the driver)
 */
static int tty_tiocmget(struct tty_struct *tty, int __user *p)
{
	int status = tty_get_tiocm(tty);

	/* Negative values are errors and are passed back unchanged */
	if (status < 0)
		return status;

	return put_user(status, p);
}

/**
 * tty_tiocmset - set modem status
 * @tty: tty device
 * @cmd: command - clear bits, set bits or set all
 * @p: pointer to desired bits
 *
 * Set the modem status bits from the tty driver if the feature
 * is supported. Return -%ENOTTY if it is not available. Only the DTR, RTS,
 * OUT1, OUT2 and LOOP bits are passed on to the driver.
 *
 * Locking: none (up to the driver)
 */
static int tty_tiocmset(struct tty_struct *tty, unsigned int cmd,
			unsigned __user *p)
{
	const unsigned int settable = TIOCM_DTR | TIOCM_RTS | TIOCM_OUT1 |
				      TIOCM_OUT2 | TIOCM_LOOP;
	unsigned int set = 0, clear = 0, val;
	int retval;

	if (!tty->ops->tiocmset)
		return -ENOTTY;

	retval = get_user(val, p);
	if (retval)
		return retval;

	switch (cmd) {
	case TIOCMBIS:
		set = val;
		break;
	case TIOCMBIC:
		clear = val;
		break;
	case TIOCMSET:
		set = val;
		clear = ~val;
		break;
	}

	return tty->ops->tiocmset(tty, set & settable, clear & settable);
}

/**
 * tty_get_icount - get tty statistics
 * @tty: tty device
 * @icount: output parameter
 *
 * Gets a copy of the @tty's icount statistics.
* * Locking: none (up to the driver) */ int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { memset(icount, 0, sizeof(*icount)); if (tty->ops->get_icount) return tty->ops->get_icount(tty, icount); else return -ENOTTY; } EXPORT_SYMBOL_GPL(tty_get_icount); static int tty_tiocgicount(struct tty_struct *tty, void __user *arg) { struct serial_icounter_struct icount; int retval; retval = tty_get_icount(tty, &icount); if (retval != 0) return retval; if (copy_to_user(arg, &icount, sizeof(icount))) return -EFAULT; return 0; } static int tty_set_serial(struct tty_struct *tty, struct serial_struct *ss) { int flags; flags = ss->flags & ASYNC_DEPRECATED; if (flags) pr_warn_ratelimited("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", __func__, current->comm, flags); if (!tty->ops->set_serial) return -ENOTTY; return tty->ops->set_serial(tty, ss); } static int tty_tiocsserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; if (copy_from_user(&v, ss, sizeof(*ss))) return -EFAULT; return tty_set_serial(tty, &v); } static int tty_tiocgserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; int err; memset(&v, 0, sizeof(v)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err && copy_to_user(ss, &v, sizeof(v))) err = -EFAULT; return err; } /* * if pty, return the slave side (real_tty) * otherwise, return self */ static struct tty_struct *tty_pair_get_tty(struct tty_struct *tty) { if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) tty = tty->link; return tty; } /* * Split this up, as gcc can choke on it otherwise.. 
*/ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_struct *real_tty; void __user *p = (void __user *)arg; int retval; struct tty_ldisc *ld; if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; real_tty = tty_pair_get_tty(tty); /* * Factor out some common prep work */ switch (cmd) { case TIOCSETD: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: retval = tty_check_change(tty); if (retval) return retval; if (cmd != TIOCCBRK) { tty_wait_until_sent(tty, 0); if (signal_pending(current)) return -EINTR; } break; } /* * Now do the stuff. */ switch (cmd) { case TIOCSTI: return tiocsti(tty, p); case TIOCGWINSZ: return tiocgwinsz(real_tty, p); case TIOCSWINSZ: return tiocswinsz(real_tty, p); case TIOCCONS: return real_tty != tty ? -EINVAL : tioccons(file); case TIOCEXCL: set_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCNXCL: clear_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCGEXCL: { int excl = test_bit(TTY_EXCLUSIVE, &tty->flags); return put_user(excl, (int __user *)p); } case TIOCGETD: return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: if (!capable(CAP_SYS_ADMIN)) return -EPERM; tty_vhangup(tty); return 0; case TIOCGDEV: { unsigned int ret = new_encode_dev(tty_devnum(real_tty)); return put_user(ret, (unsigned int __user *)p); } /* * Break handling */ case TIOCSBRK: /* Turn break on, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, -1); return 0; case TIOCCBRK: /* Turn break off, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, 0); return 0; case TCSBRK: /* SVID version: non-zero arg --> no break */ /* non-zero arg means wait for all output data * to be sent (performed above) but don't send break. * This is used by the tcdrain() termios function. 
*/ if (!arg) return send_break(tty, 250); return 0; case TCSBRKP: /* support for POSIX tcsendbreak() */ return send_break(tty, arg ? arg*100 : 250); case TIOCMGET: return tty_tiocmget(tty, p); case TIOCMSET: case TIOCMBIC: case TIOCMBIS: return tty_tiocmset(tty, cmd, p); case TIOCGICOUNT: return tty_tiocgicount(tty, p); case TCFLSH: switch (arg) { case TCIFLUSH: case TCIOFLUSH: /* flush tty buffer and allow ldisc to process ioctl */ tty_buffer_flush(tty, NULL); break; } break; case TIOCSSERIAL: return tty_tiocsserial(tty, p); case TIOCGSERIAL: return tty_tiocgserial(tty, p); case TIOCGPTPEER: /* Special because the struct file is needed */ return ptm_open_peer(file, tty, (int)arg); default: retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } if (tty->ops->ioctl) { retval = tty->ops->ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_ioctl(file, cmd, arg); retval = -EINVAL; if (ld->ops->ioctl) { retval = ld->ops->ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD) retval = -ENOTTY; } tty_ldisc_deref(ld); return retval; } #ifdef CONFIG_COMPAT struct serial_struct32 { compat_int_t type; compat_int_t line; compat_uint_t port; compat_int_t irq; compat_int_t flags; compat_int_t xmit_fifo_size; compat_int_t custom_divisor; compat_int_t baud_base; unsigned short close_delay; char io_type; char reserved_char; compat_int_t hub6; unsigned short closing_wait; /* time to wait before closing */ unsigned short closing_wait2; /* no longer used... 
*/ compat_uint_t iomem_base; unsigned short iomem_reg_shift; unsigned int port_high; /* compat_ulong_t iomap_base FIXME */ compat_int_t reserved; }; static int compat_tty_tiocsserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; if (copy_from_user(&v32, ss, sizeof(*ss))) return -EFAULT; memcpy(&v, &v32, offsetof(struct serial_struct32, iomem_base)); v.iomem_base = compat_ptr(v32.iomem_base); v.iomem_reg_shift = v32.iomem_reg_shift; v.port_high = v32.port_high; v.iomap_base = 0; return tty_set_serial(tty, &v); } static int compat_tty_tiocgserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; int err; memset(&v, 0, sizeof(v)); memset(&v32, 0, sizeof(v32)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err) { memcpy(&v32, &v, offsetof(struct serial_struct32, iomem_base)); v32.iomem_base = (unsigned long)v.iomem_base >> 32 ? 
0xfffffff : ptr_to_compat(v.iomem_base); v32.iomem_reg_shift = v.iomem_reg_shift; v32.port_high = v.port_high; if (copy_to_user(ss, &v32, sizeof(v32))) err = -EFAULT; } return err; } static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; int retval = -ENOIOCTLCMD; switch (cmd) { case TIOCOUTQ: case TIOCSTI: case TIOCGWINSZ: case TIOCSWINSZ: case TIOCGEXCL: case TIOCGETD: case TIOCSETD: case TIOCGDEV: case TIOCMGET: case TIOCMSET: case TIOCMBIC: case TIOCMBIS: case TIOCGICOUNT: case TIOCGPGRP: case TIOCSPGRP: case TIOCGSID: case TIOCSERGETLSR: case TIOCGRS485: case TIOCSRS485: #ifdef TIOCGETP case TIOCGETP: case TIOCSETP: case TIOCSETN: #endif #ifdef TIOCGETC case TIOCGETC: case TIOCSETC: #endif #ifdef TIOCGLTC case TIOCGLTC: case TIOCSLTC: #endif case TCSETSF: case TCSETSW: case TCSETS: case TCGETS: #ifdef TCGETS2 case TCGETS2: case TCSETSF2: case TCSETSW2: case TCSETS2: #endif case TCGETA: case TCSETAF: case TCSETAW: case TCSETA: case TIOCGLCKTRMIOS: case TIOCSLCKTRMIOS: #ifdef TCGETX case TCGETX: case TCSETX: case TCSETXW: case TCSETXF: #endif case TIOCGSOFTCAR: case TIOCSSOFTCAR: case PPPIOCGCHAN: case PPPIOCGUNIT: return tty_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); case TIOCCONS: case TIOCEXCL: case TIOCNXCL: case TIOCVHANGUP: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: case TCFLSH: case TIOCGPTPEER: case TIOCNOTTY: case TIOCSCTTY: case TCXONC: case TIOCMIWAIT: case TIOCSERCONFIG: return tty_ioctl(file, cmd, arg); } if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; switch (cmd) { case TIOCSSERIAL: return compat_tty_tiocsserial(tty, compat_ptr(arg)); case TIOCGSERIAL: return compat_tty_tiocgserial(tty, compat_ptr(arg)); } if (tty->ops->compat_ioctl) { retval = tty->ops->compat_ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_compat_ioctl(file, 
cmd, arg); if (ld->ops->compat_ioctl) retval = ld->ops->compat_ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD && ld->ops->ioctl) retval = ld->ops->ioctl(tty, (unsigned long)compat_ptr(cmd), arg); tty_ldisc_deref(ld); return retval; } #endif static int this_tty(const void *t, struct file *file, unsigned fd) { if (likely(file->f_op->read_iter != tty_read)) return 0; return file_tty(file) != t ? 0 : fd + 1; } /* * This implements the "Secure Attention Key" --- the idea is to * prevent trojan horses by killing all processes associated with this * tty when the user hits the "Secure Attention Key". Required for * super-paranoid applications --- see the Orange Book for more details. * * This code could be nicer; ideally it should send a HUP, wait a few * seconds, then send a INT, and then a KILL signal. But you then * have to coordinate with the init process, since all processes associated * with the current tty must be dead before the new getty is allowed * to spawn. * * Now, if it would be correct ;-/ The current code has a nasty hole - * it doesn't catch files in flight. We may send the descriptor to ourselves * via AF_UNIX socket, close it and later fetch from socket. FIXME. * * Nasty bug: do_SAK is being called in interrupt context. This can * deadlock. We punt it up to process context. 
AKPM - 16Mar2001 */ void __do_SAK(struct tty_struct *tty) { struct task_struct *g, *p; struct pid *session; int i; scoped_guard(spinlock_irqsave, &tty->ctrl.lock) session = get_pid(tty->ctrl.session); tty_ldisc_flush(tty); tty_driver_flush_buffer(tty); read_lock(&tasklist_lock); /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { tty_notice(tty, "SAK: killed process %d (%s): by session\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } while_each_pid_task(session, PIDTYPE_SID, p); /* Now kill any processes that happen to have the tty open */ for_each_process_thread(g, p) { if (p->signal->tty == tty) { tty_notice(tty, "SAK: killed process %d (%s): by controlling tty\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); continue; } guard(task_lock)(p); i = iterate_fd(p->files, 0, this_tty, tty); if (i != 0) { tty_notice(tty, "SAK: killed process %d (%s): by fd#%d\n", task_pid_nr(p), p->comm, i - 1); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } } read_unlock(&tasklist_lock); put_pid(session); } static void do_SAK_work(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, SAK_work); __do_SAK(tty); } /* * The tq handling here is a little racy - tty->SAK_work may already be queued. * Fortunately we don't need to worry, because if ->SAK_work is already queued, * the values which we write to it will be identical to the values which it * already has. --akpm */ void do_SAK(struct tty_struct *tty) { if (!tty) return; schedule_work(&tty->SAK_work); } EXPORT_SYMBOL(do_SAK); /* Must put_device() after it's unused! 
*/ static struct device *tty_get_device(struct tty_struct *tty) { dev_t devt = tty_devnum(tty); return class_find_device_by_devt(&tty_class, devt); } /** * alloc_tty_struct - allocate a new tty * @driver: driver which will handle the returned tty * @idx: minor of the tty * * This subroutine allocates and initializes a tty structure. * * Locking: none - @tty in question is not exposed at this point */ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) { struct tty_struct *tty; tty = kzalloc(sizeof(*tty), GFP_KERNEL_ACCOUNT); if (!tty) return NULL; kref_init(&tty->kref); if (tty_ldisc_init(tty)) { kfree(tty); return NULL; } tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; mutex_init(&tty->legacy_mutex); mutex_init(&tty->throttle_mutex); init_rwsem(&tty->termios_rwsem); mutex_init(&tty->winsize_mutex); init_ldsem(&tty->ldisc_sem); init_waitqueue_head(&tty->write_wait); init_waitqueue_head(&tty->read_wait); INIT_WORK(&tty->hangup_work, do_tty_hangup); mutex_init(&tty->atomic_write_lock); spin_lock_init(&tty->ctrl.lock); spin_lock_init(&tty->flow.lock); spin_lock_init(&tty->files_lock); INIT_LIST_HEAD(&tty->tty_files); INIT_WORK(&tty->SAK_work, do_SAK_work); tty->driver = driver; tty->ops = driver->ops; tty->index = idx; tty_line_name(driver, idx, tty->name); tty->dev = tty_get_device(tty); return tty; } /** * tty_put_char - write one character to a tty * @tty: tty * @ch: character to write * * Write one byte to the @tty using the provided @tty->ops->put_char() method * if present. * * Note: the specific put_char operation in the driver layer may go * away soon. Don't call it directly, use this method * * Return: the number of characters successfully output. 
*/ int tty_put_char(struct tty_struct *tty, u8 ch) { if (tty->ops->put_char) return tty->ops->put_char(tty, ch); return tty->ops->write(tty, &ch, 1); } EXPORT_SYMBOL_GPL(tty_put_char); static int tty_cdev_add(struct tty_driver *driver, dev_t dev, unsigned int index, unsigned int count) { int err; /* init here, since reused cdevs cause crashes */ driver->cdevs[index] = cdev_alloc(); if (!driver->cdevs[index]) return -ENOMEM; driver->cdevs[index]->ops = &tty_fops; driver->cdevs[index]->owner = driver->owner; err = cdev_add(driver->cdevs[index], dev, count); if (err) kobject_put(&driver->cdevs[index]->kobj); return err; } /** * tty_register_device - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to NULL safely. * * This call is required to be made to register an individual tty device * if the tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If * that bit is not set, this function should not be called by a tty * driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *device) { return tty_register_device_attr(driver, index, device, NULL, NULL); } EXPORT_SYMBOL(tty_register_device); static void tty_device_create_release(struct device *dev) { dev_dbg(dev, "releasing...\n"); kfree(dev); } /** * tty_register_device_attr - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to %NULL safely. 
* @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * This call is required to be made to register an individual tty device if the * tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If that bit is * not set, this function should not be called by a tty driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { char name[64]; dev_t devt = MKDEV(driver->major, driver->minor_start) + index; struct ktermios *tp; struct device *dev; int retval; if (index >= driver->num) { pr_err("%s: Attempt to register invalid tty line number (%d)\n", driver->name, index); return ERR_PTR(-EINVAL); } if (driver->type == TTY_DRIVER_TYPE_PTY) pty_line_name(driver, index, name); else tty_line_name(driver, index, name); dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); dev->devt = devt; dev->class = &tty_class; dev->parent = device; dev->release = tty_device_create_release; dev_set_name(dev, "%s", name); dev->groups = attr_grp; dev_set_drvdata(dev, drvdata); dev_set_uevent_suppress(dev, 1); retval = device_register(dev); if (retval) goto err_put; if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { /* * Free any saved termios data so that the termios state is * reset when reusing a minor number. 
*/ tp = driver->termios[index]; if (tp) { driver->termios[index] = NULL; kfree(tp); } retval = tty_cdev_add(driver, devt, index, 1); if (retval) goto err_del; } dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); return dev; err_del: device_del(dev); err_put: put_device(dev); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(tty_register_device_attr); /** * tty_unregister_device - unregister a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * * If a tty device is registered with a call to tty_register_device() then * this function must be called when the tty device is gone. * * Locking: ?? */ void tty_unregister_device(struct tty_driver *driver, unsigned index) { device_destroy(&tty_class, MKDEV(driver->major, driver->minor_start) + index); if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { cdev_del(driver->cdevs[index]); driver->cdevs[index] = NULL; } } EXPORT_SYMBOL(tty_unregister_device); /** * __tty_alloc_driver - allocate tty driver * @lines: count of lines this driver can handle at most * @owner: module which is responsible for this driver * @flags: some of enum tty_driver_flag, will be set in driver->flags * * This should not be called directly, tty_alloc_driver() should be used * instead. * * Returns: struct tty_driver or a PTR-encoded error (use IS_ERR() and friends). 
*/ struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags) { struct tty_driver *driver; unsigned int cdevs = 1; int err; if (!lines || (flags & TTY_DRIVER_UNNUMBERED_NODE && lines > 1)) return ERR_PTR(-EINVAL); driver = kzalloc(sizeof(*driver), GFP_KERNEL); if (!driver) return ERR_PTR(-ENOMEM); kref_init(&driver->kref); driver->num = lines; driver->owner = owner; driver->flags = flags; if (!(flags & TTY_DRIVER_DEVPTS_MEM)) { driver->ttys = kcalloc(lines, sizeof(*driver->ttys), GFP_KERNEL); driver->termios = kcalloc(lines, sizeof(*driver->termios), GFP_KERNEL); if (!driver->ttys || !driver->termios) { err = -ENOMEM; goto err_free_all; } } if (!(flags & TTY_DRIVER_DYNAMIC_ALLOC)) { driver->ports = kcalloc(lines, sizeof(*driver->ports), GFP_KERNEL); if (!driver->ports) { err = -ENOMEM; goto err_free_all; } cdevs = lines; } driver->cdevs = kcalloc(cdevs, sizeof(*driver->cdevs), GFP_KERNEL); if (!driver->cdevs) { err = -ENOMEM; goto err_free_all; } return driver; err_free_all: kfree(driver->ports); kfree(driver->ttys); kfree(driver->termios); kfree(driver->cdevs); kfree(driver); return ERR_PTR(err); } EXPORT_SYMBOL(__tty_alloc_driver); static void destruct_tty_driver(struct kref *kref) { struct tty_driver *driver = container_of(kref, struct tty_driver, kref); int i; struct ktermios *tp; if (driver->flags & TTY_DRIVER_INSTALLED) { for (i = 0; i < driver->num; i++) { tp = driver->termios[i]; if (tp) { driver->termios[i] = NULL; kfree(tp); } if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) tty_unregister_device(driver, i); } proc_tty_unregister_driver(driver); if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) cdev_del(driver->cdevs[0]); } kfree(driver->cdevs); kfree(driver->ports); kfree(driver->termios); kfree(driver->ttys); kfree(driver); } /** * tty_driver_kref_put - drop a reference to a tty driver * @driver: driver of which to drop the reference * * The final put will destroy and free up the driver. 
*/ void tty_driver_kref_put(struct tty_driver *driver) { kref_put(&driver->kref, destruct_tty_driver); } EXPORT_SYMBOL(tty_driver_kref_put); /** * tty_register_driver - register a tty driver * @driver: driver to register * * Called by a tty driver to register itself. */ int tty_register_driver(struct tty_driver *driver) { int error; int i; dev_t dev; struct device *d; if (!driver->major) { error = alloc_chrdev_region(&dev, driver->minor_start, driver->num, driver->name); if (!error) { driver->major = MAJOR(dev); driver->minor_start = MINOR(dev); } } else { dev = MKDEV(driver->major, driver->minor_start); error = register_chrdev_region(dev, driver->num, driver->name); } if (error < 0) goto err; if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) { error = tty_cdev_add(driver, dev, 0, driver->num); if (error) goto err_unreg_char; } scoped_guard(mutex, &tty_mutex) list_add(&driver->tty_drivers, &tty_drivers); if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) { for (i = 0; i < driver->num; i++) { d = tty_register_device(driver, i, NULL); if (IS_ERR(d)) { error = PTR_ERR(d); goto err_unreg_devs; } } } proc_tty_register_driver(driver); driver->flags |= TTY_DRIVER_INSTALLED; return 0; err_unreg_devs: for (i--; i >= 0; i--) tty_unregister_device(driver, i); scoped_guard(mutex, &tty_mutex) list_del(&driver->tty_drivers); err_unreg_char: unregister_chrdev_region(dev, driver->num); err: return error; } EXPORT_SYMBOL(tty_register_driver); /** * tty_unregister_driver - unregister a tty driver * @driver: driver to unregister * * Called by a tty driver to unregister itself. 
*/
void tty_unregister_driver(struct tty_driver *driver)
{
	/* Give the device number range back ... */
	unregister_chrdev_region(MKDEV(driver->major, driver->minor_start),
				driver->num);
	/* ... and unlink the driver from the global list under tty_mutex. */
	scoped_guard(mutex, &tty_mutex)
		list_del(&driver->tty_drivers);
}
EXPORT_SYMBOL(tty_unregister_driver);

/* Device number of @tty: the driver's first device number plus the index. */
dev_t tty_devnum(struct tty_struct *tty)
{
	return MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index;
}
EXPORT_SYMBOL(tty_devnum);

/* Copy the tty file operations into @fops. */
void tty_default_fops(struct file_operations *fops)
{
	*fops = tty_fops;
}

/*
 * ->devnode() callback of tty_class. Never supplies a node name (always
 * returns NULL); when @mode is given, minors 0 and 2 of TTYAUX_MAJOR get
 * mode 0666. Minor 0 is registered as "tty" by tty_init(); minor 2 is
 * presumably /dev/ptmx - TODO confirm.
 */
static char *tty_devnode(const struct device *dev, umode_t *mode)
{
	if (!mode)
		return NULL;
	if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) ||
	    dev->devt == MKDEV(TTYAUX_MAJOR, 2))
		*mode = 0666;
	return NULL;
}

/* Device class that all tty devices in this file are created in. */
const struct class tty_class = {
	.name		= "tty",
	.devnode	= tty_devnode,
};

/* Register tty_class at postcore initcall time. */
static int __init tty_class_init(void)
{
	return class_register(&tty_class);
}

postcore_initcall(tty_class_init);

/* 3/2004 jmc: why do these devices exist? */
static struct cdev tty_cdev, console_cdev;

/*
 * sysfs "active" attribute of the console device: writes the names of the
 * usable consoles into @buf, separated by spaces and terminated by a
 * newline. At most ARRAY_SIZE(cs) consoles are reported; they are emitted in
 * the reverse of the order in which for_each_console() visited them.
 *
 * Returns the number of bytes written to @buf.
 */
static ssize_t show_cons_active(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct console *cs[16];
	int i = 0;
	struct console *c;
	ssize_t count = 0;

	/*
	 * Hold the console_list_lock to guarantee that no consoles are
	 * unregistered until all console processing is complete.
	 * This also allows safe traversal of the console list and
	 * race-free reading of @flags.
	 */
	console_list_lock();

	/*
	 * Collect consoles that have a ->device() callback, are enabled, and
	 * either carry CON_NBCON or provide ->write().
	 */
	for_each_console(c) {
		if (!c->device)
			continue;
		if (!(c->flags & CON_NBCON) && !c->write)
			continue;
		if ((c->flags & CON_ENABLED) == 0)
			continue;
		cs[i++] = c;
		if (i >= ARRAY_SIZE(cs))
			break;
	}

	/*
	 * Take console_lock to serialize device() callback with
	 * other console operations. For example, fg_console is
	 * modified under console_lock when switching vt.
	 */
	console_lock();
	while (i--) {
		int index = cs[i]->index;
		struct tty_driver *drv = cs[i]->device(cs[i], &index);

		/* don't resolve tty0 as some programs depend on it */
		if (drv && (cs[i]->index > 0 || drv->major != TTY_MAJOR))
			count += tty_line_name(drv, index, buf + count);
		else
			count += sprintf(buf + count, "%s%d",
					 cs[i]->name, cs[i]->index);

		/* space between entries, newline after the last one */
		count += sprintf(buf + count, "%c", i ? ' ':'\n');
	}
	console_unlock();

	console_list_unlock();

	return count;
}
static DEVICE_ATTR(active, S_IRUGO, show_cons_active, NULL);

/* Attributes attached to the "console" device created in tty_init(). */
static struct attribute *cons_dev_attrs[] = {
	&dev_attr_active.attr,
	NULL
};

ATTRIBUTE_GROUPS(cons_dev);

/* The "console" class device; NULL if it could not be created. */
static struct device *consdev;

/* Notify sysfs pollers of the "active" attribute, if the device exists. */
void console_sysfs_notify(void)
{
	if (consdev)
		sysfs_notify(&consdev->kobj, NULL, "active");
}

/* sysctl entries registered under "dev/tty" by tty_init(). */
static const struct ctl_table tty_table[] = {
	{
		.procname	= "legacy_tiocsti",
		.data		= &tty_legacy_tiocsti,
		.maxlen		= sizeof(tty_legacy_tiocsti),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
	{
		/* integer limited to the range 0..1 */
		.procname	= "ldisc_autoload",
		.data		= &tty_ldisc_autoload,
		.maxlen		= sizeof(tty_ldisc_autoload),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
};

/*
 * Ok, now we can initialize the rest of the tty devices and can count
 * on memory allocations, interrupts etc..
*/
int __init tty_init(void)
{
	const dev_t tty_devt = MKDEV(TTYAUX_MAJOR, 0);
	const dev_t console_devt = MKDEV(TTYAUX_MAJOR, 1);

	register_sysctl_init("dev/tty", tty_table);

	/* /dev/tty: failing to set it up is fatal. */
	cdev_init(&tty_cdev, &tty_fops);
	if (cdev_add(&tty_cdev, tty_devt, 1) ||
	    register_chrdev_region(tty_devt, 1, "/dev/tty") < 0)
		panic("Couldn't register /dev/tty driver\n");
	device_create(&tty_class, NULL, tty_devt, NULL, "tty");

	/* /dev/console: same treatment, plus the sysfs attribute group. */
	cdev_init(&console_cdev, &console_fops);
	if (cdev_add(&console_cdev, console_devt, 1) ||
	    register_chrdev_region(console_devt, 1, "/dev/console") < 0)
		panic("Couldn't register /dev/console driver\n");
	consdev = device_create_with_groups(&tty_class, NULL, console_devt,
					    NULL, cons_dev_groups, "console");
	/* The console device is optional; remember only a usable pointer. */
	if (IS_ERR(consdev))
		consdev = NULL;

#ifdef CONFIG_VT
	vty_init(&console_fops);
#endif
	return 0;
}
147 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* X.509 certificate parser internal definitions
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/cleanup.h>
#include <linux/time.h>
#include <crypto/public_key.h>
#include <keys/asymmetric-type.h>

/*
 * In-memory form of one parsed X.509 certificate. The raw_* members are
 * const pointer/size pairs; presumably they point into the buffer the
 * certificate was parsed from rather than owning memory - TODO confirm.
 */
struct x509_certificate {
	struct x509_certificate *next;	/* presumably link to the next cert in a list - confirm */
	struct x509_certificate *signer;	/* Certificate that signed this one */
	struct public_key *pub;			/* Public key details */
	struct public_key_signature *sig;	/* Signature parameters */
	char		*issuer;		/* Name of certificate issuer */
	char		*subject;		/* Name of certificate subject */
	struct asymmetric_key_id *id;		/* Issuer + Serial number */
	struct asymmetric_key_id *skid;		/* Subject + subjectKeyId (optional) */
	time64_t	valid_from;		/* presumably start of the validity period - confirm */
	time64_t	valid_to;		/* presumably end of the validity period - confirm */
	const void	*tbs;			/* Signed data */
	unsigned	tbs_size;		/* Size of signed data */
	unsigned	raw_sig_size;		/* Size of signature */
	const void	*raw_sig;		/* Signature data */
	const void	*raw_serial;		/* Raw serial number in ASN.1 */
	unsigned	raw_serial_size;	/* Size of raw_serial */
	unsigned	raw_issuer_size;	/* Size of raw_issuer */
	const void	*raw_issuer;		/* Raw issuer name in ASN.1 */
	const void	*raw_subject;		/* Raw subject name in ASN.1 */
	unsigned	raw_subject_size;	/* Size of raw_subject */
	unsigned	raw_skid_size;		/* Size of raw_skid */
	const void	*raw_skid;		/* Raw subjectKeyId in ASN.1 */
	unsigned	index;
	bool		seen;			/* Infinite recursion prevention */
	bool		verified;
	bool		self_signed;		/* T if self-signed (check unsupported_sig too) */
	bool		unsupported_sig;	/* T if signature uses unsupported crypto */
	bool		blacklisted;
};

/*
 * x509_cert_parser.c
 */
extern void x509_free_certificate(struct x509_certificate *cert);
/*
 * Scope-based cleanup helper: frees the certificate when the variable goes
 * out of scope, except when it holds an ERR_PTR() value.
 */
DEFINE_FREE(x509_free_certificate, struct x509_certificate *,
	    if (!IS_ERR(_T)) x509_free_certificate(_T))
extern struct x509_certificate *x509_cert_parse(const void *data, size_t datalen);
extern int x509_decode_time(time64_t *_t,  size_t hdrlen,
			    unsigned char tag,
			    const unsigned char *value, size_t vlen);

/*
 * x509_public_key.c
 */
extern int x509_get_sig_params(struct x509_certificate *cert);
extern int x509_check_for_self_signed(struct x509_certificate *cert);
7 501 3 3 495 3 3 22 431 4 4 4 4 4 4 882 859 29 114 3 110 1 860 367 558 368 52 48 48 50 52 108 42 42 159 159 491 3 478 22 159 412 2 411 22 22 15 22 7 15 172 4 10 158 1 144 22 62 98 168 289 65 288 288 142 29 2 27 1 1 28 16 246 73 175 212 38 198 49 246 104 142 1 245 345 346 269 77 107 108 48 346 345 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/errno.h> #include <linux/kmod.h> #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/device.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/seq_file.h> #include 
<linux/uaccess.h> #include <linux/ratelimit.h> #include "tty.h" #undef LDISC_DEBUG_HANGUP #ifdef LDISC_DEBUG_HANGUP #define tty_ldisc_debug(tty, f, args...) tty_debug(tty, f, ##args) #else #define tty_ldisc_debug(tty, f, args...) #endif /* lockdep nested classes for tty->ldisc_sem */ enum { LDISC_SEM_NORMAL, LDISC_SEM_OTHER, }; /* * This guards the refcounted line discipline lists. The lock * must be taken with irqs off because there are hangup path * callers who will do ldisc lookups and cannot sleep. */ static DEFINE_RAW_SPINLOCK(tty_ldiscs_lock); /* Line disc dispatch table */ static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; /** * tty_register_ldisc - install a line discipline * @new_ldisc: pointer to the ldisc object * * Installs a new line discipline into the kernel. The discipline is set up as * unreferenced and then made available to the kernel from this point onwards. * * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc) { unsigned long flags; if (new_ldisc->num < N_TTY || new_ldisc->num >= NR_LDISCS) return -EINVAL; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); tty_ldiscs[new_ldisc->num] = new_ldisc; raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags); return 0; } EXPORT_SYMBOL(tty_register_ldisc); /** * tty_unregister_ldisc - unload a line discipline * @ldisc: ldisc number * * Remove a line discipline from the kernel providing it is not currently in * use. 
* * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc) { unsigned long flags; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); tty_ldiscs[ldisc->num] = NULL; raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags); } EXPORT_SYMBOL(tty_unregister_ldisc); static struct tty_ldisc_ops *get_ldops(int disc) { unsigned long flags; struct tty_ldisc_ops *ldops, *ret; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); ret = ERR_PTR(-EINVAL); ldops = tty_ldiscs[disc]; if (ldops) { ret = ERR_PTR(-EAGAIN); if (try_module_get(ldops->owner)) ret = ldops; } raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags); return ret; } static void put_ldops(struct tty_ldisc_ops *ldops) { unsigned long flags; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); module_put(ldops->owner); raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags); } int tty_ldisc_autoload = IS_BUILTIN(CONFIG_LDISC_AUTOLOAD); /** * tty_ldisc_get - take a reference to an ldisc * @tty: tty device * @disc: ldisc number * * Takes a reference to a line discipline. Deals with refcounts and module * locking counts. If the discipline is not available, its module loaded, if * possible. * * Returns: * * -%EINVAL if the discipline index is not [%N_TTY .. %NR_LDISCS] or if the * discipline is not registered * * -%EAGAIN if request_module() failed to load or register the discipline * * -%ENOMEM if allocation failure * * Otherwise, returns a pointer to the discipline and bumps the ref count * * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ static struct tty_ldisc *tty_ldisc_get(struct tty_struct *tty, int disc) { struct tty_ldisc *ld; struct tty_ldisc_ops *ldops; if (disc < N_TTY || disc >= NR_LDISCS) return ERR_PTR(-EINVAL); /* * Get the ldisc ops - we may need to request them to be loaded * dynamically and try again. 
*/ ldops = get_ldops(disc); if (IS_ERR(ldops)) { if (!capable(CAP_SYS_MODULE) && !tty_ldisc_autoload) return ERR_PTR(-EPERM); request_module("tty-ldisc-%d", disc); ldops = get_ldops(disc); if (IS_ERR(ldops)) return ERR_CAST(ldops); } /* * There is no way to handle allocation failure of only 16 bytes. * Let's simplify error handling and save more memory. */ ld = kmalloc(sizeof(struct tty_ldisc), GFP_KERNEL | __GFP_NOFAIL); ld->ops = ldops; ld->tty = tty; return ld; } /** * tty_ldisc_put - release the ldisc * @ld: lisdsc to release * * Complement of tty_ldisc_get(). */ static void tty_ldisc_put(struct tty_ldisc *ld) { if (WARN_ON_ONCE(!ld)) return; put_ldops(ld->ops); kfree(ld); } static void *tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) { return (*pos < NR_LDISCS) ? pos : NULL; } static void *tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return (*pos < NR_LDISCS) ? pos : NULL; } static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) { } static int tty_ldiscs_seq_show(struct seq_file *m, void *v) { int i = *(loff_t *)v; struct tty_ldisc_ops *ldops; ldops = get_ldops(i); if (IS_ERR(ldops)) return 0; seq_printf(m, "%-10s %2d\n", ldops->name ? ldops->name : "???", i); put_ldops(ldops); return 0; } const struct seq_operations tty_ldiscs_seq_ops = { .start = tty_ldiscs_seq_start, .next = tty_ldiscs_seq_next, .stop = tty_ldiscs_seq_stop, .show = tty_ldiscs_seq_show, }; /** * tty_ldisc_ref_wait - wait for the tty ldisc * @tty: tty device * * Dereference the line discipline for the terminal and take a reference to it. * If the line discipline is in flux then wait patiently until it changes. * * Returns: %NULL if the tty has been hungup and not re-opened with a new file * descriptor, otherwise valid ldisc reference * * Note 1: Must not be called from an IRQ/timer context. 
The caller must also * be careful not to hold other locks that will deadlock against a discipline * change, such as an existing ldisc reference (which we check for). * * Note 2: a file_operations routine (read/poll/write) should use this function * to wait for any ldisc lifetime events to finish. */ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) { struct tty_ldisc *ld; ldsem_down_read(&tty->ldisc_sem, MAX_SCHEDULE_TIMEOUT); ld = tty->ldisc; if (!ld) ldsem_up_read(&tty->ldisc_sem); return ld; } EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); /** * tty_ldisc_ref - get the tty ldisc * @tty: tty device * * Dereference the line discipline for the terminal and take a reference to it. * If the line discipline is in flux then return %NULL. Can be called from IRQ * and timer functions. */ struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty) { struct tty_ldisc *ld = NULL; if (ldsem_down_read_trylock(&tty->ldisc_sem)) { ld = tty->ldisc; if (!ld) ldsem_up_read(&tty->ldisc_sem); } return ld; } EXPORT_SYMBOL_GPL(tty_ldisc_ref); /** * tty_ldisc_deref - free a tty ldisc reference * @ld: reference to free up * * Undoes the effect of tty_ldisc_ref() or tty_ldisc_ref_wait(). May be called * in IRQ context. 
*/
void tty_ldisc_deref(struct tty_ldisc *ld)
{
	/* The "reference" is the read side of the owning tty's ldisc_sem. */
	ldsem_up_read(&ld->tty->ldisc_sem);
}
EXPORT_SYMBOL_GPL(tty_ldisc_deref);

/*
 * Take the write side of @tty->ldisc_sem, passing @timeout through to
 * ldsem_down_write(). Callers in this file treat a zero return as failure.
 */
static inline int
__tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout)
{
	return ldsem_down_write(&tty->ldisc_sem, timeout);
}

/*
 * Same as __tty_ldisc_lock() but annotated with the LDISC_SEM_OTHER lockdep
 * subclass, for taking a second tty's ldisc_sem while one is already held.
 */
static inline int
__tty_ldisc_lock_nested(struct tty_struct *tty, unsigned long timeout)
{
	return ldsem_down_write_nested(&tty->ldisc_sem,
				       LDISC_SEM_OTHER, timeout);
}

/* Release the write side of @tty->ldisc_sem. */
static inline void __tty_ldisc_unlock(struct tty_struct *tty)
{
	ldsem_up_write(&tty->ldisc_sem);
}

/*
 * Write-lock @tty's line discipline for a change.
 *
 * Sets TTY_LDISC_CHANGING and wakes sleepers on the read and write wait
 * queues before trying to take the write side of the ldisc semaphore.
 *
 * Returns 0 with the lock held and TTY_LDISC_HALTED set, or -EBUSY if the
 * lock could not be taken. TTY_LDISC_CHANGING is not cleared on the -EBUSY
 * path here.
 */
int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout)
{
	int ret;

	/* Kindly asking blocked readers to release the read side */
	set_bit(TTY_LDISC_CHANGING, &tty->flags);
	wake_up_interruptible_all(&tty->read_wait);
	wake_up_interruptible_all(&tty->write_wait);

	ret = __tty_ldisc_lock(tty, timeout);
	if (!ret)
		return -EBUSY;
	set_bit(TTY_LDISC_HALTED, &tty->flags);
	return 0;
}

/*
 * Undo tty_ldisc_lock(): clear TTY_LDISC_HALTED and TTY_LDISC_CHANGING, then
 * release the write side of the ldisc semaphore.
 */
void tty_ldisc_unlock(struct tty_struct *tty)
{
	clear_bit(TTY_LDISC_HALTED, &tty->flags);
	/* Can be cleared here - ldisc_unlock will wake up writers firstly */
	clear_bit(TTY_LDISC_CHANGING, &tty->flags);
	__tty_ldisc_unlock(tty);
}

/*
 * Write-lock the ldisc semaphores of @tty and, if non-NULL, @tty2.
 *
 * The tty with the lower address is always locked first and the other one is
 * taken with the nested lockdep class, so two callers locking the same pair
 * use the same order. If the second lock cannot be taken, the first one is
 * released again.
 *
 * Returns 0 with TTY_LDISC_HALTED set on every locked tty, or -EBUSY if a
 * lock could not be taken.
 */
static int
tty_ldisc_lock_pair_timeout(struct tty_struct *tty, struct tty_struct *tty2,
			    unsigned long timeout)
{
	int ret;

	if (tty < tty2) {
		ret = __tty_ldisc_lock(tty, timeout);
		if (ret) {
			ret = __tty_ldisc_lock_nested(tty2, timeout);
			if (!ret)
				__tty_ldisc_unlock(tty);
		}
	} else {
		/* if this is possible, it has lots of implications */
		WARN_ON_ONCE(tty == tty2);
		if (tty2 && tty != tty2) {
			ret = __tty_ldisc_lock(tty2, timeout);
			if (ret) {
				ret = __tty_ldisc_lock_nested(tty, timeout);
				if (!ret)
					__tty_ldisc_unlock(tty2);
			}
		} else
			/* no distinct peer: only @tty needs locking */
			ret = __tty_ldisc_lock(tty, timeout);
	}

	if (!ret)
		return -EBUSY;

	set_bit(TTY_LDISC_HALTED, &tty->flags);
	if (tty2)
		set_bit(TTY_LDISC_HALTED, &tty2->flags);
	return 0;
}

/* Lock the pair with MAX_SCHEDULE_TIMEOUT; the result is not checked. */
static void tty_ldisc_lock_pair(struct tty_struct *tty, struct tty_struct *tty2)
{
	tty_ldisc_lock_pair_timeout(tty, tty2, MAX_SCHEDULE_TIMEOUT);
}

/*
 * Release the write locks taken by tty_ldisc_lock_pair(). The
 * TTY_LDISC_HALTED bits set while locking are not cleared here.
 */
static void tty_ldisc_unlock_pair(struct tty_struct *tty,
				  struct tty_struct *tty2)
{
	__tty_ldisc_unlock(tty);
	if (tty2)
		__tty_ldisc_unlock(tty2);
}

/**
 * tty_ldisc_flush - flush line discipline queue
 * @tty: tty to flush ldisc for
 *
 * Flush the line discipline queue (if any) and the tty flip buffers for this
 * @tty.
 */
void tty_ldisc_flush(struct tty_struct *tty)
{
	/* May be NULL; tty_buffer_flush() is still called in that case. */
	struct tty_ldisc *ld = tty_ldisc_ref(tty);

	tty_buffer_flush(tty, ld);
	if (ld)
		tty_ldisc_deref(ld);
}
EXPORT_SYMBOL_GPL(tty_ldisc_flush);

/**
 * tty_set_termios_ldisc - set ldisc field
 * @tty: tty structure
 * @disc: line discipline number
 *
 * This is probably overkill for real world processors but they are not on hot
 * paths so a little discipline won't do any harm.
 *
 * The line discipline-related tty_struct fields are reset to prevent the ldisc
 * driver from re-using stale information for the new ldisc instance.
 *
 * Locking: takes termios_rwsem
 */
static void tty_set_termios_ldisc(struct tty_struct *tty, int disc)
{
	down_write(&tty->termios_rwsem);
	tty->termios.c_line = disc;
	up_write(&tty->termios_rwsem);

	/* These two fields are reset outside termios_rwsem. */
	tty->disc_data = NULL;
	tty->receive_room = 0;
}

/**
 * tty_ldisc_open - open a line discipline
 * @tty: tty we are opening the ldisc on
 * @ld: discipline to open
 *
 * A helper opening method. Also a convenient debugging and check point.
 *
 * Sets TTY_LDISC_OPEN, warning if it was already set, and clears it again if
 * the discipline's ->open() fails.
 *
 * Returns: the result of ->open(), or 0 if the discipline has no ->open().
 *
 * Locking: always called with BTM already held.
 */
static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
{
	WARN_ON(test_and_set_bit(TTY_LDISC_OPEN, &tty->flags));
	if (ld->ops->open) {
		int ret;
		/* BTM here locks versus a hangup event */
		ret = ld->ops->open(tty);
		if (ret)
			clear_bit(TTY_LDISC_OPEN, &tty->flags);

		/* Note: emitted whether or not ->open() succeeded. */
		tty_ldisc_debug(tty, "%p: opened\n", ld);
		return ret;
	}
	return 0;
}

/**
 * tty_ldisc_close - close a line discipline
 * @tty: tty we are closing the ldisc on
 * @ld: discipline to close
 *
 * A helper close method. Also a convenient debugging and check point.
*/ static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld) { lockdep_assert_held_write(&tty->ldisc_sem); WARN_ON(!test_bit(TTY_LDISC_OPEN, &tty->flags)); clear_bit(TTY_LDISC_OPEN, &tty->flags); if (ld->ops->close) ld->ops->close(tty); tty_ldisc_debug(tty, "%p: closed\n", ld); } /** * tty_ldisc_failto - helper for ldisc failback * @tty: tty to open the ldisc on * @ld: ldisc we are trying to fail back to * * Helper to try and recover a tty when switching back to the old ldisc fails * and we need something attached. */ static int tty_ldisc_failto(struct tty_struct *tty, int ld) { struct tty_ldisc *disc = tty_ldisc_get(tty, ld); int r; lockdep_assert_held_write(&tty->ldisc_sem); if (IS_ERR(disc)) return PTR_ERR(disc); tty->ldisc = disc; tty_set_termios_ldisc(tty, ld); r = tty_ldisc_open(tty, disc); if (r < 0) tty_ldisc_put(disc); return r; } /** * tty_ldisc_restore - helper for tty ldisc change * @tty: tty to recover * @old: previous ldisc * * Restore the previous line discipline or %N_TTY when a line discipline change * fails due to an open error */ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) { /* There is an outstanding reference here so this is safe */ if (tty_ldisc_failto(tty, old->ops->num) < 0) { const char *name = tty_name(tty); pr_warn("Falling back ldisc for %s.\n", name); /* * The traditional behaviour is to fall back to N_TTY, we * want to avoid falling back to N_NULL unless we have no * choice to avoid the risk of breaking anything */ if (tty_ldisc_failto(tty, N_TTY) < 0 && tty_ldisc_failto(tty, N_NULL) < 0) panic("Couldn't open N_NULL ldisc for %s.", name); } } /** * tty_set_ldisc - set line discipline * @tty: the terminal to set * @disc: the line discipline number * * Set the discipline of a tty line. Must be called from a process context. 
The
 * ldisc change logic has to protect itself against any overlapping ldisc
 * change (including on the other end of pty pairs), the close of one side of a
 * tty/pty pair, and eventually hangup.
 *
 * Returns: 0 on success or when @disc is already the current discipline,
 * -%EIO if the tty has no ldisc or has been hung up, the error from
 * tty_ldisc_lock() if the ldisc lock cannot be taken within 5 seconds, or an
 * error from tty_ldisc_get(), the driver's ->ldisc_ok() or the open of the
 * new discipline.
 */
int tty_set_ldisc(struct tty_struct *tty, int disc)
{
	int retval;
	struct tty_ldisc *old_ldisc, *new_ldisc;

	/* Take the reference on the new discipline before any lock is held. */
	new_ldisc = tty_ldisc_get(tty, disc);
	if (IS_ERR(new_ldisc))
		return PTR_ERR(new_ldisc);

	tty_lock(tty);
	retval = tty_ldisc_lock(tty, 5 * HZ);
	if (retval)
		goto err;

	if (!tty->ldisc) {
		retval = -EIO;
		goto out;
	}

	/* Check the no-op case */
	if (tty->ldisc->ops->num == disc)
		goto out;

	if (test_bit(TTY_HUPPED, &tty->flags)) {
		/* We were raced by hangup */
		retval = -EIO;
		goto out;
	}

	/* Let the driver veto the change if it implements ->ldisc_ok(). */
	if (tty->ops->ldisc_ok) {
		retval = tty->ops->ldisc_ok(tty, disc);
		if (retval)
			goto out;
	}

	old_ldisc = tty->ldisc;

	/* Shutdown the old discipline. */
	tty_ldisc_close(tty, old_ldisc);

	/* Now set up the new line discipline. */
	tty->ldisc = new_ldisc;
	tty_set_termios_ldisc(tty, disc);

	retval = tty_ldisc_open(tty, new_ldisc);
	if (retval < 0) {
		/* Back to the old one or N_TTY if we can't */
		tty_ldisc_put(new_ldisc);
		tty_ldisc_restore(tty, old_ldisc);
	}

	/* Only notify the driver if the discipline number really changed. */
	if (tty->ldisc->ops->num != old_ldisc->ops->num && tty->ops->set_ldisc) {
		down_read(&tty->termios_rwsem);
		tty->ops->set_ldisc(tty);
		up_read(&tty->termios_rwsem);
	}

	/*
	 * At this point we hold a reference to the new ldisc and a
	 * reference to the old ldisc, or we hold two references to
	 * the old ldisc (if it was restored as part of error cleanup
	 * above). In either case, releasing a single reference from
	 * the old ldisc is correct.
	 */
	new_ldisc = old_ldisc;
out:
	tty_ldisc_unlock(tty);

	/*
	 * Restart the work queue in case no characters kick it off. Safe if
	 * already running
	 */
	tty_buffer_restart_work(tty->port);
err:
	tty_ldisc_put(new_ldisc);	/* drop the extra reference */
	tty_unlock(tty);
	return retval;
}
EXPORT_SYMBOL_GPL(tty_set_ldisc);

/**
 * tty_ldisc_kill - teardown ldisc
 * @tty: tty being released
 *
 * Perform final close of the ldisc and reset @tty->ldisc. Does nothing if no
 * discipline is attached. The caller must hold @tty->ldisc_sem for writing.
 */
static void tty_ldisc_kill(struct tty_struct *tty)
{
	lockdep_assert_held_write(&tty->ldisc_sem);
	if (!tty->ldisc)
		return;
	/*
	 * Now kill off the ldisc
	 */
	tty_ldisc_close(tty, tty->ldisc);
	tty_ldisc_put(tty->ldisc);
	/* Force an oops if we mess this up */
	tty->ldisc = NULL;
}

/**
 * tty_reset_termios - reset terminal state
 * @tty: tty to reset
 *
 * Restore a terminal to the driver default state: copy the driver's
 * init_termios and recompute the input and output speeds from it.
 *
 * Locking: takes termios_rwsem for writing.
 */
static void tty_reset_termios(struct tty_struct *tty)
{
	down_write(&tty->termios_rwsem);
	tty->termios = tty->driver->init_termios;
	tty->termios.c_ispeed = tty_termios_input_baud_rate(&tty->termios);
	tty->termios.c_ospeed = tty_termios_baud_rate(&tty->termios);
	up_write(&tty->termios_rwsem);
}

/**
 * tty_ldisc_reinit - reinitialise the tty ldisc
 * @tty: tty to reinit
 * @disc: line discipline to reinitialize
 *
 * Completely reinitialize the line discipline state, by closing the current
 * instance, if there is one, and opening a new instance. If an error occurs
 * opening the new non-%N_TTY instance, the instance is dropped and @tty->ldisc
 * reset to %NULL. The caller can then retry with %N_TTY instead.
* * Returns: 0 if successful, otherwise error code < 0 */ int tty_ldisc_reinit(struct tty_struct *tty, int disc) { struct tty_ldisc *ld; int retval; lockdep_assert_held_write(&tty->ldisc_sem); ld = tty_ldisc_get(tty, disc); if (IS_ERR(ld)) { BUG_ON(disc == N_TTY); return PTR_ERR(ld); } if (tty->ldisc) { tty_ldisc_close(tty, tty->ldisc); tty_ldisc_put(tty->ldisc); } /* switch the line discipline */ tty->ldisc = ld; tty_set_termios_ldisc(tty, disc); retval = tty_ldisc_open(tty, tty->ldisc); if (retval) { tty_ldisc_put(tty->ldisc); tty->ldisc = NULL; } return retval; } /** * tty_ldisc_hangup - hangup ldisc reset * @tty: tty being hung up * @reinit: whether to re-initialise the tty * * Some tty devices reset their termios when they receive a hangup event. In * that situation we must also switch back to %N_TTY properly before we reset * the termios data. * * Locking: We can take the ldisc mutex as the rest of the code is careful to * allow for this. * * In the pty pair case this occurs in the close() path of the tty itself so we * must be careful about locking rules. */ void tty_ldisc_hangup(struct tty_struct *tty, bool reinit) { struct tty_ldisc *ld; tty_ldisc_debug(tty, "%p: hangup\n", tty->ldisc); ld = tty_ldisc_ref(tty); if (ld != NULL) { if (ld->ops->flush_buffer) ld->ops->flush_buffer(tty); tty_driver_flush_buffer(tty); if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && ld->ops->write_wakeup) ld->ops->write_wakeup(tty); if (ld->ops->hangup) ld->ops->hangup(tty); tty_ldisc_deref(ld); } wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); wake_up_interruptible_poll(&tty->read_wait, EPOLLIN); /* * Shutdown the current line discipline, and reset it to * N_TTY if need be. 
* * Avoid racing set_ldisc or tty_ldisc_release */ tty_ldisc_lock(tty, MAX_SCHEDULE_TIMEOUT); if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) tty_reset_termios(tty); if (tty->ldisc) { if (reinit) { if (tty_ldisc_reinit(tty, tty->termios.c_line) < 0 && tty_ldisc_reinit(tty, N_TTY) < 0) WARN_ON(tty_ldisc_reinit(tty, N_NULL) < 0); } else tty_ldisc_kill(tty); } tty_ldisc_unlock(tty); } /** * tty_ldisc_setup - open line discipline * @tty: tty being shut down * @o_tty: pair tty for pty/tty pairs * * Called during the initial open of a tty/pty pair in order to set up the line * disciplines and bind them to the @tty. This has no locking issues as the * device isn't yet active. */ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty) { int retval = tty_ldisc_open(tty, tty->ldisc); if (retval) return retval; if (o_tty) { /* * Called without o_tty->ldisc_sem held, as o_tty has been * just allocated and no one has a reference to it. */ retval = tty_ldisc_open(o_tty, o_tty->ldisc); if (retval) { tty_ldisc_close(tty, tty->ldisc); return retval; } } return 0; } /** * tty_ldisc_release - release line discipline * @tty: tty being shut down (or one end of pty pair) * * Called during the final close of a tty or a pty pair in order to shut down * the line discpline layer. On exit, each tty's ldisc is %NULL. */ void tty_ldisc_release(struct tty_struct *tty) { struct tty_struct *o_tty = tty->link; /* * Shutdown this line discipline. As this is the final close, * it does not race with the set_ldisc code path. */ tty_ldisc_lock_pair(tty, o_tty); tty_ldisc_kill(tty); if (o_tty) tty_ldisc_kill(o_tty); tty_ldisc_unlock_pair(tty, o_tty); /* * And the memory resources remaining (buffers, termios) will be * disposed of when the kref hits zero */ tty_ldisc_debug(tty, "released\n"); } /** * tty_ldisc_init - ldisc setup for new tty * @tty: tty being allocated * * Set up the line discipline objects for a newly allocated tty. 
Note that the * tty structure is not completely set up when this call is made. */ int tty_ldisc_init(struct tty_struct *tty) { struct tty_ldisc *ld = tty_ldisc_get(tty, N_TTY); if (IS_ERR(ld)) return PTR_ERR(ld); tty->ldisc = ld; return 0; } /** * tty_ldisc_deinit - ldisc cleanup for new tty * @tty: tty that was allocated recently * * The tty structure must not be completely set up (tty_ldisc_setup()) when * this call is made. */ void tty_ldisc_deinit(struct tty_struct *tty) { /* no ldisc_sem, tty is being destroyed */ if (tty->ldisc) tty_ldisc_put(tty->ldisc); tty->ldisc = NULL; }
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 10 1 9 9 9 9 9 10 10 10 1 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 
1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 
1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 
1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. * Copyright (C) 2016-2023 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/pagemap.h> #include <linux/uio.h> #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/writeback.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> #include "internal.h" #include "trace.h" #include "../internal.h" /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. */ struct iomap_folio_state { spinlock_t state_lock; unsigned int read_bytes_pending; atomic_t write_bytes_pending; /* * Each block has two bits in this bitmap: * Bits [0..blocks_per_folio) has the uptodate status. * Bits [b_p_f...(2*b_p_f)) has the dirty status. 
*/ unsigned long state[]; }; static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) { struct inode *inode = folio->mapping->host; return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); } static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, unsigned int block) { return test_bit(block, ifs->state); } static bool ifs_set_range_uptodate(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { struct inode *inode = folio->mapping->host; unsigned int first_blk = off >> inode->i_blkbits; unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk + 1; bitmap_set(ifs->state, first_blk, nr_blks); return ifs_is_fully_uptodate(folio, ifs); } static void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len) { struct iomap_folio_state *ifs = folio->private; unsigned long flags; bool uptodate = true; if (ifs) { spin_lock_irqsave(&ifs->state_lock, flags); uptodate = ifs_set_range_uptodate(folio, ifs, off, len); spin_unlock_irqrestore(&ifs->state_lock, flags); } if (uptodate) folio_mark_uptodate(folio); } static inline bool ifs_block_is_dirty(struct folio *folio, struct iomap_folio_state *ifs, int block) { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); return test_bit(block + blks_per_folio, ifs->state); } static unsigned ifs_find_dirty_range(struct folio *folio, struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) { struct inode *inode = folio->mapping->host; unsigned start_blk = offset_in_folio(folio, *range_start) >> inode->i_blkbits; unsigned end_blk = min_not_zero( offset_in_folio(folio, range_end) >> inode->i_blkbits, i_blocks_per_folio(inode, folio)); unsigned nblks = 1; while (!ifs_block_is_dirty(folio, ifs, start_blk)) if (++start_blk == end_blk) return 0; while (start_blk + nblks < end_blk) { if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) 
break; nblks++; } *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); return nblks << inode->i_blkbits; } static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, u64 range_end) { struct iomap_folio_state *ifs = folio->private; if (*range_start >= range_end) return 0; if (ifs) return ifs_find_dirty_range(folio, ifs, range_start, range_end); return range_end - *range_start; } static void ifs_clear_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); unsigned int first_blk = (off >> inode->i_blkbits); unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk + 1; unsigned long flags; spin_lock_irqsave(&ifs->state_lock, flags); bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks); spin_unlock_irqrestore(&ifs->state_lock, flags); } static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) { struct iomap_folio_state *ifs = folio->private; if (ifs) ifs_clear_range_dirty(folio, ifs, off, len); } static void ifs_set_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { struct inode *inode = folio->mapping->host; unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); unsigned int first_blk = (off >> inode->i_blkbits); unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk + 1; unsigned long flags; spin_lock_irqsave(&ifs->state_lock, flags); bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks); spin_unlock_irqrestore(&ifs->state_lock, flags); } static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len) { struct iomap_folio_state *ifs = folio->private; if (ifs) ifs_set_range_dirty(folio, ifs, off, len); } static struct iomap_folio_state *ifs_alloc(struct inode *inode, struct folio *folio, unsigned int 
flags) { struct iomap_folio_state *ifs = folio->private; unsigned int nr_blocks = i_blocks_per_folio(inode, folio); gfp_t gfp; if (ifs || nr_blocks <= 1) return ifs; if (flags & IOMAP_NOWAIT) gfp = GFP_NOWAIT; else gfp = GFP_NOFS | __GFP_NOFAIL; /* * ifs->state tracks two sets of state flags when the * filesystem block size is smaller than the folio size. * The first state tracks per-block uptodate and the * second tracks per-block dirty state. */ ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(2 * nr_blocks)), gfp); if (!ifs) return ifs; spin_lock_init(&ifs->state_lock); if (folio_test_uptodate(folio)) bitmap_set(ifs->state, 0, nr_blocks); if (folio_test_dirty(folio)) bitmap_set(ifs->state, nr_blocks, nr_blocks); folio_attach_private(folio, ifs); return ifs; } static void ifs_free(struct folio *folio) { struct iomap_folio_state *ifs = folio_detach_private(folio); if (!ifs) return; WARN_ON_ONCE(ifs->read_bytes_pending != 0); WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending)); WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) != folio_test_uptodate(folio)); kfree(ifs); } /* * Calculate the range inside the folio that we actually need to read. */ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, loff_t *pos, loff_t length, size_t *offp, size_t *lenp) { struct iomap_folio_state *ifs = folio->private; loff_t orig_pos = *pos; loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; unsigned block_size = (1 << block_bits); size_t poff = offset_in_folio(folio, *pos); size_t plen = min_t(loff_t, folio_size(folio) - poff, length); size_t orig_plen = plen; unsigned first = poff >> block_bits; unsigned last = (poff + plen - 1) >> block_bits; /* * If the block size is smaller than the page size, we need to check the * per-block uptodate status and adjust the offset and length if needed * to avoid reading in already uptodate ranges. 
*/ if (ifs) { unsigned int i; /* move forward for each leading block marked uptodate */ for (i = first; i <= last; i++) { if (!ifs_block_is_uptodate(ifs, i)) break; *pos += block_size; poff += block_size; plen -= block_size; first++; } /* truncate len if we find any trailing uptodate block(s) */ while (++i <= last) { if (ifs_block_is_uptodate(ifs, i)) { plen -= (last - i + 1) * block_size; last = i - 1; break; } } } /* * If the extent spans the block that contains the i_size, we need to * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. */ if (orig_pos <= isize && orig_pos + orig_plen > isize) { unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) plen -= (last - end) * block_size; } *offp = poff; *lenp = plen; } static void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error) { struct iomap_folio_state *ifs = folio->private; bool uptodate = !error; bool finished = true; if (ifs) { unsigned long flags; spin_lock_irqsave(&ifs->state_lock, flags); if (!error) uptodate = ifs_set_range_uptodate(folio, ifs, off, len); ifs->read_bytes_pending -= len; finished = !ifs->read_bytes_pending; spin_unlock_irqrestore(&ifs->state_lock, flags); } if (finished) folio_end_read(folio, uptodate); } static void iomap_read_end_io(struct bio *bio) { int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; bio_for_each_folio_all(fi, bio) iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); bio_put(bio); } struct iomap_readpage_ctx { struct folio *cur_folio; bool cur_folio_in_bio; struct bio *bio; struct readahead_control *rac; }; /** * iomap_read_inline_data - copy inline data into the page cache * @iter: iteration structure * @folio: folio to copy to * * Copy the inline data in @iter into @folio and zero out the rest of the folio. * Only a single IOMAP_INLINE extent is allowed at the end of each file. 
* Returns zero for success to complete the read, or the usual negative errno.
 */
static int iomap_read_inline_data(const struct iomap_iter *iter,
		struct folio *folio)
{
	const struct iomap *iomap = iomap_iter_srcmap(iter);
	/* Number of inline bytes between the extent start and i_size. */
	size_t size = i_size_read(iter->inode) - iomap->offset;
	size_t offset = offset_in_folio(folio, iomap->offset);

	/* Nothing to do if the folio already holds valid data. */
	if (folio_test_uptodate(folio))
		return 0;

	if (WARN_ON_ONCE(size > iomap->length))
		return -EIO;
	/*
	 * The inline data does not start at the beginning of the folio, so
	 * only the tail becomes uptodate here; request per-block state.
	 */
	if (offset > 0)
		ifs_alloc(iter->inode, folio, iter->flags);

	folio_fill_tail(folio, offset, iomap->inline_data, size);
	iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
	return 0;
}

/*
 * Return true when the block at @pos must be zero-filled instead of read:
 * the source mapping is not IOMAP_MAPPED, it is flagged IOMAP_F_NEW, or @pos
 * lies at or beyond i_size.
 */
static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
		loff_t pos)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);

	return srcmap->type != IOMAP_MAPPED ||
		(srcmap->flags & IOMAP_F_NEW) ||
		pos >= i_size_read(iter->inode);
}

/*
 * Process one step of the current mapping for ctx->cur_folio.
 *
 * Inline extents are copied directly.  Otherwise the range that still needs
 * data is computed; it is either zeroed and marked uptodate, or added to the
 * bio in @ctx (submitting the previous bio and allocating a new one when the
 * range cannot be appended).  The iterator is then advanced past the range
 * that was handled.
 *
 * Returns the result of iomap_iter_advance(), or the error from reading
 * inline data.
 */
static int iomap_readpage_iter(struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{
	const struct iomap *iomap = &iter->iomap;
	loff_t pos = iter->pos;
	loff_t length = iomap_length(iter);
	struct folio *folio = ctx->cur_folio;
	struct iomap_folio_state *ifs;
	size_t poff, plen;
	sector_t sector;
	int ret;

	if (iomap->type == IOMAP_INLINE) {
		ret = iomap_read_inline_data(iter, folio);
		if (ret)
			return ret;
		return iomap_iter_advance(iter, &length);
	}

	/* zero post-eof blocks as the page may be mapped */
	ifs = ifs_alloc(iter->inode, folio, iter->flags);
	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
	if (plen == 0)
		goto done;

	if (iomap_block_needs_zeroing(iter, pos)) {
		folio_zero_range(folio, poff, plen);
		iomap_set_range_uptodate(folio, poff, plen);
		goto done;
	}

	/*
	 * From here on the folio has I/O queued against it; the callers use
	 * this flag to decide whether they still have to unlock the folio.
	 */
	ctx->cur_folio_in_bio = true;
	if (ifs) {
		/* Account the bytes before they can complete in end_io. */
		spin_lock_irq(&ifs->state_lock);
		ifs->read_bytes_pending += plen;
		spin_unlock_irq(&ifs->state_lock);
	}

	sector = iomap_sector(iomap, pos);
	/*
	 * Start a new bio if there is none, if this range is not contiguous on
	 * disk with the end of the current bio, or if the folio cannot be
	 * added to the current bio.
	 */
	if (!ctx->bio ||
	    bio_end_sector(ctx->bio) != sector ||
	    !bio_add_folio(ctx->bio, folio, plen, poff)) {
		gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
		gfp_t orig_gfp = gfp;
		unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);

		if (ctx->bio)
			submit_bio(ctx->bio);

		if (ctx->rac) /* same as readahead_gfp_mask */
			gfp |= __GFP_NORETRY | __GFP_NOWARN;
		ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
				     REQ_OP_READ, gfp);
		/*
		 * If the bio_alloc fails, try it again for a single page to
		 * avoid having to deal with partial page reads.  This emulates
		 * what do_mpage_read_folio does.
		 */
		if (!ctx->bio) {
			ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
					     orig_gfp);
		}
		if (ctx->rac)
			ctx->bio->bi_opf |= REQ_RAHEAD;
		ctx->bio->bi_iter.bi_sector = sector;
		ctx->bio->bi_end_io = iomap_read_end_io;
		bio_add_folio_nofail(ctx->bio, folio, plen, poff);
	}

done:
	/*
	 * Move the caller beyond our range so that it keeps making progress.
	 * For that, we have to include any leading non-uptodate ranges, but
	 * we can skip trailing ones as they will be handled in the next
	 * iteration.
	 */
	length = pos - iter->pos + plen;
	return iomap_iter_advance(iter, &length);
}

/*
 * Consume the whole current mapping for a single-folio read by calling
 * iomap_readpage_iter() until no length remains.  Stops at, and returns, the
 * first non-zero result.
 */
static int iomap_read_folio_iter(struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{
	int ret;

	while (iomap_length(iter)) {
		ret = iomap_readpage_iter(iter, ctx);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Read the contents of @folio, iterating over the mappings provided by @ops.
 *
 * If a bio was built it is submitted here.  Otherwise no I/O was queued, so
 * the folio is unlocked directly.  Always returns 0 (see the comment at the
 * end of the function).
 */
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= folio->mapping->host,
		.pos		= folio_pos(folio),
		.len		= folio_size(folio),
	};
	struct iomap_readpage_ctx ctx = {
		.cur_folio	= folio,
	};
	int ret;

	trace_iomap_readpage(iter.inode, 1);

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = iomap_read_folio_iter(&iter, &ctx);

	if (ctx.bio) {
		submit_bio(ctx.bio);
		WARN_ON_ONCE(!ctx.cur_folio_in_bio);
	} else {
		WARN_ON_ONCE(ctx.cur_folio_in_bio);
		folio_unlock(folio);
	}

	/*
	 * Just like mpage_readahead and block_read_full_folio, we always
	 * return 0 and just set the folio error flag on errors.  This
	 * should be cleaned up throughout the stack eventually.
	 */
	return 0;
}
EXPORT_SYMBOL_GPL(iomap_read_folio);

/*
 * Consume the whole current mapping for readahead.
 *
 * Whenever the iterator position reaches the start of a folio, the previous
 * folio is finished: it is unlocked here unless part of it was queued in a
 * bio.  The next folio is then taken from the readahead control.
 */
static int iomap_readahead_iter(struct iomap_iter *iter,
		struct iomap_readpage_ctx *ctx)
{
	int ret;

	while (iomap_length(iter)) {
		if (ctx->cur_folio &&
		    offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
			if (!ctx->cur_folio_in_bio)
				folio_unlock(ctx->cur_folio);
			ctx->cur_folio = NULL;
		}
		if (!ctx->cur_folio) {
			ctx->cur_folio = readahead_folio(ctx->rac);
			ctx->cur_folio_in_bio = false;
		}
		ret = iomap_readpage_iter(iter, ctx);
		if (ret)
			return ret;
	}

	return 0;
}

/**
 * iomap_readahead - Attempt to read pages from a file.
 * @rac: Describes the pages to be read.
 * @ops: The operations vector for the filesystem.
 *
 * This function is for filesystems to call to implement their readahead
 * address_space operation.
 *
 * Context: The @ops callbacks may submit I/O (eg to read the addresses of
 * blocks from disc), and may wait for it.  The caller may be trying to
 * access a different page, and so sleeping excessively should be avoided.
 * It may allocate memory, but should avoid costly allocations.  This
 * function is called with memalloc_nofs set, so allocations will not cause
 * the filesystem to be reentered.
 */
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode	= rac->mapping->host,
		.pos	= readahead_pos(rac),
		.len	= readahead_length(rac),
	};
	struct iomap_readpage_ctx ctx = {
		.rac	= rac,
	};

	trace_iomap_readahead(rac->mapping->host, readahead_count(rac));

	while (iomap_iter(&iter, ops) > 0)
		iter.status = iomap_readahead_iter(&iter, &ctx);

	if (ctx.bio)
		submit_bio(ctx.bio);
	/* Unlock the last folio ourselves if no I/O was queued against it. */
	if (ctx.cur_folio) {
		if (!ctx.cur_folio_in_bio)
			folio_unlock(ctx.cur_folio);
	}
}
EXPORT_SYMBOL_GPL(iomap_readahead);

/*
 * iomap_is_partially_uptodate checks whether blocks within a folio are
 * uptodate or not.
 *
 * Returns true if all blocks which correspond to the specified part
 * of the folio are uptodate.
*/
bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
	struct iomap_folio_state *ifs = folio->private;
	struct inode *inode = folio->mapping->host;
	unsigned first, last, i;

	/* Without per-block state nothing can be said about a sub-range. */
	if (!ifs)
		return false;

	/* Caller's range may extend past the end of this folio */
	count = min(folio_size(folio) - from, count);

	/* First and last blocks in range within folio */
	first = from >> inode->i_blkbits;
	last = (from + count - 1) >> inode->i_blkbits;

	for (i = first; i <= last; i++)
		if (!ifs_block_is_uptodate(ifs, i))
			return false;
	return true;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);

/**
 * iomap_get_folio - get a folio reference for writing
 * @iter: iteration structure
 * @pos: start offset of write
 * @len: Suggested size of folio to create.
 *
 * The lookup flags are FGP_WRITEBEGIN | FGP_NOFS, plus FGP_NOWAIT and
 * FGP_DONTCACHE when the corresponding IOMAP_* flag is set in @iter.
 *
 * Returns a locked reference to the folio at @pos, or an error pointer if the
 * folio could not be obtained.
 */
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
{
	fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;

	if (iter->flags & IOMAP_NOWAIT)
		fgp |= FGP_NOWAIT;
	if (iter->flags & IOMAP_DONTCACHE)
		fgp |= FGP_DONTCACHE;
	fgp |= fgf_set_order(len);

	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
			fgp, mapping_gfp_mask(iter->inode->i_mapping));
}
EXPORT_SYMBOL_GPL(iomap_get_folio);

/*
 * Release the per-block state attached to @folio.
 *
 * Returns false, leaving the state in place, if the folio is dirty; otherwise
 * frees the state and returns true.  @gfp_flags is not used by this function.
 */
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
			folio_size(folio));

	/*
	 * If the folio is dirty, we refuse to release our metadata because
	 * it may be partially dirty.  Once we track per-block dirty state,
	 * we can release the metadata if every block is dirty.
	 */
	if (folio_test_dirty(folio))
		return false;
	ifs_free(folio);
	return true;
}
EXPORT_SYMBOL_GPL(iomap_release_folio);

/*
 * Invalidate @len bytes at @offset within @folio.  Only a whole-folio
 * invalidation has any effect besides the tracepoint.
 */
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
	trace_iomap_invalidate_folio(folio->mapping->host,
					folio_pos(folio) + offset, len);

	/*
	 * If we're invalidating the entire folio, clear the dirty state
	 * from it and release it to avoid unnecessary buildup of the LRU.
	 */
	if (offset == 0 && len == folio_size(folio)) {
		WARN_ON_ONCE(folio_test_writeback(folio));
		folio_cancel_dirty(folio);
		ifs_free(folio);
	}
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);

/*
 * Mark the whole of @folio dirty: request per-block state (blocking
 * allocation, flags == 0), set the dirty state for the full folio range and
 * return the result of filemap_dirty_folio().
 */
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
	struct inode *inode = mapping->host;
	size_t len = folio_size(folio);

	ifs_alloc(inode, folio, 0);
	iomap_set_range_dirty(folio, 0, len);
	return filemap_dirty_folio(mapping, folio);
}
EXPORT_SYMBOL_GPL(iomap_dirty_folio);

/*
 * Clean up the page cache after a failed write of @len bytes at @pos.
 */
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t i_size = i_size_read(inode);

	/*
	 * Only truncate newly allocated pages beyond EOF, even if the
	 * write started inside the existing inode size.
	 */
	if (pos + len > i_size)
		truncate_pagecache_range(inode, max(pos, i_size),
					 pos + len - 1);
}

/*
 * Synchronously read @plen bytes into @folio at offset @poff from the device
 * sector that @iomap maps @block_start to.  Uses an on-stack bio with a single
 * bio_vec.  Returns the result of submit_bio_wait().
 */
static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
		size_t poff, size_t plen, const struct iomap *iomap)
{
	struct bio_vec bvec;
	struct bio bio;

	bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
	bio_add_folio_nofail(&bio, folio, plen, poff);
	return submit_bio_wait(&bio);
}

/*
 * Prepare @folio for a write of @len bytes at iter->pos.
 *
 * The blocks touched by the write are walked.  Any range that is not uptodate
 * and only partially covered by the write is either zeroed around the written
 * bytes or read from disk synchronously, then marked uptodate.
 *
 * Returns 0 on success, -EAGAIN when IOMAP_NOWAIT is set and the operation
 * would need to allocate state that could not be obtained or to read from
 * disk, -EIO when an IOMAP_UNSHARE operation hits a block that would need
 * zeroing, or the error from the synchronous read.
 */
static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
		struct folio *folio)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	struct iomap_folio_state *ifs;
	loff_t pos = iter->pos;
	loff_t block_size = i_blocksize(iter->inode);
	loff_t block_start = round_down(pos, block_size);
	loff_t block_end = round_up(pos + len, block_size);
	unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
	/* Byte range of the write expressed as offsets within the folio. */
	size_t from = offset_in_folio(folio, pos), to = from + len;
	size_t poff, plen;

	/*
	 * If the write or zeroing completely overlaps the current folio, then
	 * entire folio will be dirtied so there is no need for
	 * per-block state tracking structures to be attached to this folio.
	 * For the unshare case, we must read in the ondisk contents because we
	 * are not changing pagecache contents.
	 */
	if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
	    pos + len >= folio_pos(folio) + folio_size(folio))
		return 0;

	ifs = ifs_alloc(iter->inode, folio, iter->flags);
	/* A NOWAIT allocation failed although per-block state is required. */
	if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
		return -EAGAIN;

	if (folio_test_uptodate(folio))
		return 0;

	do {
		iomap_adjust_read_range(iter->inode, folio, &block_start,
				block_end - block_start, &poff, &plen);
		if (plen == 0)
			break;

		/*
		 * Skip ranges in which neither end of the write falls
		 * strictly inside (poff, poff + plen); never skip for unshare.
		 */
		if (!(iter->flags & IOMAP_UNSHARE) &&
		    (from <= poff || from >= poff + plen) &&
		    (to <= poff || to >= poff + plen))
			continue;

		if (iomap_block_needs_zeroing(iter, block_start)) {
			if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
				return -EIO;
			/* Zero [poff, from) and [to, poff + plen). */
			folio_zero_segments(folio, poff, from, to, poff + plen);
		} else {
			int status;

			/* Reading from disk would block. */
			if (iter->flags & IOMAP_NOWAIT)
				return -EAGAIN;

			status = iomap_read_folio_sync(block_start, folio,
					poff, plen, srcmap);
			if (status)
				return status;
		}
		iomap_set_range_uptodate(folio, poff, plen);
	} while ((block_start += plen) < block_end);

	return 0;
}

/*
 * Get the folio for a write of @len bytes at iter->pos, using the
 * filesystem's ->get_folio hook when one is provided and iomap_get_folio()
 * otherwise.  If the mapping does not support large folios, @len is first
 * clamped so that it does not cross a page boundary.
 */
static struct folio *__iomap_get_folio(struct iomap_iter *iter, size_t len)
{
	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
	loff_t pos = iter->pos;

	if (!mapping_large_folio_support(iter->inode->i_mapping))
		len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));

	if (folio_ops && folio_ops->get_folio)
		return folio_ops->get_folio(iter, pos, len);
	else
		return iomap_get_folio(iter, pos, len);
}

/*
 * Release a folio obtained by __iomap_get_folio().  @ret is passed through to
 * the filesystem's ->put_folio hook when one is provided; otherwise the folio
 * is unlocked and its reference dropped here.
 */
static void __iomap_put_folio(struct iomap_iter *iter, size_t ret,
		struct folio *folio)
{
	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
	loff_t pos = iter->pos;

	if (folio_ops && folio_ops->put_folio) {
		folio_ops->put_folio(iter->inode, pos, ret, folio);
	} else {
		folio_unlock(folio);
		folio_put(folio);
	}
}

/* trim pos and bytes to within a given folio */
static loff_t iomap_trim_folio_range(struct iomap_iter *iter,
		struct folio *folio, size_t *offset, u64 *bytes)
{
	loff_t pos = iter->pos;
	size_t fsize = folio_size(folio);

	/* iter->pos is expected to lie inside the folio. */
	WARN_ON_ONCE(pos < folio_pos(folio));
	WARN_ON_ONCE(pos >= folio_pos(folio) + fsize);

	*offset = offset_in_folio(folio, pos);
	*bytes = min(*bytes, fsize - *offset);

	return pos;
}

/*
 * Prepare @folio for a write to an inline extent by reading the inline data
 * into it.  Returns -EIO (with a one-time warning) if the source mapping does
 * not start at offset zero.
 */
static int iomap_write_begin_inline(const struct iomap_iter *iter,
		struct folio *folio)
{
	/* needs more work for the tailpacking case; disable for now */
	if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0))
		return -EIO;
	return iomap_read_inline_data(iter, folio);
}

/*
 * Grab and prepare a folio for write based on iter state.  Returns the folio,
 * offset, and length.  Callers can optionally pass a max length *plen,
 * otherwise init to zero.
 *
 * Returns 0 on success with *@foliop, *@poffset and *@plen filled in.  Also
 * returns 0, with IOMAP_F_STALE set in iter->iomap.flags and the folio
 * already released, when the filesystem reports the cached mapping as no
 * longer valid; *@foliop is not set in that case.  Otherwise returns a
 * negative errno.
 */
static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
		size_t *poffset, u64 *plen)
{
	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
	struct folio *folio;
	int status = 0;

	/* Apply the caller's limit; zero means "no limit". */
	len = min_not_zero(len, *plen);
	BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
	if (srcmap != &iter->iomap)
		BUG_ON(pos + len > srcmap->offset + srcmap->length);

	if (fatal_signal_pending(current))
		return -EINTR;

	folio = __iomap_get_folio(iter, len);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/*
	 * Now we have a locked folio, before we do anything with it we need to
	 * check that the iomap we have cached is not stale.  The inode extent
	 * mapping can change due to concurrent IO in flight (e.g.
	 * IOMAP_UNWRITTEN state can change and memory reclaim could have
	 * reclaimed a previously partially written page at this index after IO
	 * completion before this write reaches this file offset) and hence we
	 * could do the wrong thing here (zero a page range incorrectly or fail
	 * to zero) and corrupt data.
	 */
	if (folio_ops && folio_ops->iomap_valid) {
		bool iomap_valid = folio_ops->iomap_valid(iter->inode,
							 &iter->iomap);
		if (!iomap_valid) {
			iter->iomap.flags |= IOMAP_F_STALE;
			status = 0;
			goto out_unlock;
		}
	}

	pos = iomap_trim_folio_range(iter, folio, poffset, &len);

	if (srcmap->type == IOMAP_INLINE)
		status = iomap_write_begin_inline(iter, folio);
	else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
		status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
	else
		status = __iomap_write_begin(iter, len, folio);

	if (unlikely(status))
		goto out_unlock;

	*foliop = folio;
	*plen = len;
	return 0;

out_unlock:
	__iomap_put_folio(iter, 0, folio);

	return status;
}

/*
 * Finish a write of @copied (out of @len requested) bytes at @pos in @folio.
 *
 * Returns true after marking the range uptodate and dirty, or false when a
 * short copy into a non-uptodate folio has to be treated as a zero-length
 * write.
 */
static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
		size_t copied, struct folio *folio)
{
	flush_dcache_folio(folio);

	/*
	 * The blocks that were entirely written will now be uptodate, so we
	 * don't have to worry about a read_folio reading them and overwriting a
	 * partial write.  However, if we've encountered a short write and only
	 * partially written into a block, it will not be marked uptodate, so a
	 * read_folio might come in and destroy our partial write.
	 *
	 * Do the simplest thing and just treat any short write to a
	 * non-uptodate page as a zero-length write, and force the caller to
	 * redo the whole thing.
	 */
	if (unlikely(copied < len && !folio_test_uptodate(folio)))
		return false;
	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
	iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
	filemap_dirty_folio(inode->i_mapping, folio);
	return true;
}

/*
 * Finish a write to an inline extent: copy @copied bytes at @pos from @folio
 * into the mapping's inline data buffer and mark the inode dirty.
 */
static void iomap_write_end_inline(const struct iomap_iter *iter,
		struct folio *folio, loff_t pos, size_t copied)
{
	const struct iomap *iomap = &iter->iomap;
	void *addr;

	WARN_ON_ONCE(!folio_test_uptodate(folio));
	BUG_ON(!iomap_inline_data_valid(iomap));

	flush_dcache_folio(folio);
	addr = kmap_local_folio(folio, pos);
	memcpy(iomap_inline_data(iomap, pos), addr, copied);
	kunmap_local(addr);

	mark_inode_dirty(iter->inode);
}

/*
 * Returns true if all copied bytes have been written to the pagecache,
 * otherwise return false.
 *
 * Dispatches on the source mapping: inline data, buffer-head based folios
 * (IOMAP_F_BUFFER_HEAD) or the native iomap per-block state.
 */
static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
		struct folio *folio)
{
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;

	if (srcmap->type == IOMAP_INLINE) {
		iomap_write_end_inline(iter, folio, pos, copied);
		return true;
	}

	if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
		size_t bh_written;

		bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
					len, copied, folio, NULL);
		/* The buffer-head path is expected to accept all or nothing. */
		WARN_ON_ONCE(bh_written != copied && bh_written != 0);
		return bh_written == copied;
	}

	return __iomap_write_end(iter->inode, pos, len, copied, folio);
}

/*
 * Copy data from @i into the page cache for the current mapping of @iter,
 * one folio-sized chunk at a time.
 *
 * Returns 0 if any bytes were written during this call, otherwise the status
 * of the step that stopped the loop.
 */
static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
	ssize_t total_written = 0;
	int status = 0;
	struct address_space *mapping = iter->inode->i_mapping;
	size_t chunk = mapping_max_folio_size(mapping);
	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;

	do {
		struct folio *folio;
		loff_t old_size;
		size_t offset;		/* Offset into folio */
		u64 bytes;		/* Bytes to write to folio */
		size_t copied;		/* Bytes copied from user */
		u64 written;		/* Bytes have been written */
		loff_t pos;

		bytes = iov_iter_count(i);
retry:
		/* Limit this step to the remainder of the current chunk. */
		offset = iter->pos & (chunk - 1);
		bytes = min(chunk - offset, bytes);
		status = balance_dirty_pages_ratelimited_flags(mapping,
							       bdp_flags);
		if (unlikely(status))
			break;

		if (bytes > iomap_length(iter))
			bytes = iomap_length(iter);

		/*
		 * Bring in the user page that we'll copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * For async buffered writes the assumption is that the user
		 * page has already been faulted in.  This can be optimized by
		 * faulting the user page.
		 */
		/* Fail only if the call reports all of @bytes as its result. */
		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(iter, &folio, &offset, &bytes);
		if (unlikely(status)) {
			iomap_write_failed(iter->inode, iter->pos, bytes);
			break;
		}
		/* Stale mapping: no folio was returned, let the caller remap. */
		if (iter->iomap.flags & IOMAP_F_STALE)
			break;

		pos = iter->pos;

		if (mapping_writably_mapped(mapping))
			flush_dcache_folio(folio);

		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
		written = iomap_write_end(iter, bytes, copied, folio) ?
			  copied : 0;

		/*
		 * Update the in-memory inode size after copying the data into
		 * the page cache.  It's up to the file system to write the
		 * updated size to disk, preferably after I/O completion so that
		 * no stale data is exposed.  Only once that's done can we
		 * unlock and release the folio.
		 */
		old_size = iter->inode->i_size;
		if (pos + written > old_size) {
			i_size_write(iter->inode, pos + written);
			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
		}
		__iomap_put_folio(iter, written, folio);

		if (old_size < pos)
			pagecache_isize_extended(iter->inode, old_size, pos);

		cond_resched();
		if (unlikely(written == 0)) {
			/*
			 * A short copy made iomap_write_end() reject the
			 * thing entirely.  Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			iomap_write_failed(iter->inode, pos, bytes);
			iov_iter_revert(i, copied);

			/* Retry with a smaller chunk, down to PAGE_SIZE. */
			if (chunk > PAGE_SIZE)
				chunk /= 2;
			if (copied) {
				bytes = copied;
				goto retry;
			}
		} else {
			total_written += written;
			iomap_iter_advance(iter, &written);
		}
	} while (iov_iter_count(i) && iomap_length(iter));

	return total_written ? 0 : status;
}

/*
 * Buffered write entry point: write the data in @i to the file of @iocb at
 * iocb->ki_pos, using @ops to obtain mappings.  @private is stored in the
 * iterator's private field.
 *
 * IOCB_NOWAIT and IOCB_DONTCACHE are translated into the matching IOMAP_*
 * iterator flags.  Returns the number of bytes written and advances
 * iocb->ki_pos by that amount; if no progress was made, returns the final
 * result of iomap_iter() instead.
 */
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
		const struct iomap_ops *ops, void *private)
{
	struct iomap_iter iter = {
		.inode		= iocb->ki_filp->f_mapping->host,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(i),
		.flags		= IOMAP_WRITE,
		.private	= private,
	};
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT)
		iter.flags |= IOMAP_NOWAIT;
	if (iocb->ki_flags & IOCB_DONTCACHE)
		iter.flags |= IOMAP_DONTCACHE;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = iomap_write_iter(&iter, i);

	if (unlikely(iter.pos == iocb->ki_pos))
		return ret;
	ret = iter.pos - iocb->ki_pos;
	iocb->ki_pos = iter.pos;
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);

/*
 * Within @folio, call @punch for every block in [@start_byte, @end_byte) that
 * is not marked dirty in the per-block state.  Does nothing when the folio
 * has no per-block state.
 */
static void iomap_write_delalloc_ifs_punch(struct inode *inode,
		struct folio *folio, loff_t start_byte, loff_t end_byte,
		struct iomap *iomap, iomap_punch_t punch)
{
	unsigned int first_blk, last_blk, i;
	loff_t last_byte;
	u8 blkbits = inode->i_blkbits;
	struct iomap_folio_state *ifs;

	/*
	 * When we have per-block dirty tracking, there can be
	 * blocks within a folio which are marked uptodate
	 * but not dirty.  In that case it is necessary to punch
	 * out such blocks to avoid leaking any delalloc blocks.
	 */
	ifs = folio->private;
	if (!ifs)
		return;

	/* Last byte of the range, clamped to the end of this folio. */
	last_byte = min_t(loff_t, end_byte - 1,
			folio_pos(folio) + folio_size(folio) - 1);
	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
	for (i = first_blk; i <= last_blk; i++) {
		if (!ifs_block_is_dirty(folio, ifs, i))
			punch(inode, folio_pos(folio) + (i << blkbits),
				    1 << blkbits, iomap);
	}
}

/*
 * Handle one folio found while scanning a data range for punching.
 *
 * Clean folios are ignored.  For a dirty folio, the range from
 * *@punch_start_byte up to @start_byte is punched, then any non-dirty blocks
 * inside the folio, and *@punch_start_byte is moved past the part of the
 * range covered by this folio.
 */
static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
		struct iomap *iomap, iomap_punch_t punch)
{
	if (!folio_test_dirty(folio))
		return;

	/* if dirty, punch up to offset */
	if (start_byte > *punch_start_byte) {
		punch(inode, *punch_start_byte, start_byte - *punch_start_byte,
		      iomap);
	}

	/* Punch non-dirty blocks within folio */
	iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte,
			iomap, punch);

	/*
	 * Make sure the next punch start is correctly bound to
	 * the end of this data range, not the end of the folio.
	 */
	*punch_start_byte = min_t(loff_t, end_byte,
				folio_pos(folio) + folio_size(folio));
}

/*
 * Scan the data range passed to us for dirty page cache folios. If we find a
 * dirty folio, punch out the preceding range and update the offset from which
 * the next punch will start from.
 *
 * We can punch out storage reservations under clean pa