2 1 2 2 2 2 1 48 9 39 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 | // SPDX-License-Identifier: LGPL-2.1 /* * A V4L2 frontend for the FWHT codec * * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/string.h> #include <linux/videodev2.h> #include "codec-v4l2-fwht.h" static const struct v4l2_fwht_pixfmt_info v4l2_fwht_pixfmts[] = { { V4L2_PIX_FMT_YUV420, 1, 3, 2, 1, 1, 2, 2, 3, 3, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_YVU420, 1, 3, 2, 1, 1, 2, 2, 3, 3, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_YUV422P, 1, 2, 1, 1, 1, 2, 1, 3, 3, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV12, 1, 3, 2, 1, 2, 2, 2, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV21, 1, 3, 2, 1, 2, 2, 2, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV16, 1, 2, 1, 1, 2, 2, 1, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV61, 1, 2, 1, 1, 2, 2, 1, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV24, 1, 3, 1, 1, 2, 1, 1, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_NV42, 1, 3, 1, 1, 2, 1, 1, 3, 2, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_YUYV, 2, 2, 1, 2, 4, 2, 1, 3, 1, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_YVYU, 2, 2, 1, 2, 4, 2, 1, 3, 1, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_UYVY, 2, 2, 1, 2, 4, 2, 1, 3, 1, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_VYUY, 2, 2, 1, 2, 4, 2, 1, 3, 1, V4L2_FWHT_FL_PIXENC_YUV}, { V4L2_PIX_FMT_BGR24, 3, 3, 1, 3, 3, 1, 1, 3, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_RGB24, 3, 3, 1, 3, 3, 1, 1, 3, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_HSV24, 3, 3, 1, 3, 3, 1, 1, 3, 1, V4L2_FWHT_FL_PIXENC_HSV}, { V4L2_PIX_FMT_BGR32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_XBGR32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_ABGR32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_RGB32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_XRGB32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_ARGB32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_BGRX32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_BGRA32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_RGBX32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_RGBA32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_RGB}, { V4L2_PIX_FMT_HSV32, 4, 4, 1, 4, 4, 1, 1, 4, 1, V4L2_FWHT_FL_PIXENC_HSV}, { V4L2_PIX_FMT_GREY, 1, 1, 1, 1, 0, 1, 1, 1, 1, V4L2_FWHT_FL_PIXENC_RGB}, }; bool v4l2_fwht_validate_fmt(const struct v4l2_fwht_pixfmt_info *info, u32 width_div, u32 height_div, u32 components_num, u32 pixenc) { if (info->width_div == width_div && info->height_div == height_div && (!pixenc || info->pixenc == pixenc) && info->components_num == components_num) return true; return false; } const struct v4l2_fwht_pixfmt_info *v4l2_fwht_find_nth_fmt(u32 width_div, u32 height_div, u32 components_num, u32 pixenc, unsigned int start_idx) { unsigned int i; for (i = 0; i < ARRAY_SIZE(v4l2_fwht_pixfmts); i++) { bool is_valid = v4l2_fwht_validate_fmt(&v4l2_fwht_pixfmts[i], width_div, height_div, components_num, pixenc); if (is_valid) { if (start_idx == 0) return v4l2_fwht_pixfmts + i; start_idx--; } } return NULL; } const struct v4l2_fwht_pixfmt_info *v4l2_fwht_find_pixfmt(u32 pixelformat) { unsigned int i; for (i = 0; i < ARRAY_SIZE(v4l2_fwht_pixfmts); i++) if (v4l2_fwht_pixfmts[i].id == pixelformat) return v4l2_fwht_pixfmts + i; return NULL; } const struct v4l2_fwht_pixfmt_info *v4l2_fwht_get_pixfmt(u32 idx) { if (idx >= ARRAY_SIZE(v4l2_fwht_pixfmts)) return NULL; return v4l2_fwht_pixfmts + idx; } static int prepare_raw_frame(struct fwht_raw_frame *rf, const struct v4l2_fwht_pixfmt_info *info, u8 *buf, unsigned int size) { rf->luma = buf; rf->width_div = info->width_div; rf->height_div = info->height_div; rf->luma_alpha_step = info->luma_alpha_step; rf->chroma_step = info->chroma_step; rf->alpha = NULL; rf->components_num = info->components_num; /* * The buffer is NULL if it is the reference * frame of an I-frame in the stateless decoder */ if (!buf) { rf->luma = NULL; rf->cb = NULL; rf->cr = NULL; rf->alpha = NULL; return 0; } switch (info->id) { case V4L2_PIX_FMT_GREY: rf->cb = NULL; rf->cr = NULL; break; case V4L2_PIX_FMT_YUV420: rf->cb = rf->luma + size; rf->cr = rf->cb + size / 4; break; case V4L2_PIX_FMT_YVU420: rf->cr = rf->luma + size; rf->cb = rf->cr + size / 4; break; case V4L2_PIX_FMT_YUV422P: rf->cb = rf->luma + size; rf->cr = rf->cb + size / 2; break; case V4L2_PIX_FMT_NV12: case V4L2_PIX_FMT_NV16: case V4L2_PIX_FMT_NV24: rf->cb = rf->luma + size; rf->cr = rf->cb + 1; break; case V4L2_PIX_FMT_NV21: case V4L2_PIX_FMT_NV61: case V4L2_PIX_FMT_NV42: rf->cr = rf->luma + size; rf->cb = rf->cr + 1; break; case V4L2_PIX_FMT_YUYV: rf->cb = rf->luma + 1; rf->cr = rf->cb + 2; break; case V4L2_PIX_FMT_YVYU: rf->cr = rf->luma + 1; rf->cb = rf->cr + 2; break; case V4L2_PIX_FMT_UYVY: rf->cb = rf->luma; rf->cr = rf->cb + 2; rf->luma++; break; case V4L2_PIX_FMT_VYUY: rf->cr = rf->luma; rf->cb = rf->cr + 2; rf->luma++; break; case V4L2_PIX_FMT_RGB24: case V4L2_PIX_FMT_HSV24: rf->cr = rf->luma; rf->cb = rf->cr + 2; rf->luma++; break; case V4L2_PIX_FMT_BGR24: rf->cb = rf->luma; rf->cr = rf->cb + 2; rf->luma++; break; case V4L2_PIX_FMT_RGB32: case V4L2_PIX_FMT_XRGB32: case V4L2_PIX_FMT_HSV32: case V4L2_PIX_FMT_ARGB32: rf->alpha = rf->luma; rf->cr = rf->luma + 1; rf->cb = rf->cr + 2; rf->luma += 2; break; case V4L2_PIX_FMT_BGR32: case V4L2_PIX_FMT_XBGR32: case V4L2_PIX_FMT_ABGR32: rf->cb = rf->luma; rf->cr = rf->cb + 2; rf->luma++; rf->alpha = rf->cr + 1; break; case V4L2_PIX_FMT_BGRX32: case V4L2_PIX_FMT_BGRA32: rf->alpha = rf->luma; rf->cb = rf->luma + 1; rf->cr = rf->cb + 2; rf->luma += 2; break; case V4L2_PIX_FMT_RGBX32: case V4L2_PIX_FMT_RGBA32: rf->alpha = rf->luma + 3; rf->cr = rf->luma; rf->cb = rf->cr + 2; rf->luma++; break; default: return -EINVAL; } return 0; } int v4l2_fwht_encode(struct v4l2_fwht_state *state, u8 *p_in, u8 *p_out) { unsigned int size = state->stride * state->coded_height; unsigned int chroma_stride = state->stride; const struct v4l2_fwht_pixfmt_info *info = state->info; struct fwht_cframe_hdr *p_hdr; struct fwht_cframe cf; struct fwht_raw_frame rf; u32 encoding; u32 flags = 0; if (!info) return -EINVAL; if (prepare_raw_frame(&rf, info, p_in, size)) return -EINVAL; if (info->planes_num == 3) chroma_stride /= 2; if (info->id == V4L2_PIX_FMT_NV24 || info->id == V4L2_PIX_FMT_NV42) chroma_stride *= 2; cf.i_frame_qp = state->i_frame_qp; cf.p_frame_qp = state->p_frame_qp; cf.rlc_data = (__be16 *)(p_out + sizeof(*p_hdr)); encoding = fwht_encode_frame(&rf, &state->ref_frame, &cf, !state->gop_cnt, state->gop_cnt == state->gop_size - 1, state->visible_width, state->visible_height, state->stride, chroma_stride); if (!(encoding & FWHT_FRAME_PCODED)) state->gop_cnt = 0; if (++state->gop_cnt >= state->gop_size) state->gop_cnt = 0; p_hdr = (struct fwht_cframe_hdr *)p_out; p_hdr->magic1 = FWHT_MAGIC1; p_hdr->magic2 = FWHT_MAGIC2; p_hdr->version = htonl(V4L2_FWHT_VERSION); p_hdr->width = htonl(state->visible_width); p_hdr->height = htonl(state->visible_height); flags |= (info->components_num - 1) << V4L2_FWHT_FL_COMPONENTS_NUM_OFFSET; flags |= info->pixenc; if (encoding & FWHT_LUMA_UNENCODED) flags |= V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED; if (encoding & FWHT_CB_UNENCODED) flags |= V4L2_FWHT_FL_CB_IS_UNCOMPRESSED; if (encoding & FWHT_CR_UNENCODED) flags |= V4L2_FWHT_FL_CR_IS_UNCOMPRESSED; if (encoding & FWHT_ALPHA_UNENCODED) flags |= V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED; if (!(encoding & FWHT_FRAME_PCODED)) flags |= V4L2_FWHT_FL_I_FRAME; if (rf.height_div == 1) flags |= V4L2_FWHT_FL_CHROMA_FULL_HEIGHT; if (rf.width_div == 1) flags |= V4L2_FWHT_FL_CHROMA_FULL_WIDTH; p_hdr->flags = htonl(flags); p_hdr->colorspace = htonl(state->colorspace); p_hdr->xfer_func = htonl(state->xfer_func); p_hdr->ycbcr_enc = htonl(state->ycbcr_enc); p_hdr->quantization = htonl(state->quantization); p_hdr->size = htonl(cf.size); return cf.size + sizeof(*p_hdr); } int v4l2_fwht_decode(struct v4l2_fwht_state *state, u8 *p_in, u8 *p_out) { u32 flags; struct fwht_cframe cf; unsigned int components_num = 3; unsigned int version; const struct v4l2_fwht_pixfmt_info *info; unsigned int hdr_width_div, hdr_height_div; struct fwht_raw_frame dst_rf; unsigned int dst_chroma_stride = state->stride; unsigned int ref_chroma_stride = state->ref_stride; unsigned int dst_size = state->stride * state->coded_height; unsigned int ref_size; if (!state->info) return -EINVAL; info = state->info; version = ntohl(state->header.version); if (!version || version > V4L2_FWHT_VERSION) { pr_err("version %d is not supported, current version is %d\n", version, V4L2_FWHT_VERSION); return -EINVAL; } if (state->header.magic1 != FWHT_MAGIC1 || state->header.magic2 != FWHT_MAGIC2) return -EINVAL; /* TODO: support resolution changes */ if (ntohl(state->header.width) != state->visible_width || ntohl(state->header.height) != state->visible_height) return -EINVAL; flags = ntohl(state->header.flags); if (version >= 2) { if ((flags & V4L2_FWHT_FL_PIXENC_MSK) != info->pixenc) return -EINVAL; components_num = 1 + ((flags & V4L2_FWHT_FL_COMPONENTS_NUM_MSK) >> V4L2_FWHT_FL_COMPONENTS_NUM_OFFSET); } if (components_num != info->components_num) return -EINVAL; state->colorspace = ntohl(state->header.colorspace); state->xfer_func = ntohl(state->header.xfer_func); state->ycbcr_enc = ntohl(state->header.ycbcr_enc); state->quantization = ntohl(state->header.quantization); cf.rlc_data = (__be16 *)p_in; cf.size = ntohl(state->header.size); hdr_width_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH) ? 1 : 2; hdr_height_div = (flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT) ? 1 : 2; if (hdr_width_div != info->width_div || hdr_height_div != info->height_div) return -EINVAL; if (prepare_raw_frame(&dst_rf, info, p_out, dst_size)) return -EINVAL; if (info->planes_num == 3) { dst_chroma_stride /= 2; ref_chroma_stride /= 2; } if (info->id == V4L2_PIX_FMT_NV24 || info->id == V4L2_PIX_FMT_NV42) { dst_chroma_stride *= 2; ref_chroma_stride *= 2; } ref_size = state->ref_stride * state->coded_height; if (prepare_raw_frame(&state->ref_frame, info, state->ref_frame.buf, ref_size)) return -EINVAL; if (!fwht_decode_frame(&cf, flags, components_num, state->visible_width, state->visible_height, &state->ref_frame, state->ref_stride, ref_chroma_stride, &dst_rf, state->stride, dst_chroma_stride)) return -EINVAL; return 0; } |
344 366 366 348 109 329 344 1 338 12 12 146 2 109 108 109 79 195 118 118 1 118 119 113 96 1 97 72 75 77 73 72 79 79 118 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/init.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/tty.h> #include "internal.h" static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); #define CHRDEV_MAJOR_HASH_SIZE 255 static struct char_device_struct { struct char_device_struct *next; unsigned int major; unsigned int baseminor; int minorct; char name[64]; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd, *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); mutex_lock(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); goto out; } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; goto out; } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } mutex_unlock(&chrdevs_lock); return cd; out: mutex_unlock(&chrdevs_lock); kfree(cd); return ERR_PTR(ret); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. */ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * * If @major == 0 this functions will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a -ve errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... */ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) { error = -EBUSY; goto err; } error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) goto err; kobject_get(p->kobj.parent); return 0; err: kfree_const(p->kobj.name); p->kobj.name = NULL; return error; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and it's corresponding * struct device, linkink * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add does. It then adds @dev to the system using device_add * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add. * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @dev: the device structure * @cdev: the cdev structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. * * NOTE: This guarantees that cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); |
12 12 32 7 20 11 2 15 1 4 2 2 17 17 1 228 28 110 5 5 5 5 5 5 5 5 5 5 5 5 5 8 11 30 17 19 44 47 46 5 37 39 3 1 5 6 20 6 2 1 1 2 12 1 1 1 1 21 1 22 170 6 18 144 1 2 3 1 8 8 8 8 4 8 8 1 1 1 1 30 30 23 27 1 2 1 1 23 23 8 16 2 23 24 24 15 6 18 4 7 16 4 4 4 4 4 4 1 8 8 8 8 8 49 8 8 8 8 8 8 8 8 8 8 1 8 8 8 8 8 3 4 5 46 46 46 5 5 45 45 69 2 39 32 13 13 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 | // SPDX-License-Identifier: GPL-2.0 /* * Wireless utility functions * * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/export.h> #include <linux/bitops.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/ieee80211.h> #include <net/cfg80211.h> #include <net/ip.h> #include <net/dsfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/gcd.h> #include <linux/bitfield.h> #include <linux/nospec.h> #include "core.h" #include "rdev-ops.h" const struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, u32 basic_rates, int bitrate) { struct ieee80211_rate *result = &sband->bitrates[0]; int i; for (i = 0; i < sband->n_bitrates; i++) { if (!(basic_rates & BIT(i))) continue; if (sband->bitrates[i].bitrate > bitrate) continue; result = &sband->bitrates[i]; } return result; } EXPORT_SYMBOL(ieee80211_get_response_rate); u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband) { struct ieee80211_rate *bitrates; u32 mandatory_rates = 0; enum ieee80211_rate_flags mandatory_flag; int i; if (WARN_ON(!sband)) return 1; if (sband->band == NL80211_BAND_2GHZ) mandatory_flag = IEEE80211_RATE_MANDATORY_B; else mandatory_flag = IEEE80211_RATE_MANDATORY_A; bitrates = sband->bitrates; for (i = 0; i < sband->n_bitrates; i++) if (bitrates[i].flags & mandatory_flag) mandatory_rates |= BIT(i); return mandatory_rates; } EXPORT_SYMBOL(ieee80211_mandatory_rates); u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ if (chan <= 0) return 0; /* not supported */ switch (band) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: if (chan == 14) return MHZ_TO_KHZ(2484); else if (chan < 14) return MHZ_TO_KHZ(2407 + chan * 5); break; case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) return MHZ_TO_KHZ(4000 + chan * 5); else return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: /* see 802.11ax D6.1 27.3.23.2 */ if (chan == 2) return MHZ_TO_KHZ(5935); if (chan <= 233) return MHZ_TO_KHZ(5950 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; case NL80211_BAND_S1GHZ: return 902000 + chan * 500; default: ; } return 0; /* not supported */ } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); enum nl80211_chan_width ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) { if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) return NL80211_CHAN_WIDTH_20_NOHT; /*S1G defines a single allowed channel width per channel. * Extract that width here. */ if (chan->flags & IEEE80211_CHAN_1MHZ) return NL80211_CHAN_WIDTH_1; else if (chan->flags & IEEE80211_CHAN_2MHZ) return NL80211_CHAN_WIDTH_2; else if (chan->flags & IEEE80211_CHAN_4MHZ) return NL80211_CHAN_WIDTH_4; else if (chan->flags & IEEE80211_CHAN_8MHZ) return NL80211_CHAN_WIDTH_8; else if (chan->flags & IEEE80211_CHAN_16MHZ) return NL80211_CHAN_WIDTH_16; pr_err("unknown channel width for channel at %dKHz?\n", ieee80211_channel_to_khz(chan)); return NL80211_CHAN_WIDTH_1; } EXPORT_SYMBOL(ieee80211_s1g_channel_width); int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ freq = KHZ_TO_MHZ(freq); /* see 802.11 17.3.8.3.2 and Annex J */ if (freq == 2484) return 14; else if (freq < 2484) return (freq - 2407) / 5; else if (freq >= 4910 && freq <= 4980) return (freq - 4000) / 5; else if (freq < 5925) return (freq - 5000) / 5; else if (freq == 5935) return 2; else if (freq <= 45000) /* DMG band lower limit */ /* see 802.11ax D6.1 27.3.22.2 */ return (freq - 5950) / 5; else if (freq >= 58320 && freq <= 70200) return (freq - 56160) / 2160; else return 0; } EXPORT_SYMBOL(ieee80211_freq_khz_to_channel); struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; int i; for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { struct ieee80211_channel *chan = &sband->channels[i]; if (ieee80211_channel_to_khz(chan) == freq) return chan; } } return NULL; } EXPORT_SYMBOL(ieee80211_get_channel_khz); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { int i, want; switch (sband->band) { case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 60 || sband->bitrates[i].bitrate == 120 || sband->bitrates[i].bitrate == 240) { sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_A; want--; } } WARN_ON(want); break; case NL80211_BAND_2GHZ: case NL80211_BAND_LC: want = 7; for (i = 0; i < sband->n_bitrates; i++) { switch (sband->bitrates[i].bitrate) { case 10: case 20: case 55: case 110: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; want--; break; case 60: case 120: case 240: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_G; want--; fallthrough; default: sband->bitrates[i].flags |= IEEE80211_RATE_ERP_G; break; } } WARN_ON(want != 0 && want != 3); break; case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; case NL80211_BAND_S1GHZ: /* Figure 9-589bd: 3 means unsupported, so != 3 means at least * mandatory is ok. */ WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); break; case NUM_NL80211_BANDS: default: WARN_ON(1); break; } } void ieee80211_set_bitrate_flags(struct wiphy *wiphy) { enum nl80211_band band; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) set_mandatory_flags_band(wiphy->bands[band]); } bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) { int i; for (i = 0; i < wiphy->n_cipher_suites; i++) if (cipher == wiphy->cipher_suites[i]) return true; return false; } static bool cfg80211_igtk_cipher_supported(struct cfg80211_registered_device *rdev) { struct wiphy *wiphy = &rdev->wiphy; int i; for (i = 0; i < wiphy->n_cipher_suites; i++) { switch (wiphy->cipher_suites[i]) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return true; } } return false; } bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, int key_idx, bool pairwise) { int max_key_idx; if (pairwise) max_key_idx = 3; else if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; else if (cfg80211_igtk_cipher_supported(rdev)) max_key_idx = 5; else max_key_idx = 3; if (key_idx < 0 || key_idx > max_key_idx) return false; return true; } int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr) { if (!cfg80211_valid_key_idx(rdev, key_idx, pairwise)) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) return -EINVAL; if (pairwise && !mac_addr) return -EINVAL; switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: /* Extended Key ID can only be used with CCMP/GCMP ciphers */ if ((pairwise && key_idx) || params->mode != NL80211_KEY_RX_TX) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: /* IEEE802.11-2016 allows only 0 and - when supporting * Extended Key ID - 1 as index for pairwise keys. * @NL80211_KEY_NO_TX is only allowed for pairwise keys when * the driver supports Extended Key ID. * @NL80211_KEY_SET_TX can't be set when installing and * validating a key. */ if ((params->mode == NL80211_KEY_NO_TX && !pairwise) || params->mode == NL80211_KEY_SET_TX) return -EINVAL; if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) { if (pairwise && (key_idx < 0 || key_idx > 1)) return -EINVAL; } else if (pairwise && key_idx) { return -EINVAL; } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; if (key_idx < 4) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: if (key_idx > 3) return -EINVAL; break; default: break; } switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: if (params->key_len != WLAN_KEY_LEN_WEP40) return -EINVAL; break; case WLAN_CIPHER_SUITE_TKIP: if (params->key_len != WLAN_KEY_LEN_TKIP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: if (params->key_len != WLAN_KEY_LEN_CCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP_256: if (params->key_len != WLAN_KEY_LEN_CCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP: if (params->key_len != WLAN_KEY_LEN_GCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP_256: if (params->key_len != WLAN_KEY_LEN_GCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP104: if (params->key_len != WLAN_KEY_LEN_WEP104) return -EINVAL; break; case WLAN_CIPHER_SUITE_AES_CMAC: if (params->key_len != WLAN_KEY_LEN_AES_CMAC) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256) return -EINVAL; break; default: /* * We don't know anything about this algorithm, * allow using it -- but the driver must check * all parameters! We still check below whether * or not the driver supports this algorithm, * of course. */ break; } if (params->seq) { switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: /* These ciphers do not use key sequence */ return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->seq_len != 6) return -EINVAL; break; } } if (!cfg80211_supported_cipher_suite(&rdev->wiphy, params->cipher)) return -EINVAL; return 0; } unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) { unsigned int hdrlen = 24; if (ieee80211_is_ext(fc)) { hdrlen = 4; goto out; } if (ieee80211_is_data(fc)) { if (ieee80211_has_a4(fc)) hdrlen = 30; if (ieee80211_is_data_qos(fc)) { hdrlen += IEEE80211_QOS_CTL_LEN; if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; } goto out; } if (ieee80211_is_mgmt(fc)) { if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; goto out; } if (ieee80211_is_ctl(fc)) { /* * ACK and CTS are 10 bytes, all others 16. To see how * to get this condition consider * subtype mask: 0b0000000011110000 (0x00F0) * ACK subtype: 0b0000000011010000 (0x00D0) * CTS subtype: 0b0000000011000000 (0x00C0) * bits that matter: ^^^ (0x00E0) * value of those: 0b0000000011000000 (0x00C0) */ if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0)) hdrlen = 10; else hdrlen = 16; } out: return hdrlen; } EXPORT_SYMBOL(ieee80211_hdrlen); unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) { const struct ieee80211_hdr *hdr = (const struct ieee80211_hdr *)skb->data; unsigned int hdrlen; if (unlikely(skb->len < 10)) return 0; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (unlikely(hdrlen > skb->len)) return 0; return hdrlen; } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags) { int ae = flags & MESH_FLAGS_AE; /* 802.11-2012, 8.2.4.7.3 */ switch (ae) { default: case 0: return 6; case MESH_FLAGS_AE_A4: return 12; case MESH_FLAGS_AE_A5_A6: return 18; } } unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) { return __ieee80211_get_mesh_hdrlen(meshhdr->flags); } EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto) { const __be16 *hdr_proto = hdr + ETH_ALEN; if (!(ether_addr_equal(hdr, rfc1042_header) && *hdr_proto != htons(ETH_P_AARP) && *hdr_proto != htons(ETH_P_IPX)) && !ether_addr_equal(hdr, bridge_tunnel_header)) return false; *proto = *hdr_proto; return true; } EXPORT_SYMBOL(ieee80211_get_8023_tunnel_proto); int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb) { const void *mesh_addr; struct { struct ethhdr eth; u8 flags; } payload; int hdrlen; int ret; ret = skb_copy_bits(skb, 0, &payload, sizeof(payload)); if (ret) return ret; hdrlen = sizeof(payload.eth) + __ieee80211_get_mesh_hdrlen(payload.flags); if (likely(pskb_may_pull(skb, hdrlen + 8) && ieee80211_get_8023_tunnel_proto(skb->data + hdrlen, &payload.eth.h_proto))) hdrlen += ETH_ALEN + 2; else if (!pskb_may_pull(skb, hdrlen)) return -EINVAL; else payload.eth.h_proto = htons(skb->len - hdrlen); mesh_addr = skb->data + sizeof(payload.eth) + ETH_ALEN; switch (payload.flags & MESH_FLAGS_AE) { case MESH_FLAGS_AE_A4: memcpy(&payload.eth.h_source, mesh_addr, ETH_ALEN); break; case MESH_FLAGS_AE_A5_A6: memcpy(&payload.eth, mesh_addr, 2 * ETH_ALEN); break; default: break; } pskb_pull(skb, hdrlen - sizeof(payload.eth)); memcpy(skb->data, &payload.eth, sizeof(payload.eth)); return 0; } EXPORT_SYMBOL(ieee80211_strip_8023_mesh_hdr); int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, const u8 *addr, enum nl80211_iftype iftype, u8 data_offset, bool is_amsdu) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct { u8 hdr[ETH_ALEN] __aligned(2); __be16 proto; } payload; struct ethhdr tmp; u16 hdrlen; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; if (skb->len < hdrlen) return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet * header * IEEE 802.11 address fields: * ToDS FromDS Addr1 Addr2 Addr3 Addr4 * 0 0 DA SA BSSID n/a * 0 1 DA BSSID SA n/a * 1 0 BSSID SA DA n/a * 1 1 RA TA DA SA */ memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): if (unlikely(iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_P2P_GO)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_STATION)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_P2P_CLIENT && iftype != NL80211_IFTYPE_MESH_POINT) || (is_multicast_ether_addr(tmp.h_dest) && ether_addr_equal(tmp.h_source, addr))) return -1; break; case cpu_to_le16(0): if (iftype != NL80211_IFTYPE_ADHOC && iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_OCB) return -1; break; } if (likely(!is_amsdu && iftype != NL80211_IFTYPE_MESH_POINT && skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && ieee80211_get_8023_tunnel_proto(&payload, &tmp.h_proto))) { /* remove RFC1042 or Bridge-Tunnel encapsulation */ hdrlen += ETH_ALEN + 2; skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); } else { tmp.h_proto = htons(skb->len - hdrlen); } pskb_pull(skb, hdrlen); if (!ehdr) ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr, &tmp, sizeof(tmp)); return 0; } EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); static void __frame_add_frag(struct sk_buff *skb, struct page *page, void *ptr, int len, int size) { struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; get_page(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } static void __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, int offset, int len) { struct skb_shared_info *sh = skb_shinfo(skb); const skb_frag_t *frag = &sh->frags[0]; struct page *frag_page; void *frag_ptr; int frag_len, frag_size; int head_size = skb->len - skb->data_len; int cur_len; frag_page = virt_to_head_page(skb->head); frag_ptr = skb->data; frag_size = head_size; while (offset >= frag_size) { offset -= frag_size; frag_page = skb_frag_page(frag); frag_ptr = skb_frag_address(frag); frag_size = skb_frag_size(frag); frag++; } frag_ptr += offset; frag_len = frag_size - offset; cur_len = min(len, frag_len); __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size); len -= cur_len; while (len > 0) { frag_len = skb_frag_size(frag); cur_len = min(len, frag_len); __frame_add_frag(frame, skb_frag_page(frag), skb_frag_address(frag), cur_len, frag_len); len -= cur_len; frag++; } } static struct sk_buff * __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, int offset, int len, bool reuse_frag, int min_len) { struct sk_buff *frame; int cur_len = len; if (skb->len - offset < len) return NULL; /* * When reusing framents, copy some data to the head to simplify * ethernet header handling and speed up protocol header processing * in the stack later. */ if (reuse_frag) cur_len = min_t(int, len, min_len); /* * Allocate and reserve two bytes more for payload * alignment since sizeof(struct ethhdr) is 14. */ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); if (!frame) return NULL; frame->priority = skb->priority; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); len -= cur_len; if (!len) return frame; offset += cur_len; __ieee80211_amsdu_copy_frag(skb, frame, offset, len); return frame; } static u16 ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type) { __le16 *field_le = field; __be16 *field_be = field; u16 len; if (hdr_type >= 2) len = le16_to_cpu(*field_le); else len = be16_to_cpu(*field_be); if (hdr_type) len += __ieee80211_get_mesh_hdrlen(mesh_flags); return len; } bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr) { int offset = 0, subframe_len, padding; for (offset = 0; offset < skb->len; offset += subframe_len + padding) { int remaining = skb->len - offset; struct { __be16 len; u8 mesh_flags; } hdr; u16 len; if (sizeof(hdr) > remaining) return false; if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0) return false; len = ieee80211_amsdu_subframe_length(&hdr.len, hdr.mesh_flags, mesh_hdr); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; if (subframe_len > remaining) return false; } return true; } EXPORT_SYMBOL(ieee80211_is_valid_amsdu); void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, const u8 *check_da, const u8 *check_sa, u8 mesh_control) { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; int offset = 0; struct { struct ethhdr eth; uint8_t flags; } hdr; bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); bool reuse_skb = false; bool last = false; int copy_len = sizeof(hdr.eth); if (iftype == NL80211_IFTYPE_MESH_POINT) copy_len = sizeof(hdr); while (!last) { int remaining = skb->len - offset; unsigned int subframe_len; int len, mesh_len = 0; u8 padding; if (copy_len > remaining) goto purge; skb_copy_bits(skb, offset, &hdr, copy_len); if (iftype == NL80211_IFTYPE_MESH_POINT) mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags); len = ieee80211_amsdu_subframe_length(&hdr.eth.h_proto, hdr.flags, mesh_control); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; /* the last MSDU has no padding */ if (subframe_len > remaining) goto purge; /* mitigate A-MSDU aggregation injection attacks */ if (ether_addr_equal(hdr.eth.h_dest, rfc1042_header)) goto purge; offset += sizeof(struct ethhdr); last = remaining <= subframe_len + padding; /* FIXME: should we really accept multicast DA? */ if ((check_da && !is_multicast_ether_addr(hdr.eth.h_dest) && !ether_addr_equal(check_da, hdr.eth.h_dest)) || (check_sa && !ether_addr_equal(check_sa, hdr.eth.h_source))) { offset += len + padding; continue; } /* reuse skb for the last subframe */ if (!skb_is_nonlinear(skb) && !reuse_frag && last) { skb_pull(skb, offset); frame = skb; reuse_skb = true; } else { frame = __ieee80211_amsdu_copy(skb, hlen, offset, len, reuse_frag, 32 + mesh_len); if (!frame) goto purge; offset += len + padding; } skb_reset_network_header(frame); frame->dev = skb->dev; frame->priority = skb->priority; if (likely(iftype != NL80211_IFTYPE_MESH_POINT && ieee80211_get_8023_tunnel_proto(frame->data, &hdr.eth.h_proto))) skb_pull(frame, ETH_ALEN + 2); memcpy(skb_push(frame, sizeof(hdr.eth)), &hdr.eth, sizeof(hdr.eth)); __skb_queue_tail(list, frame); } if (!reuse_skb) dev_kfree_skb(skb); return; purge: __skb_queue_purge(list); dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_amsdu_to_8023s); /* Given a data frame determine the 802.1p/1d tag to use. */ unsigned int cfg80211_classify8021d(struct sk_buff *skb, struct cfg80211_qos_map *qos_map) { unsigned int dscp; unsigned char vlan_priority; unsigned int ret; /* skb->priority values from 256->263 are magic values to * directly indicate a specific 802.1d priority. This is used * to allow 802.1d priority to be passed directly in from VLAN * tags, etc. */ if (skb->priority >= 256 && skb->priority <= 263) { ret = skb->priority - 256; goto out; } if (skb_vlan_tag_present(skb)) { vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; if (vlan_priority > 0) { ret = vlan_priority; goto out; } } switch (skb->protocol) { case htons(ETH_P_IP): dscp = ipv4_get_dsfield(ip_hdr(skb)) & 0xfc; break; case htons(ETH_P_IPV6): dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & 0xfc; break; case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): { struct mpls_label mpls_tmp, *mpls; mpls = skb_header_pointer(skb, sizeof(struct ethhdr), sizeof(*mpls), &mpls_tmp); if (!mpls) return 0; ret = (ntohl(mpls->entry) & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; goto out; } case htons(ETH_P_80221): /* 802.21 is always network control traffic */ return 7; default: return 0; } if (qos_map) { unsigned int i, tmp_dscp = dscp >> 2; for (i = 0; i < qos_map->num_des; i++) { if (tmp_dscp == qos_map->dscp_exception[i].dscp) { ret = qos_map->dscp_exception[i].up; goto out; } } for (i = 0; i < 8; i++) { if (tmp_dscp >= qos_map->up[i].low && tmp_dscp <= qos_map->up[i].high) { ret = i; goto out; } } } /* The default mapping as defined Section 2.3 in RFC8325: The three * Most Significant Bits (MSBs) of the DSCP are used as the * corresponding L2 markings. */ ret = dscp >> 5; /* Handle specific DSCP values for which the default mapping (as * described above) doesn't adhere to the intended usage of the DSCP * value. See section 4 in RFC8325. Specifically, for the following * Diffserv Service Classes no update is needed: * - Standard: DF * - Low Priority Data: CS1 * - Multimedia Streaming: AF31, AF32, AF33 * - Multimedia Conferencing: AF41, AF42, AF43 * - Network Control Traffic: CS7 * - Real-Time Interactive: CS4 */ switch (dscp >> 2) { case 10: case 12: case 14: /* High throughput data: AF11, AF12, AF13 */ ret = 0; break; case 16: /* Operations, Administration, and Maintenance and Provisioning: * CS2 */ ret = 0; break; case 18: case 20: case 22: /* Low latency data: AF21, AF22, AF23 */ ret = 3; break; case 24: /* Broadcasting video: CS3 */ ret = 4; break; case 40: /* Signaling: CS5 */ ret = 5; break; case 44: /* Voice Admit: VA */ ret = 6; break; case 46: /* Telephony traffic: EF */ ret = 6; break; case 48: /* Network Control Traffic: CS6 */ ret = 7; break; } out: return array_index_nospec(ret, IEEE80211_NUM_TIDS); } EXPORT_SYMBOL(cfg80211_classify8021d); const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id) { const struct cfg80211_bss_ies *ies; ies = rcu_dereference(bss->ies); if (!ies) return NULL; return cfg80211_find_elem(id, ies->data, ies->len); } EXPORT_SYMBOL(ieee80211_bss_get_elem); void cfg80211_upload_connect_keys(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct net_device *dev = wdev->netdev; int i; if (!wdev->connect_keys) return; for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, -1, i, false, NULL, &wdev->connect_keys->params[i])) { netdev_err(dev, "failed to set key %d\n", i); continue; } if (wdev->connect_keys->def == i && rdev_set_default_key(rdev, dev, -1, i, true, true)) { netdev_err(dev, "failed to set defkey %d\n", i); continue; } } kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; } void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; spin_lock_irqsave(&wdev->event_lock, flags); while (!list_empty(&wdev->event_list)) { ev = list_first_entry(&wdev->event_list, struct cfg80211_event, list); list_del(&ev->list); spin_unlock_irqrestore(&wdev->event_lock, flags); switch (ev->type) { case EVENT_CONNECT_RESULT: __cfg80211_connect_result( wdev->netdev, &ev->cr, ev->cr.status == WLAN_STATUS_SUCCESS); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, &ev->rm); break; case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, ev->dc.ie, ev->dc.ie_len, ev->dc.reason, !ev->dc.locally_generated); break; case EVENT_IBSS_JOINED: __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, ev->ij.channel); break; case EVENT_STOPPED: cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: __cfg80211_port_authorized(wdev, ev->pa.peer_addr, ev->pa.td_bitmap, ev->pa.td_bitmap_len); break; } kfree(ev); spin_lock_irqsave(&wdev->event_lock, flags); } spin_unlock_irqrestore(&wdev->event_lock, flags); } void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; lockdep_assert_held(&rdev->wiphy.mtx); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_process_wdev_events(wdev); } int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, struct vif_params *params) { int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; lockdep_assert_held(&rdev->wiphy.mtx); /* don't support changing VLANs, you just re-create them */ if (otype == NL80211_IFTYPE_AP_VLAN) return -EOPNOTSUPP; /* cannot change into P2P device or NAN */ if (ntype == NL80211_IFTYPE_P2P_DEVICE || ntype == NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!rdev->ops->change_virtual_intf || !(rdev->wiphy.interface_modes & (1 << ntype))) return -EOPNOTSUPP; if (ntype != otype) { /* if it's part of a bridge, reject changing type to station/ibss */ if (netif_is_bridge_port(dev) && (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION || ntype == NL80211_IFTYPE_P2P_CLIENT)) return -EBUSY; dev->ieee80211_ptr->use_4addr = false; rdev_set_qos_map(rdev, dev, NULL); switch (otype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, false); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); break; case NL80211_IFTYPE_MESH_POINT: /* mesh should be handled? */ break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; default: break; } cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); memset(&dev->ieee80211_ptr->u, 0, sizeof(dev->ieee80211_ptr->u)); memset(&dev->ieee80211_ptr->links, 0, sizeof(dev->ieee80211_ptr->links)); } err = rdev_change_virtual_intf(rdev, dev, ntype, params); WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype); if (!err && params && params->use_4addr != -1) dev->ieee80211_ptr->use_4addr = params->use_4addr; if (!err) { dev->priv_flags &= ~IFF_DONT_BRIDGE; switch (ntype) { case NL80211_IFTYPE_STATION: if (dev->ieee80211_ptr->use_4addr) break; fallthrough; case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_ADHOC: dev->priv_flags |= IFF_DONT_BRIDGE; break; case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MESH_POINT: /* bridging OK */ break; case NL80211_IFTYPE_MONITOR: /* monitor can't bridge anyway */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_NAN: WARN_ON(1); break; } } if (!err && ntype != otype && netif_running(dev)) { cfg80211_update_iface_num(rdev, ntype, 1); cfg80211_update_iface_num(rdev, otype, -1); } return err; } static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate) { int modulation, streams, bitrate; /* the formula below does only work for MCS values smaller than 32 */ if (WARN_ON_ONCE(rate->mcs >= 32)) return 0; modulation = rate->mcs & 7; streams = (rate->mcs >> 3) + 1; bitrate = (rate->bw == RATE_INFO_BW_40) ? 13500000 : 6500000; if (modulation < 4) bitrate *= (modulation + 1); else if (modulation == 4) bitrate *= (modulation + 2); else bitrate *= (modulation + 3); bitrate *= streams; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; } static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 15400, [7] = 19250, [8] = 23100, [9] = 25025, [10] = 30800, [11] = 38500, [12] = 46200, /* OFDM PHY */ [13] = 6930, [14] = 8662, /* 866.25 mbps */ [15] = 13860, [16] = 17325, [17] = 20790, [18] = 27720, [19] = 34650, [20] = 41580, [21] = 45045, [22] = 51975, [23] = 62370, [24] = 67568, /* 6756.75 mbps */ /* LP-SC PHY */ [25] = 6260, [26] = 8340, [27] = 11120, [28] = 12510, [29] = 16680, [30] = 22240, [31] = 25030, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs]; } static u32 cfg80211_calculate_bitrate_extended_sc_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { [6 - 6] = 26950, /* MCS 9.1 : 2695.0 mbps */ [7 - 6] = 50050, /* MCS 12.1 */ [8 - 6] = 53900, [9 - 6] = 57750, [10 - 6] = 63900, [11 - 6] = 75075, [12 - 6] = 80850, }; /* Extended SC MCS not defined for base MCS below 6 or above 12 */ if (WARN_ON_ONCE(rate->mcs < 6 || rate->mcs > 12)) return 0; return __mcs2bitrate[rate->mcs - 6]; } static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 13475, [7] = 15400, [8] = 19250, [9] = 23100, [10] = 25025, [11] = 26950, [12] = 30800, [13] = 38500, [14] = 46200, [15] = 50050, [16] = 53900, [17] = 57750, [18] = 69300, [19] = 75075, [20] = 80850, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs] * rate->n_bonded_ch; } static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) { static const u32 base[4][12] = { { 6500000, 13000000, 19500000, 26000000, 39000000, 52000000, 58500000, 65000000, 78000000, /* not in the spec, but some devices use this: */ 86700000, 97500000, 108300000, }, { 13500000, 27000000, 40500000, 54000000, 81000000, 108000000, 121500000, 135000000, 162000000, 180000000, 202500000, 225000000, }, { 29300000, 58500000, 87800000, 117000000, 175500000, 234000000, 263300000, 292500000, 351000000, 390000000, 438800000, 487500000, }, { 58500000, 117000000, 175500000, 234000000, 351000000, 468000000, 526500000, 585000000, 702000000, 780000000, 877500000, 975000000, }, }; u32 bitrate; int idx; if (rate->mcs > 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_160: idx = 3; break; case RATE_INFO_BW_80: idx = 2; break; case RATE_INFO_BW_40: idx = 1; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: default: goto warn; case RATE_INFO_BW_20: idx = 0; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) { #define SCALE 6144 u32 mcs_divisors[14] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ }; u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; u32 rates_969[3] = { 480388888, 453700000, 408333333 }; u32 rates_484[3] = { 229411111, 216666666, 195000000 }; u32 rates_242[3] = { 114711111, 108333333, 97500000 }; u32 rates_106[3] = { 40000000, 37777777, 34000000 }; u32 rates_52[3] = { 18820000, 17777777, 16000000 }; u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 13)) return 0; if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->he_ru_alloc > NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; if (rate->bw == RATE_INFO_BW_160) result = rates_160M[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) result = rates_969[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) result = rates_484[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242)) result = rates_242[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) result = rates_106[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) result = rates_52[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) result = rates_26[rate->he_gi]; else { WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", rate->bw, rate->he_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); result = tmp; /* and take NSS, DCM into account */ result = (result * rate->nss) / 8; if (rate->he_dcm) result /= 2; return result / 10000; } static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) { #define SCALE 6144 static const u32 mcs_divisors[16] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ 409600, /* 66.666666... */ 204800, /* 33.333333... */ }; static const u32 rates_996[3] = { 480388888, 453700000, 408333333 }; static const u32 rates_484[3] = { 229411111, 216666666, 195000000 }; static const u32 rates_242[3] = { 114711111, 108333333, 97500000 }; static const u32 rates_106[3] = { 40000000, 37777777, 34000000 }; static const u32 rates_52[3] = { 18820000, 17777777, 16000000 }; static const u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 15)) return 0; if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->eht_ru_alloc > NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; /* Bandwidth checks for MCS 14 */ if (rate->mcs == 14) { if ((rate->bw != RATE_INFO_BW_EHT_RU && rate->bw != RATE_INFO_BW_80 && rate->bw != RATE_INFO_BW_160 && rate->bw != RATE_INFO_BW_320) || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_2x996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) { WARN(1, "invalid EHT BW for MCS 14: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } } if (rate->bw == RATE_INFO_BW_320 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) result = 4 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484) result = 3 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996) result = 3 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484) result = 2 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_160 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996)) result = 2 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996)) result = rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484P242) result = rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484)) result = rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242)) result = rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26) result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106) result = rates_106[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26) result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52) result = rates_52[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26) result = rates_26[rate->eht_gi]; else { WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); /* and take NSS */ tmp *= rate->nss; do_div(tmp, 8); result = tmp; return result / 10000; } static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate) { /* For 1, 2, 4, 8 and 16 MHz channels */ static const u32 base[5][11] = { { 300000, 600000, 900000, 1200000, 1800000, 2400000, 2700000, 3000000, 3600000, 4000000, /* MCS 10 supported in 1 MHz only */ 150000, }, { 650000, 1300000, 1950000, 2600000, 3900000, 5200000, 5850000, 6500000, 7800000, /* MCS 9 not valid */ }, { 1350000, 2700000, 4050000, 5400000, 8100000, 10800000, 12150000, 13500000, 16200000, 18000000, }, { 2925000, 5850000, 8775000, 11700000, 17550000, 23400000, 26325000, 29250000, 35100000, 39000000, }, { 8580000, 11700000, 17550000, 23400000, 35100000, 46800000, 52650000, 58500000, 70200000, 78000000, }, }; u32 bitrate; /* default is 1 MHz index */ int idx = 0; if (rate->mcs >= 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_16: idx = 4; break; case RATE_INFO_BW_8: idx = 3; break; case RATE_INFO_BW_4: idx = 2; break; case RATE_INFO_BW_2: idx = 1; break; case RATE_INFO_BW_1: idx = 0; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: case RATE_INFO_BW_20: case RATE_INFO_BW_40: case RATE_INFO_BW_80: case RATE_INFO_BW_160: default: goto warn; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) return cfg80211_calculate_bitrate_ht(rate); if (rate->flags & RATE_INFO_FLAGS_DMG) return cfg80211_calculate_bitrate_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EXTENDED_SC_DMG) return cfg80211_calculate_bitrate_extended_sc_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EDMG) return cfg80211_calculate_bitrate_edmg(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); if (rate->flags & RATE_INFO_FLAGS_HE_MCS) return cfg80211_calculate_bitrate_he(rate); if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) return cfg80211_calculate_bitrate_eht(rate); if (rate->flags & RATE_INFO_FLAGS_S1G_MCS) return cfg80211_calculate_bitrate_s1g(rate); return rate->legacy; } EXPORT_SYMBOL(cfg80211_calculate_bitrate); int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, enum ieee80211_p2p_attr_id attr, u8 *buf, unsigned int bufsize) { u8 *out = buf; u16 attr_remaining = 0; bool desired_attr = false; u16 desired_len = 0; while (len > 0) { unsigned int iedatalen; unsigned int copy; const u8 *iedata; if (len < 2) return -EILSEQ; iedatalen = ies[1]; if (iedatalen + 2 > len) return -EILSEQ; if (ies[0] != WLAN_EID_VENDOR_SPECIFIC) goto cont; if (iedatalen < 4) goto cont; iedata = ies + 2; /* check WFA OUI, P2P subtype */ if (iedata[0] != 0x50 || iedata[1] != 0x6f || iedata[2] != 0x9a || iedata[3] != 0x09) goto cont; iedatalen -= 4; iedata += 4; /* check attribute continuation into this IE */ copy = min_t(unsigned int, attr_remaining, iedatalen); if (copy && desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_remaining) return desired_len; } attr_remaining -= copy; if (attr_remaining) goto cont; iedatalen -= copy; iedata += copy; while (iedatalen > 0) { u16 attr_len; /* P2P attribute ID & size must fit */ if (iedatalen < 3) return -EILSEQ; desired_attr = iedata[0] == attr; attr_len = get_unaligned_le16(iedata + 1); iedatalen -= 3; iedata += 3; copy = min_t(unsigned int, attr_len, iedatalen); if (desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_len) return desired_len; } iedata += copy; iedatalen -= copy; attr_remaining = attr_len - copy; } cont: len -= ies[1] + 2; ies += ies[1] + 2; } if (attr_remaining && desired_attr) return -EILSEQ; return -ENOENT; } EXPORT_SYMBOL(cfg80211_get_p2p_attr); static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext) { int i; /* Make sure array values are legal */ if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION)) return false; i = 0; while (i < n_ids) { if (ids[i] == WLAN_EID_EXTENSION) { if (id_ext && (ids[i + 1] == id)) return true; i += 2; continue; } if (ids[i] == id && !id_ext) return true; i++; } return false; } static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos) { /* we assume a validly formed IEs buffer */ u8 len = ies[pos + 1]; pos += 2 + len; /* the IE itself must have 255 bytes for fragments to follow */ if (len < 255) return pos; while (pos < ielen && ies[pos] == WLAN_EID_FRAGMENT) { len = ies[pos + 1]; pos += 2 + len; } return pos; } size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, const u8 *after_ric, int n_after_ric, size_t offset) { size_t pos = offset; while (pos < ielen) { u8 ext = 0; if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(ids, n_ids, ies[pos + ext], ies[pos] == WLAN_EID_EXTENSION)) break; if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { pos = skip_ie(ies, ielen, pos); while (pos < ielen) { if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; else ext = 0; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(after_ric, n_after_ric, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); else break; } } else { pos = skip_ie(ies, ielen, pos); } } return pos; } EXPORT_SYMBOL(ieee80211_ie_split_ric); void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id) { unsigned int elem_len; if (!len_pos) return; elem_len = skb->data + skb->len - len_pos - 1; while (elem_len > 255) { /* this one is 255 */ *len_pos = 255; /* remaining data gets smaller */ elem_len -= 255; /* make space for the fragment ID/len in SKB */ skb_put(skb, 2); /* shift back the remaining data to place fragment ID/len */ memmove(len_pos + 255 + 3, len_pos + 255 + 1, elem_len); /* place the fragment ID */ len_pos += 255 + 1; *len_pos = frag_id; /* and point to fragment length to update later */ len_pos++; } *len_pos = elem_len; } EXPORT_SYMBOL(ieee80211_fragment_element); bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band) { switch (operating_class) { case 112: case 115 ... 127: case 128 ... 130: *band = NL80211_BAND_5GHZ; return true; case 131 ... 135: case 137: *band = NL80211_BAND_6GHZ; return true; case 81: case 82: case 83: case 84: *band = NL80211_BAND_2GHZ; return true; case 180: *band = NL80211_BAND_60GHZ; return true; } return false; } EXPORT_SYMBOL(ieee80211_operating_class_to_band); bool ieee80211_operating_class_to_chandef(u8 operating_class, struct ieee80211_channel *chan, struct cfg80211_chan_def *chandef) { u32 control_freq, offset = 0; enum nl80211_band band; if (!ieee80211_operating_class_to_band(operating_class, &band) || !chan || band != chan->band) return false; control_freq = chan->center_freq; chandef->chan = chan; if (control_freq >= 5955) offset = control_freq - 5955; else if (control_freq >= 5745) offset = control_freq - 5745; else if (control_freq >= 5180) offset = control_freq - 5180; offset /= 20; switch (operating_class) { case 81: /* 2 GHz band; 20 MHz; channels 1..13 */ case 82: /* 2 GHz band; 20 MHz; channel 14 */ case 115: /* 5 GHz band; 20 MHz; channels 36,40,44,48 */ case 118: /* 5 GHz band; 20 MHz; channels 52,56,60,64 */ case 121: /* 5 GHz band; 20 MHz; channels 100..144 */ case 124: /* 5 GHz band; 20 MHz; channels 149,153,157,161 */ case 125: /* 5 GHz band; 20 MHz; channels 149..177 */ case 131: /* 6 GHz band; 20 MHz; channels 1..233*/ case 136: /* 6 GHz band; 20 MHz; channel 2 */ chandef->center_freq1 = control_freq; chandef->width = NL80211_CHAN_WIDTH_20; return true; case 83: /* 2 GHz band; 40 MHz; channels 1..9 */ case 116: /* 5 GHz band; 40 MHz; channels 36,44 */ case 119: /* 5 GHz band; 40 MHz; channels 52,60 */ case 122: /* 5 GHz band; 40 MHz; channels 100,108,116,124,132,140 */ case 126: /* 5 GHz band; 40 MHz; channels 149,157,165,173 */ chandef->center_freq1 = control_freq + 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 84: /* 2 GHz band; 40 MHz; channels 5..13 */ case 117: /* 5 GHz band; 40 MHz; channels 40,48 */ case 120: /* 5 GHz band; 40 MHz; channels 56,64 */ case 123: /* 5 GHz band; 40 MHz; channels 104,112,120,128,136,144 */ case 127: /* 5 GHz band; 40 MHz; channels 153,161,169,177 */ chandef->center_freq1 = control_freq - 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 132: /* 6 GHz band; 40 MHz; channels 1,5,..,229*/ chandef->center_freq1 = control_freq + 10 - (offset & 1) * 20; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 128: /* 5 GHz band; 80 MHz; channels 36..64,100..144,149..177 */ case 133: /* 6 GHz band; 80 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 30 - (offset & 3) * 20; chandef->width = NL80211_CHAN_WIDTH_80; return true; case 129: /* 5 GHz band; 160 MHz; channels 36..64,100..144,149..177 */ case 134: /* 6 GHz band; 160 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 70 - (offset & 7) * 20; chandef->width = NL80211_CHAN_WIDTH_160; return true; case 130: /* 5 GHz band; 80+80 MHz; channels 36..64,100..144,149..177 */ case 135: /* 6 GHz band; 80+80 MHz; channels 1,5,..,229 */ /* The center_freq2 of 80+80 MHz is unknown */ case 137: /* 6 GHz band; 320 MHz; channels 1,5,..,229 */ /* 320-1 or 320-2 channelization is unknown */ default: return false; } } EXPORT_SYMBOL(ieee80211_operating_class_to_chandef); bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class) { u8 vht_opclass; u32 freq = chandef->center_freq1; if (freq >= 2412 && freq <= 2472) { if (chandef->width > NL80211_CHAN_WIDTH_40) return false; /* 2.407 GHz, channels 1..13 */ if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 83; /* HT40+ */ else *op_class = 84; /* HT40- */ } else { *op_class = 81; } return true; } if (freq == 2484) { /* channel 14 is only for IEEE 802.11b */ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT) return false; *op_class = 82; /* channel 14 */ return true; } switch (chandef->width) { case NL80211_CHAN_WIDTH_80: vht_opclass = 128; break; case NL80211_CHAN_WIDTH_160: vht_opclass = 129; break; case NL80211_CHAN_WIDTH_80P80: vht_opclass = 130; break; case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_5: return false; /* unsupported for now */ default: vht_opclass = 0; break; } /* 5 GHz, channels 36..48 */ if (freq >= 5180 && freq <= 5240) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 116; else *op_class = 117; } else { *op_class = 115; } return true; } /* 5 GHz, channels 52..64 */ if (freq >= 5260 && freq <= 5320) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 119; else *op_class = 120; } else { *op_class = 118; } return true; } /* 5 GHz, channels 100..144 */ if (freq >= 5500 && freq <= 5720) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 122; else *op_class = 123; } else { *op_class = 121; } return true; } /* 5 GHz, channels 149..169 */ if (freq >= 5745 && freq <= 5845) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 126; else *op_class = 127; } else if (freq <= 5805) { *op_class = 124; } else { *op_class = 125; } return true; } /* 56.16 GHz, channel 1..4 */ if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) { if (chandef->width >= NL80211_CHAN_WIDTH_40) return false; *op_class = 180; return true; } /* not supported yet */ return false; } EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); static int cfg80211_wdev_bi(struct wireless_dev *wdev) { switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: WARN_ON(wdev->valid_links); return wdev->links[0].ap.beacon_interval; case NL80211_IFTYPE_MESH_POINT: return wdev->u.mesh.beacon_interval; case NL80211_IFTYPE_ADHOC: return wdev->u.ibss.beacon_interval; default: break; } return 0; } static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, u32 *beacon_int_gcd, bool *beacon_int_different) { struct wireless_dev *wdev; *beacon_int_gcd = 0; *beacon_int_different = false; list_for_each_entry(wdev, &wiphy->wdev_list, list) { int wdev_bi; /* this feature isn't supported with MLO */ if (wdev->valid_links) continue; wdev_bi = cfg80211_wdev_bi(wdev); if (!wdev_bi) continue; if (!*beacon_int_gcd) { *beacon_int_gcd = wdev_bi; continue; } if (wdev_bi == *beacon_int_gcd) continue; *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, wdev_bi); } if (new_beacon_int && *beacon_int_gcd != new_beacon_int) { if (*beacon_int_gcd) *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, new_beacon_int); } } int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, u32 beacon_int) { /* * This is just a basic pre-condition check; if interface combinations * are possible the driver must already be checking those with a call * to cfg80211_check_combinations(), in which case we'll validate more * through the cfg80211_calculate_bi_data() call and code in * cfg80211_iter_combinations(). */ if (beacon_int < 10 || beacon_int > 10000) return -EINVAL; return 0; } int cfg80211_iter_combinations(struct wiphy *wiphy, struct iface_combination_params *params, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data) { const struct ieee80211_regdomain *regdom; enum nl80211_dfs_regions region = 0; int i, j, iftype; int num_interfaces = 0; u32 used_iftypes = 0; u32 beacon_int_gcd; bool beacon_int_different; /* * This is a bit strange, since the iteration used to rely only on * the data given by the driver, but here it now relies on context, * in form of the currently operating interfaces. * This is OK for all current users, and saves us from having to * push the GCD calculations into all the drivers. * In the future, this should probably rely more on data that's in * cfg80211 already - the only thing not would appear to be any new * interfaces (while being brought up) and channel/radar data. */ cfg80211_calculate_bi_data(wiphy, params->new_beacon_int, &beacon_int_gcd, &beacon_int_different); if (params->radar_detect) { rcu_read_lock(); regdom = rcu_dereference(cfg80211_regdomain); if (regdom) region = regdom->dfs_region; rcu_read_unlock(); } for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { num_interfaces += params->iftype_num[iftype]; if (params->iftype_num[iftype] > 0 && !cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) used_iftypes |= BIT(iftype); } for (i = 0; i < wiphy->n_iface_combinations; i++) { const struct ieee80211_iface_combination *c; struct ieee80211_iface_limit *limits; u32 all_iftypes = 0; c = &wiphy->iface_combinations[i]; if (num_interfaces > c->max_interfaces) continue; if (params->num_different_channels > c->num_different_channels) continue; limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits, GFP_KERNEL); if (!limits) return -ENOMEM; for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { if (cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) continue; for (j = 0; j < c->n_limits; j++) { all_iftypes |= limits[j].types; if (!(limits[j].types & BIT(iftype))) continue; if (limits[j].max < params->iftype_num[iftype]) goto cont; limits[j].max -= params->iftype_num[iftype]; } } if (params->radar_detect != (c->radar_detect_widths & params->radar_detect)) goto cont; if (params->radar_detect && c->radar_detect_regions && !(c->radar_detect_regions & BIT(region))) goto cont; /* Finally check that all iftypes that we're currently * using are actually part of this combination. If they * aren't then we can't use this combination and have * to continue to the next. */ if ((all_iftypes & used_iftypes) != used_iftypes) goto cont; if (beacon_int_gcd) { if (c->beacon_int_min_gcd && beacon_int_gcd < c->beacon_int_min_gcd) goto cont; if (!c->beacon_int_min_gcd && beacon_int_different) goto cont; } /* This combination covered all interface types and * supported the requested numbers, so we're good. */ (*iter)(c, data); cont: kfree(limits); } return 0; } EXPORT_SYMBOL(cfg80211_iter_combinations); static void cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c, void *data) { int *num = data; (*num)++; } int cfg80211_check_combinations(struct wiphy *wiphy, struct iface_combination_params *params) { int err, num = 0; err = cfg80211_iter_combinations(wiphy, params, cfg80211_iter_sum_ifcombs, &num); if (err) return err; if (num == 0) return -EBUSY; return 0; } EXPORT_SYMBOL(cfg80211_check_combinations); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask) { int i, j; if (!sband) return -EINVAL; if (n_rates == 0 || n_rates > NL80211_MAX_SUPP_RATES) return -EINVAL; *mask = 0; for (i = 0; i < n_rates; i++) { int rate = (rates[i] & 0x7f) * 5; bool found = false; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].bitrate == rate) { found = true; *mask |= BIT(j); break; } } if (!found) return -EINVAL; } /* * mask must have at least one bit set here since we * didn't accept a 0-length rates array nor allowed * entries in the array that didn't exist */ return 0; } unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy) { enum nl80211_band band; unsigned int n_channels = 0; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) n_channels += wiphy->bands[band]->n_channels; return n_channels; } EXPORT_SYMBOL(ieee80211_get_num_supported_channels); int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; wdev = dev->ieee80211_ptr; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (!rdev->ops->get_station) return -EOPNOTSUPP; memset(sinfo, 0, sizeof(*sinfo)); return rdev_get_station(rdev, dev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); void cfg80211_free_nan_func(struct cfg80211_nan_func *f) { int i; if (!f) return; kfree(f->serv_spec_info); kfree(f->srf_bf); kfree(f->srf_macs); for (i = 0; i < f->num_rx_filters; i++) kfree(f->rx_filters[i].filter); for (i = 0; i < f->num_tx_filters; i++) kfree(f->tx_filters[i].filter); kfree(f->rx_filters); kfree(f->tx_filters); kfree(f); } EXPORT_SYMBOL(cfg80211_free_nan_func); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz) { u32 start_freq_khz, end_freq_khz; start_freq_khz = center_freq_khz - (bw_khz / 2); end_freq_khz = center_freq_khz + (bw_khz / 2); if (start_freq_khz >= freq_range->start_freq_khz && end_freq_khz <= freq_range->end_freq_khz) return true; return false; } int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, sizeof(*(sinfo->pertid)), gfp); if (!sinfo->pertid) return -ENOMEM; return 0; } EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats); /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; EXPORT_SYMBOL(rfc1042_header); /* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; EXPORT_SYMBOL(bridge_tunnel_header); /* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ struct iapp_layer2_update { u8 da[ETH_ALEN]; /* broadcast */ u8 sa[ETH_ALEN]; /* STA addr */ __be16 len; /* 6 */ u8 dsap; /* 0 */ u8 ssap; /* 0 */ u8 control; u8 xid_info[3]; } __packed; void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr) { struct iapp_layer2_update *msg; struct sk_buff *skb; /* Send Level 2 Update Frame to update forwarding tables in layer 2 * bridge devices */ skb = dev_alloc_skb(sizeof(*msg)); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ eth_broadcast_addr(msg->da); ether_addr_copy(msg->sa, addr); msg->len = htons(6); msg->dsap = 0; msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ msg->control = 0xaf; /* XID response lsb.1111F101. * F=0 (no poll command; unsolicited frame) */ msg->xid_info[0] = 0x81; /* XID format identifier */ msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, int mcs, bool ext_nss_bw_capable, unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); int ext_nss_bw; int supp_width; int i, mcs_encoding; if (map == 0xffff) return 0; if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; else if (mcs == 8) mcs_encoding = 1; else mcs_encoding = 2; if (!max_vht_nss) { /* find max_vht_nss for the given MCS */ for (i = 7; i >= 0; i--) { int supp = (map >> (2 * i)) & 3; if (supp == 3) continue; if (supp >= mcs_encoding) { max_vht_nss = i + 1; break; } } } if (!(cap->supp_mcs.tx_mcs_map & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) return max_vht_nss; ext_nss_bw = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); supp_width = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); /* if not capable, treat ext_nss_bw as 0 */ if (!ext_nss_bw_capable) ext_nss_bw = 0; /* This is invalid */ if (supp_width == 3) return 0; /* This is an invalid combination so pretend nothing is supported */ if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return 0; /* * Cover all the special cases according to IEEE 802.11-2016 * Table 9-250. All other cases are either factor of 1 or not * valid/supported. */ switch (bw) { case IEEE80211_VHT_CHANWIDTH_USE_HT: case IEEE80211_VHT_CHANWIDTH_80MHZ: if ((supp_width == 1 || supp_width == 2) && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_160MHZ: if (supp_width == 0 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: if (supp_width == 0 && ext_nss_bw == 1) return 0; /* not possible */ if (supp_width == 0 && ext_nss_bw == 2) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 0) return 0; /* not possible */ if (supp_width == 1 && ext_nss_bw == 1) return max_vht_nss / 2; if (supp_width == 1 && ext_nss_bw == 2) return (3 * max_vht_nss) / 4; break; } /* not covered or invalid combination received */ return max_vht_nss; } EXPORT_SYMBOL(ieee80211_get_vht_max_nss); bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif) { bool is_vlan = iftype == NL80211_IFTYPE_AP_VLAN; switch (check_swif) { case 0: if (is_vlan && is_4addr) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->interface_modes & BIT(iftype); case 1: if (!(wiphy->software_iftypes & BIT(iftype)) && is_vlan) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->software_iftypes & BIT(iftype); default: break; } return false; } EXPORT_SYMBOL(cfg80211_iftype_allowed); void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); lockdep_assert_wiphy(wdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, wdev->netdev, link_id, true); break; default: /* per-link not relevant */ break; } wdev->valid_links &= ~BIT(link_id); rdev_del_intf_link(rdev, wdev, link_id); eth_zero_addr(wdev->links[link_id].addr); } void cfg80211_remove_links(struct wireless_dev *wdev) { unsigned int link_id; /* * links are controlled by upper layers (userspace/cfg) * only for AP mode, so only remove them here for AP */ if (wdev->iftype != NL80211_IFTYPE_AP) return; if (wdev->valid_links) { for_each_valid_link(wdev, link_id) cfg80211_remove_link(wdev, link_id); } } int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { cfg80211_remove_links(wdev); return rdev_del_virtual_intf(rdev, wdev); } const struct wiphy_iftype_ext_capab * cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type) { int i; for (i = 0; i < wiphy->num_iftype_ext_capab; i++) { if (wiphy->iftype_ext_capab[i].iftype == type) return &wiphy->iftype_ext_capab[i]; } return NULL; } EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa); |
568 368 1244 47 19 4 19 729 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H #include <linux/math64.h> #include <vdso/time64.h> typedef __s64 time64_t; typedef __u64 timeu64_t; #include <uapi/linux/time.h> struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; struct itimerspec64 { struct timespec64 it_interval; struct timespec64 it_value; }; /* Parameters used to convert the timespec values: */ #define PSEC_PER_NSEC 1000L /* Located here for timespec[64]_valid_strict */ #define TIME64_MAX ((s64)~((u64)1 << 63)) #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) #define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): * * To prevent setting the time close to the wraparound point time setting * is limited so a reasonable uptime can be accomodated. Uptime of 30 years * should be really sufficient, which means the cutoff is 2232. At that * point the cutoff is just a small part of the larger problem. */ #define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 *3600) #define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec64 timespec64_sub(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec64 is norm, false if denorm: */ static inline bool timespec64_valid(const struct timespec64 *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec64_valid_strict(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } static inline bool timespec64_valid_settod(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values which cause overflow issues vs. CLOCK_REALTIME */ if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) return false; return true; } /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted * * Returns the scalar nanosecond representation of the timespec64 * parameter. */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { /* Prevent multiplication overflow / underflow */ if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; if (ts->tv_sec <= KTIME_SEC_MIN) return KTIME_MIN; return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 * @a: pointer to timespec64 to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because its used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); #endif /* _LINUX_TIME64_H */ |
238 239 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 | // SPDX-License-Identifier: GPL-2.0-only /* * x86 APERF/MPERF KHz calculation for * /sys/.../cpufreq/scaling_cur_freq * * Copyright (C) 2017 Intel Corp. * Author: Len Brown <len.brown@intel.com> */ #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/sched/isolation.h> #include <linux/sched/topology.h> #include <linux/smp.h> #include <linux/syscore_ops.h> #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "cpu.h" struct aperfmperf { seqcount_t seq; unsigned long last_update; u64 acnt; u64 mcnt; u64 aperf; u64 mperf; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { .seq = SEQCNT_ZERO(cpu_samples.seq) }; static void init_counter_refs(void) { u64 aperf, mperf; rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); } #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* * APERF/MPERF frequency ratio computation. * * The scheduler wants to do frequency invariant accounting and needs a <1 * ratio to account for the 'current' frequency, corresponding to * freq_curr / freq_max. * * Since the frequency freq_curr on x86 is controlled by micro-controller and * our P-state setting is little more than a request/hint, we need to observe * the effective frequency 'BusyMHz', i.e. the average frequency over a time * interval after discarding idle time. This is given by: * * BusyMHz = delta_APERF / delta_MPERF * freq_base * * where freq_base is the max non-turbo P-state. * * The freq_max term has to be set to a somewhat arbitrary value, because we * can't know which turbo states will be available at a given point in time: * it all depends on the thermal headroom of the entire package. We set it to * the turbo level with 4 cores active. * * Benchmarks show that's a good compromise between the 1C turbo ratio * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, * which would ignore the entire turbo range (a conspicuous part, making * freq_curr/freq_max always maxed out). * * An exception to the heuristic above is the Atom uarch, where we choose the * highest turbo level for freq_max since Atom's are generally oriented towards * power efficiency. * * Setting freq_max to anything less than the 1C turbo ratio makes the ratio * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; void arch_set_max_freq_ratio(bool turbo_disabled) { arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : arch_turbo_freq_ratio; } EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); static bool __init turbo_disabled(void) { u64 misc_en; int err; err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); } static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) return false; err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ return true; } #define X86_MATCH(vfm) \ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_XEON_PHI_KNL), X86_MATCH(INTEL_XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { X86_MATCH(INTEL_ATOM_GOLDMONT), X86_MATCH(INTEL_ATOM_GOLDMONT_D), X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS), {} }; static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int num_delta_fratio) { int fratio, delta_fratio, found; int err, i; u64 msr; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; fratio = (msr >> 8) & 0xFF; i = 16; found = 0; do { if (found >= num_delta_fratio) { *turbo_freq = fratio; return true; } delta_fratio = (msr >> (i + 5)) & 0x7; if (delta_fratio) { found += 1; fratio -= delta_fratio; } i += 8; } while (i < 64); return true; } static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) { u64 ratios, counts; u32 group_size; int err, i; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; for (i = 0; i < 64; i += 8) { group_size = (counts >> i) & 0xFF; if (group_size >= size) { *turbo_freq = (ratios >> i) & 0xFF; return true; } } return false; } static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { u64 msr; int err; err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ /* The CPU may have less than 4 cores */ if (!*turbo_freq) *turbo_freq = msr & 0xFF; /* 1C turbo */ return true; } static bool __init intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; u64 turbo_ratio; if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; if (x86_match_cpu(has_glm_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_knl_turbo_ratio_limits) && knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) goto out; if (x86_match_cpu(has_skx_turbo_ratio_limits) && skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) goto out; if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; return false; out: /* * Some hypervisors advertise X86_FEATURE_APERFMPERF * but then fill all MSR's with zeroes. * Some CPUs have turbo boost but don't declare any turbo ratio * in MSR_TURBO_RATIO_LIMIT. */ if (!base_freq || !turbo_freq) { pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); return false; } turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); if (!turbo_ratio) { pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); return false; } arch_turbo_freq_ratio = turbo_ratio; arch_set_max_freq_ratio(turbo_disabled()); return true; } #ifdef CONFIG_PM_SLEEP static struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, }; static void register_freq_invariance_syscore_ops(void) { register_syscore_ops(&freq_invariance_syscore_ops); } #else static inline void register_freq_invariance_syscore_ops(void) {} #endif static void freq_invariance_enable(void) { if (static_branch_unlikely(&arch_scale_freq_key)) { WARN_ON_ONCE(1); return; } static_branch_enable(&arch_scale_freq_key); register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { arch_turbo_freq_ratio = ratio; arch_set_max_freq_ratio(turbo_disabled); freq_invariance_enable(); } static void __init bp_init_freq_invariance(void) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; if (intel_set_max_freq_ratio()) freq_invariance_enable(); } static void disable_freq_invariance_workfn(struct work_struct *work) { int cpu; static_branch_disable(&arch_scale_freq_key); /* * Set arch_freq_scale to a default value on all cpus * This negates the effect of scaling */ for_each_possible_cpu(cpu) per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE; } static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; static void scale_freq_tick(u64 acnt, u64 mcnt) { u64 freq_scale; if (!arch_scale_freq_invariant()) return; if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) goto error; freq_scale = div64_u64(acnt, mcnt); if (!freq_scale) goto error; if (freq_scale > SCHED_CAPACITY_SCALE) freq_scale = SCHED_CAPACITY_SCALE; this_cpu_write(arch_freq_scale, freq_scale); return; error: pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); schedule_work(&disable_freq_invariance_work); } #else static inline void bp_init_freq_invariance(void) { } static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } #endif /* CONFIG_X86_64 && CONFIG_SMP */ void arch_scale_freq_tick(void) { struct aperfmperf *s = this_cpu_ptr(&cpu_samples); u64 acnt, mcnt, aperf, mperf; if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); acnt = aperf - s->aperf; mcnt = mperf - s->mperf; s->aperf = aperf; s->mperf = mperf; raw_write_seqcount_begin(&s->seq); s->last_update = jiffies; s->acnt = acnt; s->mcnt = mcnt; raw_write_seqcount_end(&s->seq); scale_freq_tick(acnt, mcnt); } /* * Discard samples older than the define maximum sample age of 20ms. There * is no point in sending IPIs in such a case. If the scheduler tick was * not running then the CPU is either idle or isolated. */ #define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) unsigned int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); unsigned int seq, freq; unsigned long last; u64 acnt, mcnt; if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) goto fallback; do { seq = raw_read_seqcount_begin(&s->seq); last = s->last_update; acnt = s->acnt; mcnt = s->mcnt; } while (read_seqcount_retry(&s->seq, seq)); /* * Bail on invalid count and when the last update was too long ago, * which covers idle and NOHZ full CPUs. */ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) goto fallback; return div64_u64((cpu_khz * acnt), mcnt); fallback: freq = cpufreq_quick_get(cpu); return freq ? freq : cpu_khz; } static int __init bp_init_aperfmperf(void) { if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return 0; init_counter_refs(); bp_init_freq_invariance(); return 0; } early_initcall(bp_init_aperfmperf); void ap_init_aperfmperf(void) { if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) init_counter_refs(); } |
114 113 113 96 83 96 96 96 95 24 22 56 24 24 99 98 72 99 96 97 96 68 96 68 95 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Device management routines * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/slab.h> #include <linux/time.h> #include <linux/export.h> #include <linux/errno.h> #include <sound/core.h> /** * snd_device_new - create an ALSA device component * @card: the card instance * @type: the device type, SNDRV_DEV_XXX * @device_data: the data pointer of this device * @ops: the operator table * * Creates a new device component for the given data pointer. * The device will be assigned to the card and managed together * by the card. * * The data pointer plays a role as the identifier, too, so the * pointer address must be unique and unchanged. * * Return: Zero if successful, or a negative error code on failure. */ int snd_device_new(struct snd_card *card, enum snd_device_type type, void *device_data, const struct snd_device_ops *ops) { struct snd_device *dev; struct list_head *p; if (snd_BUG_ON(!card || !device_data || !ops)) return -ENXIO; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; INIT_LIST_HEAD(&dev->list); dev->card = card; dev->type = type; dev->state = SNDRV_DEV_BUILD; dev->device_data = device_data; dev->ops = ops; /* insert the entry in an incrementally sorted list */ list_for_each_prev(p, &card->devices) { struct snd_device *pdev = list_entry(p, struct snd_device, list); if ((unsigned int)pdev->type <= (unsigned int)type) break; } list_add(&dev->list, p); return 0; } EXPORT_SYMBOL(snd_device_new); static void __snd_device_disconnect(struct snd_device *dev) { if (dev->state == SNDRV_DEV_REGISTERED) { if (dev->ops->dev_disconnect && dev->ops->dev_disconnect(dev)) dev_err(dev->card->dev, "device disconnect failure\n"); dev->state = SNDRV_DEV_DISCONNECTED; } } static void __snd_device_free(struct snd_device *dev) { /* unlink */ list_del(&dev->list); __snd_device_disconnect(dev); if (dev->ops->dev_free) { if (dev->ops->dev_free(dev)) dev_err(dev->card->dev, "device free failure\n"); } kfree(dev); } static struct snd_device *look_for_dev(struct snd_card *card, void *device_data) { struct snd_device *dev; list_for_each_entry(dev, &card->devices, list) if (dev->device_data == device_data) return dev; return NULL; } /** * snd_device_disconnect - disconnect the device * @card: the card instance * @device_data: the data pointer to disconnect * * Turns the device into the disconnection state, invoking * dev_disconnect callback, if the device was already registered. * * Usually called from snd_card_disconnect(). * * Return: Zero if successful, or a negative error code on failure or if the * device not found. */ void snd_device_disconnect(struct snd_card *card, void *device_data) { struct snd_device *dev; if (snd_BUG_ON(!card || !device_data)) return; dev = look_for_dev(card, device_data); if (dev) __snd_device_disconnect(dev); else dev_dbg(card->dev, "device disconnect %p (from %pS), not found\n", device_data, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(snd_device_disconnect); /** * snd_device_free - release the device from the card * @card: the card instance * @device_data: the data pointer to release * * Removes the device from the list on the card and invokes the * callbacks, dev_disconnect and dev_free, corresponding to the state. * Then release the device. */ void snd_device_free(struct snd_card *card, void *device_data) { struct snd_device *dev; if (snd_BUG_ON(!card || !device_data)) return; dev = look_for_dev(card, device_data); if (dev) __snd_device_free(dev); else dev_dbg(card->dev, "device free %p (from %pS), not found\n", device_data, __builtin_return_address(0)); } EXPORT_SYMBOL(snd_device_free); static int __snd_device_register(struct snd_device *dev) { if (dev->state == SNDRV_DEV_BUILD) { if (dev->ops->dev_register) { int err = dev->ops->dev_register(dev); if (err < 0) return err; } dev->state = SNDRV_DEV_REGISTERED; } return 0; } /** * snd_device_register - register the device * @card: the card instance * @device_data: the data pointer to register * * Registers the device which was already created via * snd_device_new(). Usually this is called from snd_card_register(), * but it can be called later if any new devices are created after * invocation of snd_card_register(). * * Return: Zero if successful, or a negative error code on failure or if the * device not found. */ int snd_device_register(struct snd_card *card, void *device_data) { struct snd_device *dev; if (snd_BUG_ON(!card || !device_data)) return -ENXIO; dev = look_for_dev(card, device_data); if (dev) return __snd_device_register(dev); snd_BUG(); return -ENXIO; } EXPORT_SYMBOL(snd_device_register); /* * register all the devices on the card. * called from init.c */ int snd_device_register_all(struct snd_card *card) { struct snd_device *dev; int err; if (snd_BUG_ON(!card)) return -ENXIO; list_for_each_entry(dev, &card->devices, list) { err = __snd_device_register(dev); if (err < 0) return err; } return 0; } /* * disconnect all the devices on the card. * called from init.c */ void snd_device_disconnect_all(struct snd_card *card) { struct snd_device *dev; if (snd_BUG_ON(!card)) return; list_for_each_entry_reverse(dev, &card->devices, list) __snd_device_disconnect(dev); } /* * release all the devices on the card. * called from init.c */ void snd_device_free_all(struct snd_card *card) { struct snd_device *dev, *next; if (snd_BUG_ON(!card)) return; list_for_each_entry_safe_reverse(dev, next, &card->devices, list) { /* exception: free ctl and lowlevel stuff later */ if (dev->type == SNDRV_DEV_CONTROL || dev->type == SNDRV_DEV_LOWLEVEL) continue; __snd_device_free(dev); } /* free all */ list_for_each_entry_safe_reverse(dev, next, &card->devices, list) __snd_device_free(dev); } /** * snd_device_get_state - Get the current state of the given device * @card: the card instance * @device_data: the data pointer to release * * Returns the current state of the given device object. For the valid * device, either @SNDRV_DEV_BUILD, @SNDRV_DEV_REGISTERED or * @SNDRV_DEV_DISCONNECTED is returned. * Or for a non-existing device, -1 is returned as an error. * * Return: the current state, or -1 if not found */ int snd_device_get_state(struct snd_card *card, void *device_data) { struct snd_device *dev; dev = look_for_dev(card, device_data); if (dev) return dev->state; return -1; } EXPORT_SYMBOL_GPL(snd_device_get_state); |
13 924 924 13 935 1 4153 3512 933 921 13 934 8 8 8 932 935 934 20 936 896 900 4155 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 | // SPDX-License-Identifier: GPL-2.0 /* * A fast, small, non-recursive O(n log n) sort for the Linux kernel * * This performs n*log2(n) + 0.37*n + o(n) comparisons on average, * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case. * * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n * better) at the expense of stack usage and much larger code to avoid * quicksort's O(n^2) worst case. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/export.h> #include <linux/sort.h> /** * is_aligned - is this pointer & size okay for word-wide copying? * @base: pointer to data * @size: size of each element * @align: required alignment (typically 4 or 8) * * Returns true if elements can be copied using word loads and stores. * The size must be a multiple of the alignment, and the base address must * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. * * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" * to "if ((a | b) & mask)", so we do that by hand. */ __attribute_const__ __always_inline static bool is_aligned(const void *base, size_t size, unsigned char align) { unsigned char lsbits = (unsigned char)size; (void)base; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS lsbits |= (unsigned char)(uintptr_t)base; #endif return (lsbits & (align - 1)) == 0; } /** * swap_words_32 - swap two elements in 32-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 4) * * Exchange the two objects in memory. This exploits base+index addressing, * which basically all CPUs have, to minimize loop overhead computations. * * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the * bottom of the loop, even though the zero flag is still valid from the * subtract (since the intervening mov instructions don't alter the flags). * Gcc 8.1.0 doesn't have that problem. */ static void swap_words_32(void *a, void *b, size_t n) { do { u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; } while (n); } /** * swap_words_64 - swap two elements in 64-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 8) * * Exchange the two objects in memory. This exploits base+index * addressing, which basically all CPUs have, to minimize loop overhead * computations. * * We'd like to use 64-bit loads if possible. If they're not, emulating * one requires base+index+4 addressing which x86 has but most other * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, * but it's possible to have 64-bit loads without 64-bit pointers (e.g. * x32 ABI). Are there any cases the kernel needs to worry about? */ static void swap_words_64(void *a, void *b, size_t n) { do { #ifdef CONFIG_64BIT u64 t = *(u64 *)(a + (n -= 8)); *(u64 *)(a + n) = *(u64 *)(b + n); *(u64 *)(b + n) = t; #else /* Use two 32-bit transfers to avoid base+index+4 addressing */ u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; #endif } while (n); } /** * swap_bytes - swap two elements a byte at a time * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size * * This is the fallback if alignment doesn't allow using larger chunks. */ static void swap_bytes(void *a, void *b, size_t n) { do { char t = ((char *)a)[--n]; ((char *)a)[n] = ((char *)b)[n]; ((char *)b)[n] = t; } while (n); } /* * The values are arbitrary as long as they can't be confused with * a pointer, but small integers make for the smallest compare * instructions. */ #define SWAP_WORDS_64 (swap_r_func_t)0 #define SWAP_WORDS_32 (swap_r_func_t)1 #define SWAP_BYTES (swap_r_func_t)2 #define SWAP_WRAPPER (swap_r_func_t)3 struct wrapper { cmp_func_t cmp; swap_func_t swap; }; /* * The function pointer is last to make tail calls most efficient if the * compiler decides not to inline this function. */ static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { ((const struct wrapper *)priv)->swap(a, b, (int)size); return; } if (swap_func == SWAP_WORDS_64) swap_words_64(a, b, size); else if (swap_func == SWAP_WORDS_32) swap_words_32(a, b, size); else if (swap_func == SWAP_BYTES) swap_bytes(a, b, size); else swap_func(a, b, (int)size, priv); } #define _CMP_WRAPPER ((cmp_r_func_t)0L) static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) { if (cmp == _CMP_WRAPPER) return ((const struct wrapper *)priv)->cmp(a, b); return cmp(a, b, priv); } /** * parent - given the offset of the child, find the offset of the parent. * @i: the offset of the heap element whose parent is sought. Non-zero. * @lsbit: a precomputed 1-bit mask, equal to "size & -size" * @size: size of each element * * In terms of array indexes, the parent of element j = @i/@size is simply * (j-1)/2. But when working in byte offsets, we can't use implicit * truncation of integer divides. * * Fortunately, we only need one bit of the quotient, not the full divide. * @size has a least significant bit. That bit will be clear if @i is * an even multiple of @size, and set if it's an odd multiple. * * Logically, we're doing "if (i & lsbit) i -= size;", but since the * branch is unpredictable, it's done with a bit of clever branch-free * code instead. */ __attribute_const__ __always_inline static size_t parent(size_t i, unsigned int lsbit, size_t size) { i -= size; i -= size & -(i & lsbit); return i / 2; } /** * sort_r - sort an array of elements * @base: pointer to data to sort * @num: number of elements * @size: size of each element * @cmp_func: pointer to comparison function * @swap_func: pointer to swap function or NULL * @priv: third argument passed to comparison function * * This function does a heapsort on the given array. You may provide * a swap_func function if you need to do something more than a memory * copy (e.g. fix up pointers or auxiliary data), but the built-in swap * avoids a slow retpoline and so is significantly faster. * * Sorting time is O(n log n) both on average and worst-case. While * quicksort is slightly faster on average, it suffers from exploitable * O(n*n) worst-case behavior and extra memory requirements that make * it less suitable for kernel use. */ void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv) { /* pre-scale counters for performance */ size_t n = num * size, a = (num/2) * size; const unsigned int lsbit = size & -size; /* Used to find parent */ size_t shift = 0; if (!a) /* num < 2 || size == 0 */ return; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) swap_func = NULL; if (!swap_func) { if (is_aligned(base, size, 8)) swap_func = SWAP_WORDS_64; else if (is_aligned(base, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; } /* * Loop invariants: * 1. elements [a,n) satisfy the heap property (compare greater than * all of their children), * 2. elements [n,num*size) are sorted, and * 3. a <= b <= c <= d <= n (whenever they are valid). */ for (;;) { size_t b, c, d; if (a) /* Building heap: sift down a */ a -= size << shift; else if (n > 3 * size) { /* Sorting: Extract two largest elements */ n -= size; do_swap(base, base + n, size, swap_func, priv); shift = do_cmp(base + size, base + 2 * size, cmp_func, priv) <= 0; a = size << shift; n -= size; do_swap(base + a, base + n, size, swap_func, priv); } else if (n > size) { /* Sorting: Extract root */ n -= size; do_swap(base, base + n, size, swap_func, priv); } else { /* Sort complete */ break; } /* * Sift element at "a" down into heap. This is the * "bottom-up" variant, which significantly reduces * calls to cmp_func(): we find the sift-down path all * the way to the leaves (one compare per level), then * backtrack to find where to insert the target element. * * Because elements tend to sift down close to the leaves, * this uses fewer compares than doing two per level * on the way down. (A bit more than half as many on * average, 3/4 worst-case.) */ for (b = a; c = 2*b + size, (d = c + size) < n;) b = do_cmp(base + c, base + d, cmp_func, priv) > 0 ? c : d; if (d == n) /* Special case last leaf with no sibling */ b = c; /* Now backtrack from "b" to the correct location for "a" */ while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0) b = parent(b, lsbit, size); c = b; /* Where "a" belongs */ while (b != a) { /* Shift it into place */ b = parent(b, lsbit, size); do_swap(base + b, base + c, size, swap_func, priv); } } } EXPORT_SYMBOL(sort_r); void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) { struct wrapper w = { .cmp = cmp_func, .swap = swap_func, }; return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); } EXPORT_SYMBOL(sort); |
20605 1625 20102 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | // SPDX-License-Identifier: GPL-2.0 #include <linux/fault-inject.h> #include <linux/mm.h> static struct { struct fault_attr attr; bool ignore_gfp_highmem; bool ignore_gfp_reclaim; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; static int __init setup_fail_page_alloc(char *str) { return setup_fault_attr(&fail_page_alloc.attr, str); } __setup("fail_page_alloc=", setup_fail_page_alloc); bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { int flags = 0; if (order < fail_page_alloc.min_order) return false; if (gfp_mask & __GFP_NOFAIL) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; if (fail_page_alloc.ignore_gfp_reclaim && (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; /* See comment in __should_failslab() */ if (gfp_mask & __GFP_NOWARN) flags |= FAULT_NOWARN; return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_page_alloc_debugfs(void) { umode_t mode = S_IFREG | 0600; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, &fail_page_alloc.attr); debugfs_create_bool("ignore-gfp-wait", mode, dir, &fail_page_alloc.ignore_gfp_reclaim); debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem); debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); return 0; } late_initcall(fail_page_alloc_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
39 12 141 42 29 73 28 68 12 18 49 15 24 7 7 3 23 24 13 33 13 33 33 33 8 49 48 48 27 27 39 25 33 33 12 33 49 2 49 49 48 15 38 38 17 28 5 5 21 16 1 1 37 5 37 29 4 17 26 19 7 61 4 61 31 5 59 35 8 33 1 27 4 27 21 6 18 33 6 28 48 48 6 45 27 27 6 27 26 1 26 25 2 14 14 14 14 14 14 14 14 14 14 14 65 123 28 1 76 77 76 61 74 59 16 75 102 102 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 | /* * net/tipc/name_table.c: TIPC name table code * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/sock.h> #include <linux/list_sort.h> #include <linux/rbtree_augmented.h> #include "core.h" #include "netlink.h" #include "name_table.h" #include "name_distr.h" #include "subscr.h" #include "bcast.h" #include "addr.h" #include "node.h" #include "group.h" /** * struct service_range - container for all bindings of a service range * @lower: service range lower bound * @upper: service range upper bound * @tree_node: member of service range RB tree * @max: largest 'upper' in this node subtree * @local_publ: list of identical publications made from this node * Used by closest_first lookup and multicast lookup algorithm * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm */ struct service_range { u32 lower; u32 upper; struct rb_node tree_node; u32 max; struct list_head local_publ; struct list_head all_publ; }; /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type * @lock: spinlock controlling access to pertaining service ranges/publications * @rcu: RCU callback head used for deferred freeing */ struct tipc_service { u32 type; u32 publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; spinlock_t lock; /* Covers service range list */ struct rcu_head rcu; }; #define service_range_upper(sr) ((sr)->upper) RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, struct service_range, tree_node, u32, max, service_range_upper) #define service_range_entry(rbtree_node) \ (container_of(rbtree_node, struct service_range, tree_node)) #define service_range_overlap(sr, start, end) \ ((sr)->lower <= (end) && (sr)->upper >= (start)) /** * service_range_foreach_match - iterate over tipc service rbtree for each * range match * @sr: the service range pointer as a loop cursor * @sc: the pointer to tipc service which holds the service range rbtree * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching */ #define service_range_foreach_match(sr, sc, start, end) \ for (sr = service_range_match_first((sc)->ranges.rb_node, \ start, \ end); \ sr; \ sr = service_range_match_next(&(sr)->tree_node, \ start, \ end)) /** * service_range_match_first - find first service range matching a range * @n: the root node of service range rbtree for searching * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the leftmost service range node in the rbtree that overlaps the * specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_first(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *l, *r; /* Non overlaps in tree at all? */ if (!n || service_range_entry(n)->max < start) return NULL; while (n) { l = n->rb_left; if (l && service_range_entry(l)->max >= start) { /* A leftmost overlap range node must be one in the left * subtree. If not, it has lower > end, then nodes on * the right side cannot satisfy the condition either. */ n = l; continue; } /* No one in the left subtree can match, return if this node is * an overlap i.e. leftmost. */ sr = service_range_entry(n); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup on the right side */ r = n->rb_right; if (sr->lower <= end && r && service_range_entry(r)->max >= start) { n = r; continue; } break; } return NULL; } /** * service_range_match_next - find next service range matching a range * @n: a node in service range rbtree from which the searching starts * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the next service range node to the given node in the rbtree that * overlaps the specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_next(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *p, *r; while (n) { r = n->rb_right; if (r && service_range_entry(r)->max >= start) /* A next overlap range node must be one in the right * subtree. If not, it has lower > end, then any next * successor (- an ancestor) of this node cannot * satisfy the condition either. */ return service_range_match_first(r, start, end); /* No one in the right subtree can match, go up to find an * ancestor of this node which is parent of a left-hand child. */ while ((p = rb_parent(n)) && n == p->rb_right) n = p; if (!p) break; /* Return if this ancestor is an overlap */ sr = service_range_entry(p); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup more from this ancestor */ if (sr->lower <= end) { n = p; continue; } break; } return NULL; } static int hash(int x) { return x & (TIPC_NAMETBL_SIZE - 1); } /** * tipc_publ_create - create a publication structure * @ua: the service range the user is binding to * @sk: the address of the socket that is bound * @key: publication key */ static struct publication *tipc_publ_create(struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct publication *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return NULL; p->sr = ua->sr; p->sk = *sk; p->scope = ua->scope; p->key = key; INIT_LIST_HEAD(&p->binding_sock); INIT_LIST_HEAD(&p->binding_node); INIT_LIST_HEAD(&p->local_publ); INIT_LIST_HEAD(&p->all_publ); INIT_LIST_HEAD(&p->list); return p; } /** * tipc_service_create - create a service structure for the specified 'type' * @net: network namespace * @ua: address representing the service to be bound * * Allocates a single range structure and sets it to all 0's. */ static struct tipc_service *tipc_service_create(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct tipc_service *service; struct hlist_head *hd; service = kzalloc(sizeof(*service), GFP_ATOMIC); if (!service) { pr_warn("Service creation failed, no memory\n"); return NULL; } spin_lock_init(&service->lock); service->type = ua->sr.type; service->ranges = RB_ROOT; INIT_HLIST_NODE(&service->service_list); INIT_LIST_HEAD(&service->subscriptions); hd = &nt->services[hash(ua->sr.type)]; hlist_add_head_rcu(&service->service_list, hd); return service; } /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, struct tipc_uaddr *ua) { struct service_range *sr; service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { /* Look for exact match */ if (sr->lower == ua->sr.lower && sr->upper == ua->sr.upper) return sr; } return NULL; } static struct service_range *tipc_service_create_range(struct tipc_service *sc, struct publication *p) { struct rb_node **n, *parent = NULL; struct service_range *sr; u32 lower = p->sr.lower; u32 upper = p->sr.upper; n = &sc->ranges.rb_node; while (*n) { parent = *n; sr = service_range_entry(parent); if (lower == sr->lower && upper == sr->upper) return sr; if (sr->max < upper) sr->max = upper; if (lower <= sr->lower) n = &parent->rb_left; else n = &parent->rb_right; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; sr->upper = upper; sr->max = upper; INIT_LIST_HEAD(&sr->local_publ); INIT_LIST_HEAD(&sr->all_publ); rb_link_node(&sr->tree_node, parent, n); rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); return sr; } static bool tipc_service_insert_publ(struct net *net, struct tipc_service *sc, struct publication *p) { struct tipc_subscription *sub, *tmp; struct service_range *sr; struct publication *_p; u32 node = p->sk.node; bool first = false; bool res = false; u32 key = p->key; spin_lock_bh(&sc->lock); sr = tipc_service_create_range(sc, p); if (!sr) goto exit; first = list_empty(&sr->all_publ); /* Return if the publication already exists */ list_for_each_entry(_p, &sr->all_publ, all_publ) { if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) { pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n", p->sr.type, p->sr.lower, p->sr.upper, node, p->sk.ref, key); goto exit; } } if (in_own_node(net, p->sk.node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); p->id = sc->publ_cnt++; /* Any subscriptions waiting for notification? */ list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, first); } res = true; exit: if (!res) pr_warn("Failed to bind to %u,%u,%u\n", p->sr.type, p->sr.lower, p->sr.upper); spin_unlock_bh(&sc->lock); return res; } /** * tipc_service_remove_publ - remove a publication from a service * @r: service_range to remove publication from * @sk: address publishing socket * @key: target publication key */ static struct publication *tipc_service_remove_publ(struct service_range *r, struct tipc_socket_addr *sk, u32 key) { struct publication *p; u32 node = sk->node; list_for_each_entry(p, &r->all_publ, all_publ) { if (p->key != key || (node && node != p->sk.node)) continue; list_del(&p->all_publ); list_del(&p->local_publ); return p; } return NULL; } /* * Code reused: time_after32() for the same purpose */ #define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) static int tipc_publ_sort(void *priv, const struct list_head *a, const struct list_head *b) { struct publication *pa, *pb; pa = container_of(a, struct publication, list); pb = container_of(b, struct publication, list); return publication_after(pa, pb); } /** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range * @service: the tipc_service to attach the @sub to * @sub: the subscription to attach */ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct publication *p, *first, *tmp; struct list_head publ_list; struct service_range *sr; u32 filter, lower, upper; filter = sub->s.filter; lower = sub->s.seq.lower; upper = sub->s.seq.upper; tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); if (filter & TIPC_SUB_NO_STATUS) return; INIT_LIST_HEAD(&publ_list); service_range_foreach_match(sr, service, lower, upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) list_add_tail(&p->list, &publ_list); else if (!first || publication_after(first, p)) /* Pick this range's *first* publication */ first = p; } if (first) list_add_tail(&first->list, &publ_list); } /* Sort the publications before reporting */ list_sort(NULL, &publ_list, tipc_publ_sort); list_for_each_entry_safe(p, tmp, &publ_list, list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, true); list_del_init(&p->list); } } static struct tipc_service *tipc_service_find(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct hlist_head *service_head; struct tipc_service *service; service_head = &nt->services[hash(ua->sr.type)]; hlist_for_each_entry_rcu(service, service_head, service_list) { if (service->type == ua->sr.type) return service; } return NULL; }; struct publication *tipc_nametbl_insert_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_service *sc; struct publication *p; p = tipc_publ_create(ua, sk, key); if (!p) return NULL; sc = tipc_service_find(net, ua); if (!sc) sc = tipc_service_create(net, ua); if (sc && tipc_service_insert_publ(net, sc, p)) return p; kfree(p); return NULL; } struct publication *tipc_nametbl_remove_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_subscription *sub, *tmp; struct publication *p = NULL; struct service_range *sr; struct tipc_service *sc; bool last; sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); sr = tipc_service_find_range(sc, ua); if (!sr) goto unlock; p = tipc_service_remove_publ(sr, sk, key); if (!p) goto unlock; /* Notify any waiting subscriptions */ last = list_empty(&sr->all_publ); list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_WITHDRAWN, last); } /* Remove service range item if this was its last publication */ if (list_empty(&sr->all_publ)) { rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } unlock: spin_unlock_bh(&sc->lock); exit: if (!p) { pr_err("Failed to remove unknown binding: %u,%u,%u/%u:%u/%u\n", ua->sr.type, ua->sr.lower, ua->sr.upper, sk->node, sk->ref, key); } return p; } /** * tipc_nametbl_lookup_anycast - perform service instance to socket translation * @net: network namespace * @ua: service address to look up * @sk: address to socket we want to find * * On entry, a non-zero 'sk->node' indicates the node where we want lookup to be * performed, which may not be this one. * * On exit: * * - If lookup is deferred to another node, leave 'sk->node' unchanged and * return 'true'. * - If lookup is successful, set the 'sk->node' and 'sk->ref' (== portid) which * represent the bound socket and return 'true'. * - If lookup fails, return 'false' * * Note that for legacy users (node configured with Z.C.N address format) the * 'closest-first' lookup algorithm must be maintained, i.e., if sk.node is 0 * we must look in the local binding list first */ bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk) { struct tipc_net *tn = tipc_net(net); bool legacy = tn->legacy_addr_format; u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *r; struct tipc_service *sc; struct publication *p; struct list_head *l; bool res = false; if (!tipc_in_scope(legacy, sk->node, self)) return true; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(r, sc, inst, inst) { /* Select lookup algo: local, closest-first or round-robin */ if (sk->node == self) { l = &r->local_publ; if (list_empty(l)) continue; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else if (legacy && !sk->node && !list_empty(&r->local_publ)) { l = &r->local_publ; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else { l = &r->all_publ; p = list_first_entry(l, struct publication, all_publ); list_move_tail(&p->all_publ, &r->all_publ); } *sk = p->sk; res = true; /* Todo: as for legacy, pick the first matching range only, a * "true" round-robin will be performed as needed. */ break; } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return res; } /* tipc_nametbl_lookup_group(): lookup destinaton(s) in a communication group * Returns a list of one (== group anycast) or more (== group multicast) * destination socket/node pairs matching the given address. * The requester may or may not want to exclude himself from the list. */ bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, struct list_head *dsts, int *dstcnt, u32 exclude, bool mcast) { u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *sr; struct tipc_service *sc; struct publication *p; *dstcnt = 0; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); /* Todo: a full search i.e. service_range_foreach_match() instead? */ sr = service_range_match_first(sc->ranges.rb_node, inst, inst); if (!sr) goto no_match; list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; if (p->sk.ref == exclude && p->sk.node == self) continue; tipc_dest_push(dsts, p->sk.node, p->sk.ref); (*dstcnt)++; if (mcast) continue; list_move_tail(&p->all_publ, &sr->all_publ); break; } no_match: spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return !list_empty(dsts); } /* tipc_nametbl_lookup_mcast_sockets(): look up node local destinaton sockets * matching the given address * Used on nodes which have received a multicast/broadcast message * Returns a list of local sockets */ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; u8 scope = ua->scope; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (scope == p->scope || scope == TIPC_ANY_SCOPE) tipc_dest_push(dports, 0, p->sk.ref); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_lookup_mcast_nodes(): look up all destination nodes matching * the given address. Used in sending node. * Used on nodes which are sending out a multicast/broadcast message * Returns a list of nodes, including own node if applicable */ void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, struct tipc_nlist *nodes) { struct service_range *sr; struct tipc_service *sc; struct publication *p; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { tipc_nlist_add(nodes, p->sk.node); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, struct tipc_uaddr *ua) { struct service_range *sr; struct tipc_service *sc; struct publication *p; struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; tipc_group_add_member(grp, p->sk.node, p->sk.ref, p->sr.lower); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_publish - add service binding to name table */ struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *p = NULL; struct sk_buff *skb = NULL; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); if (nt->local_publ_count >= TIPC_MAX_PUBL) { pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL); goto exit; } p = tipc_nametbl_insert_publ(net, ua, sk, key); if (p) { nt->local_publ_count++; skb = tipc_named_publish(net, p); } rc_dests = nt->rc_dests; exit: spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); return p; } /** * tipc_nametbl_withdraw - withdraw a service binding * @net: network namespace * @ua: service address/range being unbound * @sk: address of the socket being unbound from * @key: target publication key */ void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct sk_buff *skb = NULL; struct publication *p; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); p = tipc_nametbl_remove_publ(net, ua, sk, key); if (p) { nt->local_publ_count--; skb = tipc_named_withdraw(net, p); list_del_init(&p->binding_sock); kfree_rcu(p, rcu); } rc_dests = nt->rc_dests; spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); } /** * tipc_nametbl_subscribe - add a subscription object to the name table * @sub: subscription to add */ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); u32 type = sub->s.seq.type; struct tipc_service *sc; struct tipc_uaddr ua; bool res = true; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) sc = tipc_service_create(sub->net, &ua); if (sc) { spin_lock_bh(&sc->lock); tipc_service_subscribe(sc, sub); spin_unlock_bh(&sc->lock); } else { pr_warn("Failed to subscribe for {%u,%u,%u}\n", type, sub->s.seq.lower, sub->s.seq.upper); res = false; } spin_unlock_bh(&tn->nametbl_lock); return res; } /** * tipc_nametbl_unsubscribe - remove a subscription object from name table * @sub: subscription to remove */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); struct tipc_service *sc; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, sub->s.seq.type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); list_del_init(&sub->service_list); tipc_sub_put(sub); /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } spin_unlock_bh(&sc->lock); exit: spin_unlock_bh(&tn->nametbl_lock); } int tipc_nametbl_init(struct net *net) { struct tipc_net *tn = tipc_net(net); struct name_table *nt; int i; nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return -ENOMEM; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) INIT_HLIST_HEAD(&nt->services[i]); INIT_LIST_HEAD(&nt->node_scope); INIT_LIST_HEAD(&nt->cluster_scope); rwlock_init(&nt->cluster_scope_lock); tn->nametbl = nt; spin_lock_init(&tn->nametbl_lock); return 0; } /** * tipc_service_delete - purge all publications for a service and delete it * @net: the associated network namespace * @sc: tipc_service to delete */ static void tipc_service_delete(struct net *net, struct tipc_service *sc) { struct service_range *sr, *tmpr; struct publication *p, *tmp; spin_lock_bh(&sc->lock); rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) { tipc_service_remove_publ(sr, &p->sk, p->key); kfree_rcu(p, rcu); } rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } hlist_del_init_rcu(&sc->service_list); spin_unlock_bh(&sc->lock); kfree_rcu(sc, rcu); } void tipc_nametbl_stop(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct hlist_head *service_head; struct tipc_service *service; u32 i; /* Verify name table is empty and purge any lingering * publications, then release the name table */ spin_lock_bh(&tn->nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { if (hlist_empty(&nt->services[i])) continue; service_head = &nt->services[i]; hlist_for_each_entry_rcu(service, service_head, service_list) { tipc_service_delete(net, service); } } spin_unlock_bh(&tn->nametbl_lock); synchronize_net(); kfree(nt); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, struct tipc_service *service, struct service_range *sr, u32 *last_key) { struct publication *p; struct nlattr *attrs; struct nlattr *b; void *hdr; if (*last_key) { list_for_each_entry(p, &sr->all_publ, all_publ) if (p->key == *last_key) break; if (list_entry_is_head(p, &sr->all_publ, all_publ)) return -EPIPE; } else { p = list_first_entry(&sr->all_publ, struct publication, all_publ); } list_for_each_entry_from(p, &sr->all_publ, all_publ) { *last_key = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NAME_TABLE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE); if (!attrs) goto msg_full; b = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); if (!b) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->sk.node)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->sk.ref)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; nla_nest_end(msg->skb, b); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); } *last_key = 0; return 0; publ_msg_full: nla_nest_cancel(msg->skb, b); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg, struct tipc_service *sc, u32 *last_lower, u32 *last_key) { struct service_range *sr; struct rb_node *n; int err; for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower < *last_lower) continue; err = __tipc_nl_add_nametable_publ(msg, sc, sr, last_key); if (err) { *last_lower = sr->lower; return err; } } *last_lower = 0; return 0; } static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, u32 *last_type, u32 *last_lower, u32 *last_key) { struct tipc_net *tn = tipc_net(net); struct tipc_service *service = NULL; struct hlist_head *head; struct tipc_uaddr ua; int err; int i; if (*last_type) i = hash(*last_type); else i = 0; for (; i < TIPC_NAMETBL_SIZE; i++) { head = &tn->nametbl->services[i]; if (*last_type || (!i && *last_key && (*last_lower == *last_key))) { tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, *last_type, *last_lower, *last_lower); service = tipc_service_find(net, &ua); if (!service) return -EPIPE; } else { hlist_for_each_entry_rcu(service, head, service_list) break; if (!service) continue; } hlist_for_each_entry_from_rcu(service, service_list) { spin_lock_bh(&service->lock); err = __tipc_nl_service_range_list(msg, service, last_lower, last_key); if (err) { *last_type = service->type; spin_unlock_bh(&service->lock); return err; } spin_unlock_bh(&service->lock); } *last_type = 0; } return 0; } int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 last_type = cb->args[0]; u32 last_lower = cb->args[1]; u32 last_key = cb->args[2]; int done = cb->args[3]; struct tipc_nl_msg msg; int err; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); err = tipc_nl_service_list(net, &msg, &last_type, &last_lower, &last_key); if (!err) { done = 1; } else if (err != -EMSGSIZE) { /* We never set seq or call nl_dump_check_consistent() this * means that setting prev_seq here will cause the consistence * check to fail in the netlink callback handler. Resulting in * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if * we got an error. */ cb->prev_seq = 1; } rcu_read_unlock(); cb->args[0] = last_type; cb->args[1] = last_lower; cb->args[2] = last_key; cb->args[3] = done; return skb->len; } struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; list_for_each_entry(dst, l, list) { if (dst->node == node && dst->port == port) return dst; } return NULL; } bool tipc_dest_push(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; if (tipc_dest_find(l, node, port)) return false; dst = kmalloc(sizeof(*dst), GFP_ATOMIC); if (unlikely(!dst)) return false; dst->node = node; dst->port = port; list_add(&dst->list, l); return true; } bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port) { struct tipc_dest *dst; if (list_empty(l)) return false; dst = list_first_entry(l, typeof(*dst), list); if (port) *port = dst->port; if (node) *node = dst->node; list_del(&dst->list); kfree(dst); return true; } bool tipc_dest_del(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; dst = tipc_dest_find(l, node, port); if (!dst) return false; list_del(&dst->list); kfree(dst); return true; } void tipc_dest_list_purge(struct list_head *l) { struct tipc_dest *dst, *tmp; list_for_each_entry_safe(dst, tmp, l, list) { list_del(&dst->list); kfree(dst); } } |
5 5 56 56 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 | /* * Copyright (c) 2006-2008 Intel Corporation * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> * * DRM core CRTC related functions * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. * * Authors: * Keith Packard * Eric Anholt <eric@anholt.net> * Dave Airlie <airlied@linux.ie> * Jesse Barnes <jesse.barnes@intel.com> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/dynamic_debug.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_atomic_uapi.h> #include <drm/drm_bridge.h> #include <drm/drm_crtc.h> #include <drm/drm_crtc_helper.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_fourcc.h> #include <drm/drm_framebuffer.h> #include <drm/drm_print.h> #include <drm/drm_vblank.h> #include "drm_crtc_helper_internal.h" DECLARE_DYNDBG_CLASSMAP(drm_debug_classes, DD_CLASS_TYPE_DISJOINT_BITS, 0, "DRM_UT_CORE", "DRM_UT_DRIVER", "DRM_UT_KMS", "DRM_UT_PRIME", "DRM_UT_ATOMIC", "DRM_UT_VBL", "DRM_UT_STATE", "DRM_UT_LEASE", "DRM_UT_DP", "DRM_UT_DRMRES"); /** * DOC: overview * * The CRTC modeset helper library provides a default set_config implementation * in drm_crtc_helper_set_config(). Plus a few other convenience functions using * the same callbacks which drivers can use to e.g. restore the modeset * configuration on resume with drm_helper_resume_force_mode(). * * Note that this helper library doesn't track the current power state of CRTCs * and encoders. It can call callbacks like &drm_encoder_helper_funcs.dpms even * though the hardware is already in the desired state. This deficiency has been * fixed in the atomic helpers. * * The driver callbacks are mostly compatible with the atomic modeset helpers, * except for the handling of the primary plane: Atomic helpers require that the * primary plane is implemented as a real standalone plane and not directly tied * to the CRTC state. For easier transition this library provides functions to * implement the old semantics required by the CRTC helpers using the new plane * and atomic helper callbacks. * * Drivers are strongly urged to convert to the atomic helpers (by way of first * converting to the plane helpers). New drivers must not use these functions * but need to implement the atomic interface instead, potentially using the * atomic helpers for that. * * These legacy modeset helpers use the same function table structures as * all other modesetting helpers. See the documentation for struct * &drm_crtc_helper_funcs, &struct drm_encoder_helper_funcs and struct * &drm_connector_helper_funcs. */ /** * drm_helper_encoder_in_use - check if a given encoder is in use * @encoder: encoder to check * * Checks whether @encoder is with the current mode setting output configuration * in use by any connector. This doesn't mean that it is actually enabled since * the DPMS state is tracked separately. * * Returns: * True if @encoder is used, false otherwise. */ bool drm_helper_encoder_in_use(struct drm_encoder *encoder) { struct drm_connector *connector; struct drm_connector_list_iter conn_iter; struct drm_device *dev = encoder->dev; drm_WARN_ON(dev, drm_drv_uses_atomic_modeset(dev)); /* * We can expect this mutex to be locked if we are not panicking. * Locking is currently fubar in the panic handler. */ if (!oops_in_progress) { drm_WARN_ON(dev, !mutex_is_locked(&dev->mode_config.mutex)); drm_WARN_ON(dev, !drm_modeset_is_locked(&dev->mode_config.connection_mutex)); } drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (connector->encoder == encoder) { drm_connector_list_iter_end(&conn_iter); return true; } } drm_connector_list_iter_end(&conn_iter); return false; } EXPORT_SYMBOL(drm_helper_encoder_in_use); /** * drm_helper_crtc_in_use - check if a given CRTC is in a mode_config * @crtc: CRTC to check * * Checks whether @crtc is with the current mode setting output configuration * in use by any connector. This doesn't mean that it is actually enabled since * the DPMS state is tracked separately. * * Returns: * True if @crtc is used, false otherwise. */ bool drm_helper_crtc_in_use(struct drm_crtc *crtc) { struct drm_encoder *encoder; struct drm_device *dev = crtc->dev; drm_WARN_ON(dev, drm_drv_uses_atomic_modeset(dev)); /* * We can expect this mutex to be locked if we are not panicking. * Locking is currently fubar in the panic handler. */ if (!oops_in_progress) drm_WARN_ON(dev, !mutex_is_locked(&dev->mode_config.mutex)); drm_for_each_encoder(encoder, dev) if (encoder->crtc == crtc && drm_helper_encoder_in_use(encoder)) return true; return false; } EXPORT_SYMBOL(drm_helper_crtc_in_use); static void drm_encoder_disable(struct drm_encoder *encoder) { const struct drm_encoder_helper_funcs *encoder_funcs = encoder->helper_private; if (!encoder_funcs) return; if (encoder_funcs->disable) (*encoder_funcs->disable)(encoder); else if (encoder_funcs->dpms) (*encoder_funcs->dpms)(encoder, DRM_MODE_DPMS_OFF); } static void __drm_helper_disable_unused_functions(struct drm_device *dev) { struct drm_encoder *encoder; struct drm_crtc *crtc; drm_warn_on_modeset_not_all_locked(dev); drm_for_each_encoder(encoder, dev) { if (!drm_helper_encoder_in_use(encoder)) { drm_encoder_disable(encoder); /* disconnect encoder from any connector */ encoder->crtc = NULL; } } drm_for_each_crtc(crtc, dev) { const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; crtc->enabled = drm_helper_crtc_in_use(crtc); if (!crtc->enabled) { if (crtc_funcs->disable) (*crtc_funcs->disable)(crtc); else (*crtc_funcs->dpms)(crtc, DRM_MODE_DPMS_OFF); crtc->primary->fb = NULL; } } } /** * drm_helper_disable_unused_functions - disable unused objects * @dev: DRM device * * This function walks through the entire mode setting configuration of @dev. It * will remove any CRTC links of unused encoders and encoder links of * disconnected connectors. Then it will disable all unused encoders and CRTCs * either by calling their disable callback if available or by calling their * dpms callback with DRM_MODE_DPMS_OFF. * * NOTE: * * This function is part of the legacy modeset helper library and will cause * major confusion with atomic drivers. This is because atomic helpers guarantee * to never call ->disable() hooks on a disabled function, or ->enable() hooks * on an enabled functions. drm_helper_disable_unused_functions() on the other * hand throws such guarantees into the wind and calls disable hooks * unconditionally on unused functions. */ void drm_helper_disable_unused_functions(struct drm_device *dev) { drm_WARN_ON(dev, drm_drv_uses_atomic_modeset(dev)); drm_modeset_lock_all(dev); __drm_helper_disable_unused_functions(dev); drm_modeset_unlock_all(dev); } EXPORT_SYMBOL(drm_helper_disable_unused_functions); /* * Check the CRTC we're going to map each output to vs. its current * CRTC. If they don't match, we have to disable the output and the CRTC * since the driver will have to re-route things. */ static void drm_crtc_prepare_encoders(struct drm_device *dev) { const struct drm_encoder_helper_funcs *encoder_funcs; struct drm_encoder *encoder; drm_for_each_encoder(encoder, dev) { encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; /* Disable unused encoders */ if (encoder->crtc == NULL) drm_encoder_disable(encoder); } } /** * drm_crtc_helper_set_mode - internal helper to set a mode * @crtc: CRTC to program * @mode: mode to use * @x: horizontal offset into the surface * @y: vertical offset into the surface * @old_fb: old framebuffer, for cleanup * * Try to set @mode on @crtc. Give @crtc and its associated connectors a chance * to fixup or reject the mode prior to trying to set it. This is an internal * helper that drivers could e.g. use to update properties that require the * entire output pipe to be disabled and re-enabled in a new configuration. For * example for changing whether audio is enabled on a hdmi link or for changing * panel fitter or dither attributes. It is also called by the * drm_crtc_helper_set_config() helper function to drive the mode setting * sequence. * * Returns: * True if the mode was set successfully, false otherwise. */ bool drm_crtc_helper_set_mode(struct drm_crtc *crtc, struct drm_display_mode *mode, int x, int y, struct drm_framebuffer *old_fb) { struct drm_device *dev = crtc->dev; struct drm_display_mode *adjusted_mode, saved_mode, saved_hwmode; const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; const struct drm_encoder_helper_funcs *encoder_funcs; int saved_x, saved_y; bool saved_enabled; struct drm_encoder *encoder; bool ret = true; drm_WARN_ON(dev, drm_drv_uses_atomic_modeset(dev)); drm_warn_on_modeset_not_all_locked(dev); saved_enabled = crtc->enabled; crtc->enabled = drm_helper_crtc_in_use(crtc); if (!crtc->enabled) return true; adjusted_mode = drm_mode_duplicate(dev, mode); if (!adjusted_mode) { crtc->enabled = saved_enabled; return false; } drm_mode_init(&saved_mode, &crtc->mode); drm_mode_init(&saved_hwmode, &crtc->hwmode); saved_x = crtc->x; saved_y = crtc->y; /* Update crtc values up front so the driver can rely on them for mode * setting. */ drm_mode_copy(&crtc->mode, mode); crtc->x = x; crtc->y = y; /* Pass our mode to the connectors and the CRTC to give them a chance to * adjust it according to limitations or connector properties, and also * a chance to reject the mode entirely. */ drm_for_each_encoder(encoder, dev) { if (encoder->crtc != crtc) continue; encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; encoder_funcs = encoder->helper_private; if (encoder_funcs->mode_fixup) { if (!(ret = encoder_funcs->mode_fixup(encoder, mode, adjusted_mode))) { drm_dbg_kms(dev, "[ENCODER:%d:%s] mode fixup failed\n", encoder->base.id, encoder->name); goto done; } } } if (crtc_funcs->mode_fixup) { if (!(ret = crtc_funcs->mode_fixup(crtc, mode, adjusted_mode))) { drm_dbg_kms(dev, "[CRTC:%d:%s] mode fixup failed\n", crtc->base.id, crtc->name); goto done; } } drm_dbg_kms(dev, "[CRTC:%d:%s]\n", crtc->base.id, crtc->name); drm_mode_copy(&crtc->hwmode, adjusted_mode); /* Prepare the encoders and CRTCs before setting the mode. */ drm_for_each_encoder(encoder, dev) { if (encoder->crtc != crtc) continue; encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; /* Disable the encoders as the first thing we do. */ if (encoder_funcs->prepare) encoder_funcs->prepare(encoder); } drm_crtc_prepare_encoders(dev); crtc_funcs->prepare(crtc); /* Set up the DPLL and any encoders state that needs to adjust or depend * on the DPLL. */ ret = !crtc_funcs->mode_set(crtc, mode, adjusted_mode, x, y, old_fb); if (!ret) goto done; drm_for_each_encoder(encoder, dev) { if (encoder->crtc != crtc) continue; encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; drm_dbg_kms(dev, "[ENCODER:%d:%s] set [MODE:%s]\n", encoder->base.id, encoder->name, mode->name); if (encoder_funcs->mode_set) encoder_funcs->mode_set(encoder, mode, adjusted_mode); } /* Now enable the clocks, plane, pipe, and connectors that we set up. */ crtc_funcs->commit(crtc); drm_for_each_encoder(encoder, dev) { if (encoder->crtc != crtc) continue; encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; if (encoder_funcs->commit) encoder_funcs->commit(encoder); } /* Calculate and store various constants which * are later needed by vblank and swap-completion * timestamping. They are derived from true hwmode. */ drm_calc_timestamping_constants(crtc, &crtc->hwmode); /* FIXME: add subpixel order */ done: drm_mode_destroy(dev, adjusted_mode); if (!ret) { crtc->enabled = saved_enabled; drm_mode_copy(&crtc->mode, &saved_mode); drm_mode_copy(&crtc->hwmode, &saved_hwmode); crtc->x = saved_x; crtc->y = saved_y; } return ret; } EXPORT_SYMBOL(drm_crtc_helper_set_mode); /** * drm_crtc_helper_atomic_check() - Helper to check CRTC atomic-state * @crtc: CRTC to check * @state: atomic state object * * Provides a default CRTC-state check handler for CRTCs that only have * one primary plane attached to it. This is often the case for the CRTC * of simple framebuffers. * * RETURNS: * Zero on success, or an errno code otherwise. */ int drm_crtc_helper_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *state) { struct drm_crtc_state *new_crtc_state = drm_atomic_get_new_crtc_state(state, crtc); if (!new_crtc_state->enable) return 0; return drm_atomic_helper_check_crtc_primary_plane(new_crtc_state); } EXPORT_SYMBOL(drm_crtc_helper_atomic_check); static void drm_crtc_helper_disable(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; struct drm_connector *connector; struct drm_encoder *encoder; /* Decouple all encoders and their attached connectors from this crtc */ drm_for_each_encoder(encoder, dev) { struct drm_connector_list_iter conn_iter; if (encoder->crtc != crtc) continue; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (connector->encoder != encoder) continue; connector->encoder = NULL; /* * drm_helper_disable_unused_functions() ought to be * doing this, but since we've decoupled the encoder * from the connector above, the required connection * between them is henceforth no longer available. */ connector->dpms = DRM_MODE_DPMS_OFF; /* we keep a reference while the encoder is bound */ drm_connector_put(connector); } drm_connector_list_iter_end(&conn_iter); } __drm_helper_disable_unused_functions(dev); } /* * For connectors that support multiple encoders, either the * .atomic_best_encoder() or .best_encoder() operation must be implemented. */ struct drm_encoder * drm_connector_get_single_encoder(struct drm_connector *connector) { struct drm_encoder *encoder; drm_WARN_ON(connector->dev, hweight32(connector->possible_encoders) > 1); drm_connector_for_each_possible_encoder(connector, encoder) return encoder; return NULL; } /** * drm_crtc_helper_set_config - set a new config from userspace * @set: mode set configuration * @ctx: lock acquire context, not used here * * The drm_crtc_helper_set_config() helper function implements the of * &drm_crtc_funcs.set_config callback for drivers using the legacy CRTC * helpers. * * It first tries to locate the best encoder for each connector by calling the * connector @drm_connector_helper_funcs.best_encoder helper operation. * * After locating the appropriate encoders, the helper function will call the * mode_fixup encoder and CRTC helper operations to adjust the requested mode, * or reject it completely in which case an error will be returned to the * application. If the new configuration after mode adjustment is identical to * the current configuration the helper function will return without performing * any other operation. * * If the adjusted mode is identical to the current mode but changes to the * frame buffer need to be applied, the drm_crtc_helper_set_config() function * will call the CRTC &drm_crtc_helper_funcs.mode_set_base helper operation. * * If the adjusted mode differs from the current mode, or if the * ->mode_set_base() helper operation is not provided, the helper function * performs a full mode set sequence by calling the ->prepare(), ->mode_set() * and ->commit() CRTC and encoder helper operations, in that order. * Alternatively it can also use the dpms and disable helper operations. For * details see &struct drm_crtc_helper_funcs and struct * &drm_encoder_helper_funcs. * * This function is deprecated. New drivers must implement atomic modeset * support, for which this function is unsuitable. Instead drivers should use * drm_atomic_helper_set_config(). * * Returns: * Returns 0 on success, negative errno numbers on failure. */ int drm_crtc_helper_set_config(struct drm_mode_set *set, struct drm_modeset_acquire_ctx *ctx) { struct drm_device *dev; struct drm_crtc **save_encoder_crtcs, *new_crtc; struct drm_encoder **save_connector_encoders, *new_encoder, *encoder; bool mode_changed = false; /* if true do a full mode set */ bool fb_changed = false; /* if true and !mode_changed just do a flip */ struct drm_connector *connector; struct drm_connector_list_iter conn_iter; int count = 0, ro, fail = 0; const struct drm_crtc_helper_funcs *crtc_funcs; struct drm_mode_set save_set; int ret; int i; BUG_ON(!set); BUG_ON(!set->crtc); BUG_ON(!set->crtc->helper_private); /* Enforce sane interface api - has been abused by the fb helper. */ BUG_ON(!set->mode && set->fb); BUG_ON(set->fb && set->num_connectors == 0); crtc_funcs = set->crtc->helper_private; dev = set->crtc->dev; drm_dbg_kms(dev, "\n"); drm_WARN_ON(dev, drm_drv_uses_atomic_modeset(dev)); if (!set->mode) set->fb = NULL; if (set->fb) { drm_dbg_kms(dev, "[CRTC:%d:%s] [FB:%d] #connectors=%d (x y) (%i %i)\n", set->crtc->base.id, set->crtc->name, set->fb->base.id, (int)set->num_connectors, set->x, set->y); } else { drm_dbg_kms(dev, "[CRTC:%d:%s] [NOFB]\n", set->crtc->base.id, set->crtc->name); drm_crtc_helper_disable(set->crtc); return 0; } drm_warn_on_modeset_not_all_locked(dev); /* * Allocate space for the backup of all (non-pointer) encoder and * connector data. */ save_encoder_crtcs = kcalloc(dev->mode_config.num_encoder, sizeof(struct drm_crtc *), GFP_KERNEL); if (!save_encoder_crtcs) return -ENOMEM; save_connector_encoders = kcalloc(dev->mode_config.num_connector, sizeof(struct drm_encoder *), GFP_KERNEL); if (!save_connector_encoders) { kfree(save_encoder_crtcs); return -ENOMEM; } /* * Copy data. Note that driver private data is not affected. * Should anything bad happen only the expected state is * restored, not the drivers personal bookkeeping. */ count = 0; drm_for_each_encoder(encoder, dev) { save_encoder_crtcs[count++] = encoder->crtc; } count = 0; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) save_connector_encoders[count++] = connector->encoder; drm_connector_list_iter_end(&conn_iter); save_set.crtc = set->crtc; save_set.mode = &set->crtc->mode; save_set.x = set->crtc->x; save_set.y = set->crtc->y; save_set.fb = set->crtc->primary->fb; /* We should be able to check here if the fb has the same properties * and then just flip_or_move it */ if (set->crtc->primary->fb != set->fb) { /* If we have no fb then treat it as a full mode set */ if (set->crtc->primary->fb == NULL) { drm_dbg_kms(dev, "[CRTC:%d:%s] no fb, full mode set\n", set->crtc->base.id, set->crtc->name); mode_changed = true; } else if (set->fb->format != set->crtc->primary->fb->format) { mode_changed = true; } else fb_changed = true; } if (set->x != set->crtc->x || set->y != set->crtc->y) fb_changed = true; if (!drm_mode_equal(set->mode, &set->crtc->mode)) { drm_dbg_kms(dev, "[CRTC:%d:%s] modes are different, full mode set:\n", set->crtc->base.id, set->crtc->name); drm_dbg_kms(dev, DRM_MODE_FMT "\n", DRM_MODE_ARG(&set->crtc->mode)); drm_dbg_kms(dev, DRM_MODE_FMT "\n", DRM_MODE_ARG(set->mode)); mode_changed = true; } /* take a reference on all unbound connectors in set, reuse the * already taken reference for bound connectors */ for (ro = 0; ro < set->num_connectors; ro++) { if (set->connectors[ro]->encoder) continue; drm_connector_get(set->connectors[ro]); } /* a) traverse passed in connector list and get encoders for them */ count = 0; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { const struct drm_connector_helper_funcs *connector_funcs = connector->helper_private; new_encoder = connector->encoder; for (ro = 0; ro < set->num_connectors; ro++) { if (set->connectors[ro] == connector) { if (connector_funcs->best_encoder) new_encoder = connector_funcs->best_encoder(connector); else new_encoder = drm_connector_get_single_encoder(connector); /* if we can't get an encoder for a connector we are setting now - then fail */ if (new_encoder == NULL) /* don't break so fail path works correct */ fail = 1; if (connector->dpms != DRM_MODE_DPMS_ON) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] DPMS not on, full mode switch\n", connector->base.id, connector->name); mode_changed = true; } break; } } if (new_encoder != connector->encoder) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] encoder changed, full mode switch\n", connector->base.id, connector->name); mode_changed = true; /* If the encoder is reused for another connector, then * the appropriate crtc will be set later. */ if (connector->encoder) connector->encoder->crtc = NULL; connector->encoder = new_encoder; } } drm_connector_list_iter_end(&conn_iter); if (fail) { ret = -EINVAL; goto fail; } count = 0; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (!connector->encoder) continue; if (connector->encoder->crtc == set->crtc) new_crtc = NULL; else new_crtc = connector->encoder->crtc; for (ro = 0; ro < set->num_connectors; ro++) { if (set->connectors[ro] == connector) new_crtc = set->crtc; } /* Make sure the new CRTC will work with the encoder */ if (new_crtc && !drm_encoder_crtc_ok(connector->encoder, new_crtc)) { ret = -EINVAL; drm_connector_list_iter_end(&conn_iter); goto fail; } if (new_crtc != connector->encoder->crtc) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] CRTC changed, full mode switch\n", connector->base.id, connector->name); mode_changed = true; connector->encoder->crtc = new_crtc; } if (new_crtc) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] to [CRTC:%d:%s]\n", connector->base.id, connector->name, new_crtc->base.id, new_crtc->name); } else { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] to [NOCRTC]\n", connector->base.id, connector->name); } } drm_connector_list_iter_end(&conn_iter); /* mode_set_base is not a required function */ if (fb_changed && !crtc_funcs->mode_set_base) mode_changed = true; if (mode_changed) { if (drm_helper_crtc_in_use(set->crtc)) { drm_dbg_kms(dev, "[CRTC:%d:%s] attempting to set mode from userspace: " DRM_MODE_FMT "\n", set->crtc->base.id, set->crtc->name, DRM_MODE_ARG(set->mode)); set->crtc->primary->fb = set->fb; if (!drm_crtc_helper_set_mode(set->crtc, set->mode, set->x, set->y, save_set.fb)) { drm_err(dev, "[CRTC:%d:%s] failed to set mode\n", set->crtc->base.id, set->crtc->name); set->crtc->primary->fb = save_set.fb; ret = -EINVAL; goto fail; } drm_dbg_kms(dev, "[CRTC:%d:%s] Setting connector DPMS state to on\n", set->crtc->base.id, set->crtc->name); for (i = 0; i < set->num_connectors; i++) { drm_dbg_kms(dev, "\t[CONNECTOR:%d:%s] set DPMS on\n", set->connectors[i]->base.id, set->connectors[i]->name); set->connectors[i]->funcs->dpms(set->connectors[i], DRM_MODE_DPMS_ON); } } __drm_helper_disable_unused_functions(dev); } else if (fb_changed) { set->crtc->x = set->x; set->crtc->y = set->y; set->crtc->primary->fb = set->fb; ret = crtc_funcs->mode_set_base(set->crtc, set->x, set->y, save_set.fb); if (ret != 0) { set->crtc->x = save_set.x; set->crtc->y = save_set.y; set->crtc->primary->fb = save_set.fb; goto fail; } } kfree(save_connector_encoders); kfree(save_encoder_crtcs); return 0; fail: /* Restore all previous data. */ count = 0; drm_for_each_encoder(encoder, dev) { encoder->crtc = save_encoder_crtcs[count++]; } count = 0; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) connector->encoder = save_connector_encoders[count++]; drm_connector_list_iter_end(&conn_iter); /* after fail drop reference on all unbound connectors in set, let * bound connectors keep their reference */ for (ro = 0; ro < set->num_connectors; ro++) { if (set->connectors[ro]->encoder) continue; drm_connector_put(set->connectors[ro]); } /* Try to restore the config */ if (mode_changed && !drm_crtc_helper_set_mode(save_set.crtc, save_set.mode, save_set.x, save_set.y, save_set.fb)) drm_err(dev, "failed to restore config after modeset failure\n"); kfree(save_connector_encoders); kfree(save_encoder_crtcs); return ret; } EXPORT_SYMBOL(drm_crtc_helper_set_config); static int drm_helper_choose_encoder_dpms(struct drm_encoder *encoder) { int dpms = DRM_MODE_DPMS_OFF; struct drm_connector *connector; struct drm_connector_list_iter conn_iter; struct drm_device *dev = encoder->dev; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) if (connector->encoder == encoder) if (connector->dpms < dpms) dpms = connector->dpms; drm_connector_list_iter_end(&conn_iter); return dpms; } /* Helper which handles bridge ordering around encoder dpms */ static void drm_helper_encoder_dpms(struct drm_encoder *encoder, int mode) { const struct drm_encoder_helper_funcs *encoder_funcs; encoder_funcs = encoder->helper_private; if (!encoder_funcs) return; if (encoder_funcs->dpms) encoder_funcs->dpms(encoder, mode); } static int drm_helper_choose_crtc_dpms(struct drm_crtc *crtc) { int dpms = DRM_MODE_DPMS_OFF; struct drm_connector *connector; struct drm_connector_list_iter conn_iter; struct drm_device *dev = crtc->dev; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) if (connector->encoder && connector->encoder->crtc == crtc) if (connector->dpms < dpms) dpms = connector->dpms; drm_connector_list_iter_end(&conn_iter); return dpms; } /** * drm_helper_connector_dpms() - connector dpms helper implementation * @connector: affected connector * @mode: DPMS mode * * The drm_helper_connector_dpms() helper function implements the * &drm_connector_funcs.dpms callback for drivers using the legacy CRTC * helpers. * * This is the main helper function provided by the CRTC helper framework for * implementing the DPMS connector attribute. It computes the new desired DPMS * state for all encoders and CRTCs in the output mesh and calls the * &drm_crtc_helper_funcs.dpms and &drm_encoder_helper_funcs.dpms callbacks * provided by the driver. * * This function is deprecated. New drivers must implement atomic modeset * support, where DPMS is handled in the DRM core. * * Returns: * Always returns 0. */ int drm_helper_connector_dpms(struct drm_connector *connector, int mode) { struct drm_encoder *encoder = connector->encoder; struct drm_crtc *crtc = encoder ? encoder->crtc : NULL; int old_dpms, encoder_dpms = DRM_MODE_DPMS_OFF; drm_WARN_ON(connector->dev, drm_drv_uses_atomic_modeset(connector->dev)); if (mode == connector->dpms) return 0; old_dpms = connector->dpms; connector->dpms = mode; if (encoder) encoder_dpms = drm_helper_choose_encoder_dpms(encoder); /* from off to on, do crtc then encoder */ if (mode < old_dpms) { if (crtc) { const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; if (crtc_funcs->dpms) (*crtc_funcs->dpms) (crtc, drm_helper_choose_crtc_dpms(crtc)); } if (encoder) drm_helper_encoder_dpms(encoder, encoder_dpms); } /* from on to off, do encoder then crtc */ if (mode > old_dpms) { if (encoder) drm_helper_encoder_dpms(encoder, encoder_dpms); if (crtc) { const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; if (crtc_funcs->dpms) (*crtc_funcs->dpms) (crtc, drm_helper_choose_crtc_dpms(crtc)); } } return 0; } EXPORT_SYMBOL(drm_helper_connector_dpms); /** * drm_helper_resume_force_mode - force-restore mode setting configuration * @dev: drm_device which should be restored * * Drivers which use the mode setting helpers can use this function to * force-restore the mode setting configuration e.g. on resume or when something * else might have trampled over the hw state (like some overzealous old BIOSen * tended to do). * * This helper doesn't provide a error return value since restoring the old * config should never fail due to resource allocation issues since the driver * has successfully set the restored configuration already. Hence this should * boil down to the equivalent of a few dpms on calls, which also don't provide * an error code. * * Drivers where simply restoring an old configuration again might fail (e.g. * due to slight differences in allocating shared resources when the * configuration is restored in a different order than when userspace set it up) * need to use their own restore logic. * * This function is deprecated. New drivers should implement atomic mode- * setting and use the atomic suspend/resume helpers. * * See also: * drm_atomic_helper_suspend(), drm_atomic_helper_resume() */ void drm_helper_resume_force_mode(struct drm_device *dev) { struct drm_crtc *crtc; struct drm_encoder *encoder; const struct drm_crtc_helper_funcs *crtc_funcs; int encoder_dpms; bool ret; drm_WARN_ON(dev, drm_drv_uses_atomic_modeset(dev)); drm_modeset_lock_all(dev); drm_for_each_crtc(crtc, dev) { if (!crtc->enabled) continue; ret = drm_crtc_helper_set_mode(crtc, &crtc->mode, crtc->x, crtc->y, crtc->primary->fb); /* Restoring the old config should never fail! */ if (ret == false) drm_err(dev, "failed to set mode on crtc %p\n", crtc); /* Turn off outputs that were already powered off */ if (drm_helper_choose_crtc_dpms(crtc)) { drm_for_each_encoder(encoder, dev) { if(encoder->crtc != crtc) continue; encoder_dpms = drm_helper_choose_encoder_dpms( encoder); drm_helper_encoder_dpms(encoder, encoder_dpms); } crtc_funcs = crtc->helper_private; if (crtc_funcs->dpms) (*crtc_funcs->dpms) (crtc, drm_helper_choose_crtc_dpms(crtc)); } } /* disable the unused connectors while restoring the modesetting */ __drm_helper_disable_unused_functions(dev); drm_modeset_unlock_all(dev); } EXPORT_SYMBOL(drm_helper_resume_force_mode); /** * drm_helper_force_disable_all - Forcibly turn off all enabled CRTCs * @dev: DRM device whose CRTCs to turn off * * Drivers may want to call this on unload to ensure that all displays are * unlit and the GPU is in a consistent, low power state. Takes modeset locks. * * Note: This should only be used by non-atomic legacy drivers. For an atomic * version look at drm_atomic_helper_shutdown(). * * Returns: * Zero on success, error code on failure. */ int drm_helper_force_disable_all(struct drm_device *dev) { struct drm_crtc *crtc; int ret = 0; drm_modeset_lock_all(dev); drm_for_each_crtc(crtc, dev) if (crtc->enabled) { struct drm_mode_set set = { .crtc = crtc, }; ret = drm_mode_set_config_internal(&set); if (ret) goto out; } out: drm_modeset_unlock_all(dev); return ret; } EXPORT_SYMBOL(drm_helper_force_disable_all); |
69 67 69 84 78 10 8 85 84 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 | // SPDX-License-Identifier: MIT /* * Copyright (C) 2019 Google, Inc. * * Authors: * Sean Paul <seanpaul@chromium.org> */ #include <linux/average.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_connector.h> #include <drm/drm_crtc.h> #include <drm/drm_device.h> #include <drm/drm_mode_config.h> #include <drm/drm_modeset_lock.h> #include <drm/drm_print.h> #include <drm/drm_self_refresh_helper.h> /** * DOC: overview * * This helper library provides an easy way for drivers to leverage the atomic * framework to implement panel self refresh (SR) support. Drivers are * responsible for initializing and cleaning up the SR helpers on load/unload * (see &drm_self_refresh_helper_init/&drm_self_refresh_helper_cleanup). * The connector is responsible for setting * &drm_connector_state.self_refresh_aware to true at runtime if it is SR-aware * (meaning it knows how to initiate self refresh on the panel). * * Once a crtc has enabled SR using &drm_self_refresh_helper_init, the * helpers will monitor activity and call back into the driver to enable/disable * SR as appropriate. The best way to think about this is that it's a DPMS * on/off request with &drm_crtc_state.self_refresh_active set in crtc state * that tells you to disable/enable SR on the panel instead of power-cycling it. * * During SR, drivers may choose to fully disable their crtc/encoder/bridge * hardware (in which case no driver changes are necessary), or they can inspect * &drm_crtc_state.self_refresh_active if they want to enter low power mode * without full disable (in case full disable/enable is too slow). * * SR will be deactivated if there are any atomic updates affecting the * pipe that is in SR mode. If a crtc is driving multiple connectors, all * connectors must be SR aware and all will enter/exit SR mode at the same time. * * If the crtc and connector are SR aware, but the panel connected does not * support it (or is otherwise unable to enter SR), the driver should fail * atomic_check when &drm_crtc_state.self_refresh_active is true. */ #define SELF_REFRESH_AVG_SEED_MS 200 DECLARE_EWMA(psr_time, 4, 4) struct drm_self_refresh_data { struct drm_crtc *crtc; struct delayed_work entry_work; struct mutex avg_mutex; struct ewma_psr_time entry_avg_ms; struct ewma_psr_time exit_avg_ms; }; static void drm_self_refresh_helper_entry_work(struct work_struct *work) { struct drm_self_refresh_data *sr_data = container_of( to_delayed_work(work), struct drm_self_refresh_data, entry_work); struct drm_crtc *crtc = sr_data->crtc; struct drm_device *dev = crtc->dev; struct drm_modeset_acquire_ctx ctx; struct drm_atomic_state *state; struct drm_connector *conn; struct drm_connector_state *conn_state; struct drm_crtc_state *crtc_state; int i, ret = 0; drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_drop_locks; } retry: state->acquire_ctx = &ctx; crtc_state = drm_atomic_get_crtc_state(state, crtc); if (IS_ERR(crtc_state)) { ret = PTR_ERR(crtc_state); goto out; } if (!crtc_state->enable) goto out; ret = drm_atomic_add_affected_connectors(state, crtc); if (ret) goto out; for_each_new_connector_in_state(state, conn, conn_state, i) { if (!conn_state->self_refresh_aware) goto out; } crtc_state->active = false; crtc_state->self_refresh_active = true; ret = drm_atomic_commit(state); if (ret) goto out; out: if (ret == -EDEADLK) { drm_atomic_state_clear(state); ret = drm_modeset_backoff(&ctx); if (!ret) goto retry; } drm_atomic_state_put(state); out_drop_locks: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); } /** * drm_self_refresh_helper_update_avg_times - Updates a crtc's SR time averages * @state: the state which has just been applied to hardware * @commit_time_ms: the amount of time in ms that this commit took to complete * @new_self_refresh_mask: bitmask of crtc's that have self_refresh_active in * new state * * Called after &drm_mode_config_funcs.atomic_commit_tail, this function will * update the average entry/exit self refresh times on self refresh transitions. * These averages will be used when calculating how long to delay before * entering self refresh mode after activity. */ void drm_self_refresh_helper_update_avg_times(struct drm_atomic_state *state, unsigned int commit_time_ms, unsigned int new_self_refresh_mask) { struct drm_crtc *crtc; struct drm_crtc_state *old_crtc_state; int i; for_each_old_crtc_in_state(state, crtc, old_crtc_state, i) { bool new_self_refresh_active = new_self_refresh_mask & BIT(i); struct drm_self_refresh_data *sr_data = crtc->self_refresh_data; struct ewma_psr_time *time; if (old_crtc_state->self_refresh_active == new_self_refresh_active) continue; if (new_self_refresh_active) time = &sr_data->entry_avg_ms; else time = &sr_data->exit_avg_ms; mutex_lock(&sr_data->avg_mutex); ewma_psr_time_add(time, commit_time_ms); mutex_unlock(&sr_data->avg_mutex); } } EXPORT_SYMBOL(drm_self_refresh_helper_update_avg_times); /** * drm_self_refresh_helper_alter_state - Alters the atomic state for SR exit * @state: the state currently being checked * * Called at the end of atomic check. This function checks the state for flags * incompatible with self refresh exit and changes them. This is a bit * disingenuous since userspace is expecting one thing and we're giving it * another. However in order to keep self refresh entirely hidden from * userspace, this is required. * * At the end, we queue up the self refresh entry work so we can enter PSR after * the desired delay. */ void drm_self_refresh_helper_alter_state(struct drm_atomic_state *state) { struct drm_crtc *crtc; struct drm_crtc_state *crtc_state; int i; if (state->async_update || !state->allow_modeset) { for_each_old_crtc_in_state(state, crtc, crtc_state, i) { if (crtc_state->self_refresh_active) { state->async_update = false; state->allow_modeset = true; break; } } } for_each_new_crtc_in_state(state, crtc, crtc_state, i) { struct drm_self_refresh_data *sr_data; unsigned int delay; /* Don't trigger the entry timer when we're already in SR */ if (crtc_state->self_refresh_active) continue; sr_data = crtc->self_refresh_data; if (!sr_data) continue; mutex_lock(&sr_data->avg_mutex); delay = (ewma_psr_time_read(&sr_data->entry_avg_ms) + ewma_psr_time_read(&sr_data->exit_avg_ms)) * 2; mutex_unlock(&sr_data->avg_mutex); mod_delayed_work(system_wq, &sr_data->entry_work, msecs_to_jiffies(delay)); } } EXPORT_SYMBOL(drm_self_refresh_helper_alter_state); /** * drm_self_refresh_helper_init - Initializes self refresh helpers for a crtc * @crtc: the crtc which supports self refresh supported displays * * Returns zero if successful or -errno on failure */ int drm_self_refresh_helper_init(struct drm_crtc *crtc) { struct drm_self_refresh_data *sr_data = crtc->self_refresh_data; /* Helper is already initialized */ if (WARN_ON(sr_data)) return -EINVAL; sr_data = kzalloc(sizeof(*sr_data), GFP_KERNEL); if (!sr_data) return -ENOMEM; INIT_DELAYED_WORK(&sr_data->entry_work, drm_self_refresh_helper_entry_work); sr_data->crtc = crtc; mutex_init(&sr_data->avg_mutex); ewma_psr_time_init(&sr_data->entry_avg_ms); ewma_psr_time_init(&sr_data->exit_avg_ms); /* * Seed the averages so they're non-zero (and sufficiently large * for even poorly performing panels). As time goes on, this will be * averaged out and the values will trend to their true value. */ ewma_psr_time_add(&sr_data->entry_avg_ms, SELF_REFRESH_AVG_SEED_MS); ewma_psr_time_add(&sr_data->exit_avg_ms, SELF_REFRESH_AVG_SEED_MS); crtc->self_refresh_data = sr_data; return 0; } EXPORT_SYMBOL(drm_self_refresh_helper_init); /** * drm_self_refresh_helper_cleanup - Cleans up self refresh helpers for a crtc * @crtc: the crtc to cleanup */ void drm_self_refresh_helper_cleanup(struct drm_crtc *crtc) { struct drm_self_refresh_data *sr_data = crtc->self_refresh_data; /* Helper is already uninitialized */ if (!sr_data) return; crtc->self_refresh_data = NULL; cancel_delayed_work_sync(&sr_data->entry_work); kfree(sr_data); } EXPORT_SYMBOL(drm_self_refresh_helper_cleanup); |
9 9 50261 50257 50257 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #include <linux/sched/debug.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/kexec.h> #include <linux/sysfs.h> #include <linux/bug.h> #include <linux/nmi.h> #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", [ ESTACK_VC ] = "#VC", [ ESTACK_VC2 ] = "#VC2", }; const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_TASK) return "TASK"; if (type == STACK_TYPE_IRQ) return "IRQ"; if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; if (type == STACK_TYPE_ENTRY) { /* * On 64-bit, we have a generic entry stack that we * use for all the kernel entry points, including * SYSENTER. */ return "ENTRY_TRAMPOLINE"; } if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; return NULL; } /** * struct estack_pages - Page descriptor for exception stacks * @offs: Offset from the start of the exception stack area * @size: Size of the exception stack * @type: Type to store in the stack_info struct */ struct estack_pages { u32 offs; u16 size; u16 type; }; #define EPAGERANGE(st) \ [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ .offs = CEA_ESTACK_OFFS(st), \ .size = CEA_ESTACK_SIZE(st), \ .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } /* * Array of exception stack page descriptors. If the stack is larger than * PAGE_SIZE, all pages covering a particular stack will have the same * info. The guard pages including the not mapped DB2 stack are zeroed * out. */ static const struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(DF), EPAGERANGE(NMI), EPAGERANGE(DB), EPAGERANGE(MCE), EPAGERANGE(VC), EPAGERANGE(VC2), }; static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info) { unsigned long begin, end, stk = (unsigned long)stack; const struct estack_pages *ep; struct pt_regs *regs; unsigned int k; BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); /* * Handle the case where stack trace is collected _before_ * cea_exception_stacks had been initialized. */ if (!begin) return false; end = begin + sizeof(struct cea_exception_stacks); /* Bail if @stack is outside the exception stack area. */ if (stk < begin || stk >= end) return false; /* Calc page offset from start of exception stacks */ k = (stk - begin) >> PAGE_SHIFT; /* Lookup the page descriptor */ ep = &estack_pages[k]; /* Guard page? */ if (!ep->size) return false; begin += (unsigned long)ep->offs; end = begin + (unsigned long)ep->size; regs = (struct pt_regs *)end - 1; info->type = ep->type; info->begin = (unsigned long *)begin; info->end = (unsigned long *)end; info->next_sp = (unsigned long *)regs->sp; return true; } static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); unsigned long *begin; /* * @end points directly to the top most stack entry to avoid a -8 * adjustment in the stack switch hotpath. Adjust it back before * calculating @begin. */ end++; begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* * Due to the switching logic RSP can never be == @end because the * final operation is 'popq %rsp' which means after that RSP points * to the original stack and not to @end. */ if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; info->begin = begin; info->end = end; /* * The next stack pointer is stored at the top of the irq stack * before switching to the irq stack. Actual stack entries are all * below that. */ info->next_sp = (unsigned long *)*(end - 1); return true; } bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info) { if (in_task_stack(stack, task, info)) return true; if (task != current) return false; if (in_exception_stack(stack, info)) return true; if (in_irq_stack(stack, info)) return true; if (in_entry_stack(stack, info)) return true; return false; } int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask) { task = task ? : current; if (!stack) goto unknown; if (!get_stack_info_noinstr(stack, task, info)) goto unknown; /* * Make sure we don't iterate through any given stack more than once. * If it comes up a second time then there's something wrong going on: * just break out and report an unknown stack type. */ if (visit_mask) { if (*visit_mask & (1UL << info->type)) { if (task == current) printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; } *visit_mask |= 1UL << info->type; } return 0; unknown: info->type = STACK_TYPE_UNKNOWN; return -EINVAL; } |
2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 | // SPDX-License-Identifier: GPL-2.0-only /* * hwmon.c - part of lm_sensors, Linux kernel modules for hardware monitoring * * This file defines the sysfs class "hwmon", for use by sensors drivers. * * Copyright (C) 2005 Mark M. Hoffman <mhoffman@lightlink.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitops.h> #include <linux/device.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/hwmon.h> #include <linux/idr.h> #include <linux/kstrtox.h> #include <linux/list.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/property.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/thermal.h> #define CREATE_TRACE_POINTS #include <trace/events/hwmon.h> #define HWMON_ID_PREFIX "hwmon" #define HWMON_ID_FORMAT HWMON_ID_PREFIX "%d" struct hwmon_device { const char *name; const char *label; struct device dev; const struct hwmon_chip_info *chip; struct list_head tzdata; struct attribute_group group; const struct attribute_group **groups; }; #define to_hwmon_device(d) container_of(d, struct hwmon_device, dev) #define MAX_SYSFS_ATTR_NAME_LENGTH 32 struct hwmon_device_attribute { struct device_attribute dev_attr; const struct hwmon_ops *ops; enum hwmon_sensor_types type; u32 attr; int index; char name[MAX_SYSFS_ATTR_NAME_LENGTH]; }; #define to_hwmon_attr(d) \ container_of(d, struct hwmon_device_attribute, dev_attr) #define to_dev_attr(a) container_of(a, struct device_attribute, attr) /* * Thermal zone information */ struct hwmon_thermal_data { struct list_head node; /* hwmon tzdata list entry */ struct device *dev; /* Reference to hwmon device */ int index; /* sensor index */ struct thermal_zone_device *tzd;/* thermal zone device */ }; static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_hwmon_device(dev)->name); } static DEVICE_ATTR_RO(name); static ssize_t label_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", to_hwmon_device(dev)->label); } static DEVICE_ATTR_RO(label); static struct attribute *hwmon_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_label.attr, NULL }; static umode_t hwmon_dev_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n) { struct device *dev = kobj_to_dev(kobj); struct hwmon_device *hdev = to_hwmon_device(dev); if (attr == &dev_attr_name.attr && hdev->name == NULL) return 0; if (attr == &dev_attr_label.attr && hdev->label == NULL) return 0; return attr->mode; } static const struct attribute_group hwmon_dev_attr_group = { .attrs = hwmon_dev_attrs, .is_visible = hwmon_dev_attr_is_visible, }; static const struct attribute_group *hwmon_dev_attr_groups[] = { &hwmon_dev_attr_group, NULL }; static void hwmon_free_attrs(struct attribute **attrs) { int i; for (i = 0; attrs[i]; i++) { struct device_attribute *dattr = to_dev_attr(attrs[i]); struct hwmon_device_attribute *hattr = to_hwmon_attr(dattr); kfree(hattr); } kfree(attrs); } static void hwmon_dev_release(struct device *dev) { struct hwmon_device *hwdev = to_hwmon_device(dev); if (hwdev->group.attrs) hwmon_free_attrs(hwdev->group.attrs); kfree(hwdev->groups); kfree(hwdev->label); kfree(hwdev); } static struct class hwmon_class = { .name = "hwmon", .dev_groups = hwmon_dev_attr_groups, .dev_release = hwmon_dev_release, }; static DEFINE_IDA(hwmon_ida); /* Thermal zone handling */ /* * The complex conditional is necessary to avoid a cyclic dependency * between hwmon and thermal_sys modules. */ #ifdef CONFIG_THERMAL_OF static int hwmon_thermal_get_temp(struct thermal_zone_device *tz, int *temp) { struct hwmon_thermal_data *tdata = thermal_zone_device_priv(tz); struct hwmon_device *hwdev = to_hwmon_device(tdata->dev); int ret; long t; ret = hwdev->chip->ops->read(tdata->dev, hwmon_temp, hwmon_temp_input, tdata->index, &t); if (ret < 0) return ret; *temp = t; return 0; } static int hwmon_thermal_set_trips(struct thermal_zone_device *tz, int low, int high) { struct hwmon_thermal_data *tdata = thermal_zone_device_priv(tz); struct hwmon_device *hwdev = to_hwmon_device(tdata->dev); const struct hwmon_chip_info *chip = hwdev->chip; const struct hwmon_channel_info * const *info = chip->info; unsigned int i; int err; if (!chip->ops->write) return 0; for (i = 0; info[i] && info[i]->type != hwmon_temp; i++) continue; if (!info[i]) return 0; if (info[i]->config[tdata->index] & HWMON_T_MIN) { err = chip->ops->write(tdata->dev, hwmon_temp, hwmon_temp_min, tdata->index, low); if (err && err != -EOPNOTSUPP) return err; } if (info[i]->config[tdata->index] & HWMON_T_MAX) { err = chip->ops->write(tdata->dev, hwmon_temp, hwmon_temp_max, tdata->index, high); if (err && err != -EOPNOTSUPP) return err; } return 0; } static const struct thermal_zone_device_ops hwmon_thermal_ops = { .get_temp = hwmon_thermal_get_temp, .set_trips = hwmon_thermal_set_trips, }; static void hwmon_thermal_remove_sensor(void *data) { list_del(data); } static int hwmon_thermal_add_sensor(struct device *dev, int index) { struct hwmon_device *hwdev = to_hwmon_device(dev); struct hwmon_thermal_data *tdata; struct thermal_zone_device *tzd; int err; tdata = devm_kzalloc(dev, sizeof(*tdata), GFP_KERNEL); if (!tdata) return -ENOMEM; tdata->dev = dev; tdata->index = index; tzd = devm_thermal_of_zone_register(dev, index, tdata, &hwmon_thermal_ops); if (IS_ERR(tzd)) { if (PTR_ERR(tzd) != -ENODEV) return PTR_ERR(tzd); dev_info(dev, "temp%d_input not attached to any thermal zone\n", index + 1); devm_kfree(dev, tdata); return 0; } err = devm_add_action(dev, hwmon_thermal_remove_sensor, &tdata->node); if (err) return err; tdata->tzd = tzd; list_add(&tdata->node, &hwdev->tzdata); return 0; } static int hwmon_thermal_register_sensors(struct device *dev) { struct hwmon_device *hwdev = to_hwmon_device(dev); const struct hwmon_chip_info *chip = hwdev->chip; const struct hwmon_channel_info * const *info = chip->info; void *drvdata = dev_get_drvdata(dev); int i; for (i = 1; info[i]; i++) { int j; if (info[i]->type != hwmon_temp) continue; for (j = 0; info[i]->config[j]; j++) { int err; if (!(info[i]->config[j] & HWMON_T_INPUT) || !chip->ops->is_visible(drvdata, hwmon_temp, hwmon_temp_input, j)) continue; err = hwmon_thermal_add_sensor(dev, j); if (err) return err; } } return 0; } static void hwmon_thermal_notify(struct device *dev, int index) { struct hwmon_device *hwdev = to_hwmon_device(dev); struct hwmon_thermal_data *tzdata; list_for_each_entry(tzdata, &hwdev->tzdata, node) { if (tzdata->index == index) { thermal_zone_device_update(tzdata->tzd, THERMAL_EVENT_UNSPECIFIED); } } } #else static int hwmon_thermal_register_sensors(struct device *dev) { return 0; } static void hwmon_thermal_notify(struct device *dev, int index) { } #endif /* IS_REACHABLE(CONFIG_THERMAL) && ... */ static int hwmon_attr_base(enum hwmon_sensor_types type) { if (type == hwmon_in || type == hwmon_intrusion) return 0; return 1; } /* sysfs attribute management */ static ssize_t hwmon_attr_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); long val; int ret; ret = hattr->ops->read(dev, hattr->type, hattr->attr, hattr->index, &val); if (ret < 0) return ret; trace_hwmon_attr_show(hattr->index + hwmon_attr_base(hattr->type), hattr->name, val); return sprintf(buf, "%ld\n", val); } static ssize_t hwmon_attr_show_string(struct device *dev, struct device_attribute *devattr, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); enum hwmon_sensor_types type = hattr->type; const char *s; int ret; ret = hattr->ops->read_string(dev, hattr->type, hattr->attr, hattr->index, &s); if (ret < 0) return ret; trace_hwmon_attr_show_string(hattr->index + hwmon_attr_base(type), hattr->name, s); return sprintf(buf, "%s\n", s); } static ssize_t hwmon_attr_store(struct device *dev, struct device_attribute *devattr, const char *buf, size_t count) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); long val; int ret; ret = kstrtol(buf, 10, &val); if (ret < 0) return ret; ret = hattr->ops->write(dev, hattr->type, hattr->attr, hattr->index, val); if (ret < 0) return ret; trace_hwmon_attr_store(hattr->index + hwmon_attr_base(hattr->type), hattr->name, val); return count; } static bool is_string_attr(enum hwmon_sensor_types type, u32 attr) { return (type == hwmon_temp && attr == hwmon_temp_label) || (type == hwmon_in && attr == hwmon_in_label) || (type == hwmon_curr && attr == hwmon_curr_label) || (type == hwmon_power && attr == hwmon_power_label) || (type == hwmon_energy && attr == hwmon_energy_label) || (type == hwmon_humidity && attr == hwmon_humidity_label) || (type == hwmon_fan && attr == hwmon_fan_label); } static struct attribute *hwmon_genattr(const void *drvdata, enum hwmon_sensor_types type, u32 attr, int index, const char *template, const struct hwmon_ops *ops) { struct hwmon_device_attribute *hattr; struct device_attribute *dattr; struct attribute *a; umode_t mode; const char *name; bool is_string = is_string_attr(type, attr); /* The attribute is invisible if there is no template string */ if (!template) return ERR_PTR(-ENOENT); mode = ops->is_visible(drvdata, type, attr, index); if (!mode) return ERR_PTR(-ENOENT); if ((mode & 0444) && ((is_string && !ops->read_string) || (!is_string && !ops->read))) return ERR_PTR(-EINVAL); if ((mode & 0222) && !ops->write) return ERR_PTR(-EINVAL); hattr = kzalloc(sizeof(*hattr), GFP_KERNEL); if (!hattr) return ERR_PTR(-ENOMEM); if (type == hwmon_chip) { name = template; } else { scnprintf(hattr->name, sizeof(hattr->name), template, index + hwmon_attr_base(type)); name = hattr->name; } hattr->type = type; hattr->attr = attr; hattr->index = index; hattr->ops = ops; dattr = &hattr->dev_attr; dattr->show = is_string ? hwmon_attr_show_string : hwmon_attr_show; dattr->store = hwmon_attr_store; a = &dattr->attr; sysfs_attr_init(a); a->name = name; a->mode = mode; return a; } /* * Chip attributes are not attribute templates but actual sysfs attributes. * See hwmon_genattr() for special handling. */ static const char * const hwmon_chip_attrs[] = { [hwmon_chip_temp_reset_history] = "temp_reset_history", [hwmon_chip_in_reset_history] = "in_reset_history", [hwmon_chip_curr_reset_history] = "curr_reset_history", [hwmon_chip_power_reset_history] = "power_reset_history", [hwmon_chip_update_interval] = "update_interval", [hwmon_chip_alarms] = "alarms", [hwmon_chip_samples] = "samples", [hwmon_chip_curr_samples] = "curr_samples", [hwmon_chip_in_samples] = "in_samples", [hwmon_chip_power_samples] = "power_samples", [hwmon_chip_temp_samples] = "temp_samples", [hwmon_chip_beep_enable] = "beep_enable", }; static const char * const hwmon_temp_attr_templates[] = { [hwmon_temp_enable] = "temp%d_enable", [hwmon_temp_input] = "temp%d_input", [hwmon_temp_type] = "temp%d_type", [hwmon_temp_lcrit] = "temp%d_lcrit", [hwmon_temp_lcrit_hyst] = "temp%d_lcrit_hyst", [hwmon_temp_min] = "temp%d_min", [hwmon_temp_min_hyst] = "temp%d_min_hyst", [hwmon_temp_max] = "temp%d_max", [hwmon_temp_max_hyst] = "temp%d_max_hyst", [hwmon_temp_crit] = "temp%d_crit", [hwmon_temp_crit_hyst] = "temp%d_crit_hyst", [hwmon_temp_emergency] = "temp%d_emergency", [hwmon_temp_emergency_hyst] = "temp%d_emergency_hyst", [hwmon_temp_alarm] = "temp%d_alarm", [hwmon_temp_lcrit_alarm] = "temp%d_lcrit_alarm", [hwmon_temp_min_alarm] = "temp%d_min_alarm", [hwmon_temp_max_alarm] = "temp%d_max_alarm", [hwmon_temp_crit_alarm] = "temp%d_crit_alarm", [hwmon_temp_emergency_alarm] = "temp%d_emergency_alarm", [hwmon_temp_fault] = "temp%d_fault", [hwmon_temp_offset] = "temp%d_offset", [hwmon_temp_label] = "temp%d_label", [hwmon_temp_lowest] = "temp%d_lowest", [hwmon_temp_highest] = "temp%d_highest", [hwmon_temp_reset_history] = "temp%d_reset_history", [hwmon_temp_rated_min] = "temp%d_rated_min", [hwmon_temp_rated_max] = "temp%d_rated_max", [hwmon_temp_beep] = "temp%d_beep", }; static const char * const hwmon_in_attr_templates[] = { [hwmon_in_enable] = "in%d_enable", [hwmon_in_input] = "in%d_input", [hwmon_in_min] = "in%d_min", [hwmon_in_max] = "in%d_max", [hwmon_in_lcrit] = "in%d_lcrit", [hwmon_in_crit] = "in%d_crit", [hwmon_in_average] = "in%d_average", [hwmon_in_lowest] = "in%d_lowest", [hwmon_in_highest] = "in%d_highest", [hwmon_in_reset_history] = "in%d_reset_history", [hwmon_in_label] = "in%d_label", [hwmon_in_alarm] = "in%d_alarm", [hwmon_in_min_alarm] = "in%d_min_alarm", [hwmon_in_max_alarm] = "in%d_max_alarm", [hwmon_in_lcrit_alarm] = "in%d_lcrit_alarm", [hwmon_in_crit_alarm] = "in%d_crit_alarm", [hwmon_in_rated_min] = "in%d_rated_min", [hwmon_in_rated_max] = "in%d_rated_max", [hwmon_in_beep] = "in%d_beep", [hwmon_in_fault] = "in%d_fault", }; static const char * const hwmon_curr_attr_templates[] = { [hwmon_curr_enable] = "curr%d_enable", [hwmon_curr_input] = "curr%d_input", [hwmon_curr_min] = "curr%d_min", [hwmon_curr_max] = "curr%d_max", [hwmon_curr_lcrit] = "curr%d_lcrit", [hwmon_curr_crit] = "curr%d_crit", [hwmon_curr_average] = "curr%d_average", [hwmon_curr_lowest] = "curr%d_lowest", [hwmon_curr_highest] = "curr%d_highest", [hwmon_curr_reset_history] = "curr%d_reset_history", [hwmon_curr_label] = "curr%d_label", [hwmon_curr_alarm] = "curr%d_alarm", [hwmon_curr_min_alarm] = "curr%d_min_alarm", [hwmon_curr_max_alarm] = "curr%d_max_alarm", [hwmon_curr_lcrit_alarm] = "curr%d_lcrit_alarm", [hwmon_curr_crit_alarm] = "curr%d_crit_alarm", [hwmon_curr_rated_min] = "curr%d_rated_min", [hwmon_curr_rated_max] = "curr%d_rated_max", [hwmon_curr_beep] = "curr%d_beep", }; static const char * const hwmon_power_attr_templates[] = { [hwmon_power_enable] = "power%d_enable", [hwmon_power_average] = "power%d_average", [hwmon_power_average_interval] = "power%d_average_interval", [hwmon_power_average_interval_max] = "power%d_interval_max", [hwmon_power_average_interval_min] = "power%d_interval_min", [hwmon_power_average_highest] = "power%d_average_highest", [hwmon_power_average_lowest] = "power%d_average_lowest", [hwmon_power_average_max] = "power%d_average_max", [hwmon_power_average_min] = "power%d_average_min", [hwmon_power_input] = "power%d_input", [hwmon_power_input_highest] = "power%d_input_highest", [hwmon_power_input_lowest] = "power%d_input_lowest", [hwmon_power_reset_history] = "power%d_reset_history", [hwmon_power_accuracy] = "power%d_accuracy", [hwmon_power_cap] = "power%d_cap", [hwmon_power_cap_hyst] = "power%d_cap_hyst", [hwmon_power_cap_max] = "power%d_cap_max", [hwmon_power_cap_min] = "power%d_cap_min", [hwmon_power_min] = "power%d_min", [hwmon_power_max] = "power%d_max", [hwmon_power_lcrit] = "power%d_lcrit", [hwmon_power_crit] = "power%d_crit", [hwmon_power_label] = "power%d_label", [hwmon_power_alarm] = "power%d_alarm", [hwmon_power_cap_alarm] = "power%d_cap_alarm", [hwmon_power_min_alarm] = "power%d_min_alarm", [hwmon_power_max_alarm] = "power%d_max_alarm", [hwmon_power_lcrit_alarm] = "power%d_lcrit_alarm", [hwmon_power_crit_alarm] = "power%d_crit_alarm", [hwmon_power_rated_min] = "power%d_rated_min", [hwmon_power_rated_max] = "power%d_rated_max", }; static const char * const hwmon_energy_attr_templates[] = { [hwmon_energy_enable] = "energy%d_enable", [hwmon_energy_input] = "energy%d_input", [hwmon_energy_label] = "energy%d_label", }; static const char * const hwmon_humidity_attr_templates[] = { [hwmon_humidity_enable] = "humidity%d_enable", [hwmon_humidity_input] = "humidity%d_input", [hwmon_humidity_label] = "humidity%d_label", [hwmon_humidity_min] = "humidity%d_min", [hwmon_humidity_min_hyst] = "humidity%d_min_hyst", [hwmon_humidity_max] = "humidity%d_max", [hwmon_humidity_max_hyst] = "humidity%d_max_hyst", [hwmon_humidity_alarm] = "humidity%d_alarm", [hwmon_humidity_fault] = "humidity%d_fault", [hwmon_humidity_rated_min] = "humidity%d_rated_min", [hwmon_humidity_rated_max] = "humidity%d_rated_max", [hwmon_humidity_min_alarm] = "humidity%d_min_alarm", [hwmon_humidity_max_alarm] = "humidity%d_max_alarm", }; static const char * const hwmon_fan_attr_templates[] = { [hwmon_fan_enable] = "fan%d_enable", [hwmon_fan_input] = "fan%d_input", [hwmon_fan_label] = "fan%d_label", [hwmon_fan_min] = "fan%d_min", [hwmon_fan_max] = "fan%d_max", [hwmon_fan_div] = "fan%d_div", [hwmon_fan_pulses] = "fan%d_pulses", [hwmon_fan_target] = "fan%d_target", [hwmon_fan_alarm] = "fan%d_alarm", [hwmon_fan_min_alarm] = "fan%d_min_alarm", [hwmon_fan_max_alarm] = "fan%d_max_alarm", [hwmon_fan_fault] = "fan%d_fault", [hwmon_fan_beep] = "fan%d_beep", }; static const char * const hwmon_pwm_attr_templates[] = { [hwmon_pwm_input] = "pwm%d", [hwmon_pwm_enable] = "pwm%d_enable", [hwmon_pwm_mode] = "pwm%d_mode", [hwmon_pwm_freq] = "pwm%d_freq", [hwmon_pwm_auto_channels_temp] = "pwm%d_auto_channels_temp", }; static const char * const hwmon_intrusion_attr_templates[] = { [hwmon_intrusion_alarm] = "intrusion%d_alarm", [hwmon_intrusion_beep] = "intrusion%d_beep", }; static const char * const *__templates[] = { [hwmon_chip] = hwmon_chip_attrs, [hwmon_temp] = hwmon_temp_attr_templates, [hwmon_in] = hwmon_in_attr_templates, [hwmon_curr] = hwmon_curr_attr_templates, [hwmon_power] = hwmon_power_attr_templates, [hwmon_energy] = hwmon_energy_attr_templates, [hwmon_humidity] = hwmon_humidity_attr_templates, [hwmon_fan] = hwmon_fan_attr_templates, [hwmon_pwm] = hwmon_pwm_attr_templates, [hwmon_intrusion] = hwmon_intrusion_attr_templates, }; static const int __templates_size[] = { [hwmon_chip] = ARRAY_SIZE(hwmon_chip_attrs), [hwmon_temp] = ARRAY_SIZE(hwmon_temp_attr_templates), [hwmon_in] = ARRAY_SIZE(hwmon_in_attr_templates), [hwmon_curr] = ARRAY_SIZE(hwmon_curr_attr_templates), [hwmon_power] = ARRAY_SIZE(hwmon_power_attr_templates), [hwmon_energy] = ARRAY_SIZE(hwmon_energy_attr_templates), [hwmon_humidity] = ARRAY_SIZE(hwmon_humidity_attr_templates), [hwmon_fan] = ARRAY_SIZE(hwmon_fan_attr_templates), [hwmon_pwm] = ARRAY_SIZE(hwmon_pwm_attr_templates), [hwmon_intrusion] = ARRAY_SIZE(hwmon_intrusion_attr_templates), }; int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel) { char event[MAX_SYSFS_ATTR_NAME_LENGTH + 5]; char sattr[MAX_SYSFS_ATTR_NAME_LENGTH]; char *envp[] = { event, NULL }; const char * const *templates; const char *template; int base; if (type >= ARRAY_SIZE(__templates)) return -EINVAL; if (attr >= __templates_size[type]) return -EINVAL; templates = __templates[type]; template = templates[attr]; base = hwmon_attr_base(type); scnprintf(sattr, MAX_SYSFS_ATTR_NAME_LENGTH, template, base + channel); scnprintf(event, sizeof(event), "NAME=%s", sattr); sysfs_notify(&dev->kobj, NULL, sattr); kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp); if (type == hwmon_temp) hwmon_thermal_notify(dev, channel); return 0; } EXPORT_SYMBOL_GPL(hwmon_notify_event); static int hwmon_num_channel_attrs(const struct hwmon_channel_info *info) { int i, n; for (i = n = 0; info->config[i]; i++) n += hweight32(info->config[i]); return n; } static int hwmon_genattrs(const void *drvdata, struct attribute **attrs, const struct hwmon_ops *ops, const struct hwmon_channel_info *info) { const char * const *templates; int template_size; int i, aindex = 0; if (info->type >= ARRAY_SIZE(__templates)) return -EINVAL; templates = __templates[info->type]; template_size = __templates_size[info->type]; for (i = 0; info->config[i]; i++) { u32 attr_mask = info->config[i]; u32 attr; while (attr_mask) { struct attribute *a; attr = __ffs(attr_mask); attr_mask &= ~BIT(attr); if (attr >= template_size) return -EINVAL; a = hwmon_genattr(drvdata, info->type, attr, i, templates[attr], ops); if (IS_ERR(a)) { if (PTR_ERR(a) != -ENOENT) return PTR_ERR(a); continue; } attrs[aindex++] = a; } } return aindex; } static struct attribute ** __hwmon_create_attrs(const void *drvdata, const struct hwmon_chip_info *chip) { int ret, i, aindex = 0, nattrs = 0; struct attribute **attrs; for (i = 0; chip->info[i]; i++) nattrs += hwmon_num_channel_attrs(chip->info[i]); if (nattrs == 0) return ERR_PTR(-EINVAL); attrs = kcalloc(nattrs + 1, sizeof(*attrs), GFP_KERNEL); if (!attrs) return ERR_PTR(-ENOMEM); for (i = 0; chip->info[i]; i++) { ret = hwmon_genattrs(drvdata, &attrs[aindex], chip->ops, chip->info[i]); if (ret < 0) { hwmon_free_attrs(attrs); return ERR_PTR(ret); } aindex += ret; } return attrs; } static struct device * __hwmon_device_register(struct device *dev, const char *name, void *drvdata, const struct hwmon_chip_info *chip, const struct attribute_group **groups) { struct hwmon_device *hwdev; const char *label; struct device *hdev; struct device *tdev = dev; int i, err, id; /* Complain about invalid characters in hwmon name attribute */ if (name && (!strlen(name) || strpbrk(name, "-* \t\n"))) dev_warn(dev, "hwmon: '%s' is not a valid name attribute, please fix\n", name); id = ida_alloc(&hwmon_ida, GFP_KERNEL); if (id < 0) return ERR_PTR(id); hwdev = kzalloc(sizeof(*hwdev), GFP_KERNEL); if (hwdev == NULL) { err = -ENOMEM; goto ida_remove; } hdev = &hwdev->dev; if (chip) { struct attribute **attrs; int ngroups = 2; /* terminating NULL plus &hwdev->groups */ if (groups) for (i = 0; groups[i]; i++) ngroups++; hwdev->groups = kcalloc(ngroups, sizeof(*groups), GFP_KERNEL); if (!hwdev->groups) { err = -ENOMEM; goto free_hwmon; } attrs = __hwmon_create_attrs(drvdata, chip); if (IS_ERR(attrs)) { err = PTR_ERR(attrs); goto free_hwmon; } hwdev->group.attrs = attrs; ngroups = 0; hwdev->groups[ngroups++] = &hwdev->group; if (groups) { for (i = 0; groups[i]; i++) hwdev->groups[ngroups++] = groups[i]; } hdev->groups = hwdev->groups; } else { hdev->groups = groups; } if (dev && device_property_present(dev, "label")) { err = device_property_read_string(dev, "label", &label); if (err < 0) goto free_hwmon; hwdev->label = kstrdup(label, GFP_KERNEL); if (hwdev->label == NULL) { err = -ENOMEM; goto free_hwmon; } } hwdev->name = name; hdev->class = &hwmon_class; hdev->parent = dev; while (tdev && !tdev->of_node) tdev = tdev->parent; hdev->of_node = tdev ? tdev->of_node : NULL; hwdev->chip = chip; dev_set_drvdata(hdev, drvdata); dev_set_name(hdev, HWMON_ID_FORMAT, id); err = device_register(hdev); if (err) { put_device(hdev); goto ida_remove; } INIT_LIST_HEAD(&hwdev->tzdata); if (hdev->of_node && chip && chip->ops->read && chip->info[0]->type == hwmon_chip && (chip->info[0]->config[0] & HWMON_C_REGISTER_TZ)) { err = hwmon_thermal_register_sensors(hdev); if (err) { device_unregister(hdev); /* * Don't worry about hwdev; hwmon_dev_release(), called * from device_unregister(), will free it. */ goto ida_remove; } } return hdev; free_hwmon: hwmon_dev_release(hdev); ida_remove: ida_free(&hwmon_ida, id); return ERR_PTR(err); } /** * hwmon_device_register_with_groups - register w/ hwmon * @dev: the parent device * @name: hwmon name attribute * @drvdata: driver data to attach to created device * @groups: List of attribute groups to create * * hwmon_device_unregister() must be called when the device is no * longer needed. * * Returns the pointer to the new device. */ struct device * hwmon_device_register_with_groups(struct device *dev, const char *name, void *drvdata, const struct attribute_group **groups) { if (!name) return ERR_PTR(-EINVAL); return __hwmon_device_register(dev, name, drvdata, NULL, groups); } EXPORT_SYMBOL_GPL(hwmon_device_register_with_groups); /** * hwmon_device_register_with_info - register w/ hwmon * @dev: the parent device (mandatory) * @name: hwmon name attribute (mandatory) * @drvdata: driver data to attach to created device (optional) * @chip: pointer to hwmon chip information (mandatory) * @extra_groups: pointer to list of additional non-standard attribute groups * (optional) * * hwmon_device_unregister() must be called when the device is no * longer needed. * * Returns the pointer to the new device. */ struct device * hwmon_device_register_with_info(struct device *dev, const char *name, void *drvdata, const struct hwmon_chip_info *chip, const struct attribute_group **extra_groups) { if (!dev || !name || !chip) return ERR_PTR(-EINVAL); if (!chip->ops || !chip->ops->is_visible || !chip->info) return ERR_PTR(-EINVAL); return __hwmon_device_register(dev, name, drvdata, chip, extra_groups); } EXPORT_SYMBOL_GPL(hwmon_device_register_with_info); /** * hwmon_device_register_for_thermal - register hwmon device for thermal subsystem * @dev: the parent device * @name: hwmon name attribute * @drvdata: driver data to attach to created device * * The use of this function is restricted. It is provided for legacy reasons * and must only be called from the thermal subsystem. * * hwmon_device_unregister() must be called when the device is no * longer needed. * * Returns the pointer to the new device. */ struct device * hwmon_device_register_for_thermal(struct device *dev, const char *name, void *drvdata) { if (!name || !dev) return ERR_PTR(-EINVAL); return __hwmon_device_register(dev, name, drvdata, NULL, NULL); } EXPORT_SYMBOL_NS_GPL(hwmon_device_register_for_thermal, HWMON_THERMAL); /** * hwmon_device_register - register w/ hwmon * @dev: the device to register * * hwmon_device_unregister() must be called when the device is no * longer needed. * * Returns the pointer to the new device. */ struct device *hwmon_device_register(struct device *dev) { dev_warn(dev, "hwmon_device_register() is deprecated. Please convert the driver to use hwmon_device_register_with_info().\n"); return __hwmon_device_register(dev, NULL, NULL, NULL, NULL); } EXPORT_SYMBOL_GPL(hwmon_device_register); /** * hwmon_device_unregister - removes the previously registered class device * * @dev: the class device to destroy */ void hwmon_device_unregister(struct device *dev) { int id; if (likely(sscanf(dev_name(dev), HWMON_ID_FORMAT, &id) == 1)) { device_unregister(dev); ida_free(&hwmon_ida, id); } else dev_dbg(dev->parent, "hwmon_device_unregister() failed: bad class ID!\n"); } EXPORT_SYMBOL_GPL(hwmon_device_unregister); static void devm_hwmon_release(struct device *dev, void *res) { struct device *hwdev = *(struct device **)res; hwmon_device_unregister(hwdev); } /** * devm_hwmon_device_register_with_groups - register w/ hwmon * @dev: the parent device * @name: hwmon name attribute * @drvdata: driver data to attach to created device * @groups: List of attribute groups to create * * Returns the pointer to the new device. The new device is automatically * unregistered with the parent device. */ struct device * devm_hwmon_device_register_with_groups(struct device *dev, const char *name, void *drvdata, const struct attribute_group **groups) { struct device **ptr, *hwdev; if (!dev) return ERR_PTR(-EINVAL); ptr = devres_alloc(devm_hwmon_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); hwdev = hwmon_device_register_with_groups(dev, name, drvdata, groups); if (IS_ERR(hwdev)) goto error; *ptr = hwdev; devres_add(dev, ptr); return hwdev; error: devres_free(ptr); return hwdev; } EXPORT_SYMBOL_GPL(devm_hwmon_device_register_with_groups); /** * devm_hwmon_device_register_with_info - register w/ hwmon * @dev: the parent device * @name: hwmon name attribute * @drvdata: driver data to attach to created device * @chip: pointer to hwmon chip information * @extra_groups: pointer to list of driver specific attribute groups * * Returns the pointer to the new device. The new device is automatically * unregistered with the parent device. */ struct device * devm_hwmon_device_register_with_info(struct device *dev, const char *name, void *drvdata, const struct hwmon_chip_info *chip, const struct attribute_group **extra_groups) { struct device **ptr, *hwdev; if (!dev) return ERR_PTR(-EINVAL); ptr = devres_alloc(devm_hwmon_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); hwdev = hwmon_device_register_with_info(dev, name, drvdata, chip, extra_groups); if (IS_ERR(hwdev)) goto error; *ptr = hwdev; devres_add(dev, ptr); return hwdev; error: devres_free(ptr); return hwdev; } EXPORT_SYMBOL_GPL(devm_hwmon_device_register_with_info); static int devm_hwmon_match(struct device *dev, void *res, void *data) { struct device **hwdev = res; return *hwdev == data; } /** * devm_hwmon_device_unregister - removes a previously registered hwmon device * * @dev: the parent device of the device to unregister */ void devm_hwmon_device_unregister(struct device *dev) { WARN_ON(devres_release(dev, devm_hwmon_release, devm_hwmon_match, dev)); } EXPORT_SYMBOL_GPL(devm_hwmon_device_unregister); static char *__hwmon_sanitize_name(struct device *dev, const char *old_name) { char *name, *p; if (dev) name = devm_kstrdup(dev, old_name, GFP_KERNEL); else name = kstrdup(old_name, GFP_KERNEL); if (!name) return ERR_PTR(-ENOMEM); for (p = name; *p; p++) if (hwmon_is_bad_char(*p)) *p = '_'; return name; } /** * hwmon_sanitize_name - Replaces invalid characters in a hwmon name * @name: NUL-terminated name * * Allocates a new string where any invalid characters will be replaced * by an underscore. It is the responsibility of the caller to release * the memory. * * Returns newly allocated name, or ERR_PTR on error. */ char *hwmon_sanitize_name(const char *name) { return __hwmon_sanitize_name(NULL, name); } EXPORT_SYMBOL_GPL(hwmon_sanitize_name); /** * devm_hwmon_sanitize_name - resource managed hwmon_sanitize_name() * @dev: device to allocate memory for * @name: NUL-terminated name * * Allocates a new string where any invalid characters will be replaced * by an underscore. * * Returns newly allocated name, or ERR_PTR on error. */ char *devm_hwmon_sanitize_name(struct device *dev, const char *name) { if (!dev) return ERR_PTR(-EINVAL); return __hwmon_sanitize_name(dev, name); } EXPORT_SYMBOL_GPL(devm_hwmon_sanitize_name); static void __init hwmon_pci_quirks(void) { #if defined CONFIG_X86 && defined CONFIG_PCI struct pci_dev *sb; u16 base; u8 enable; /* Open access to 0x295-0x296 on MSI MS-7031 */ sb = pci_get_device(PCI_VENDOR_ID_ATI, 0x436c, NULL); if (sb) { if (sb->subsystem_vendor == 0x1462 && /* MSI */ sb->subsystem_device == 0x0031) { /* MS-7031 */ pci_read_config_byte(sb, 0x48, &enable); pci_read_config_word(sb, 0x64, &base); if (base == 0 && !(enable & BIT(2))) { dev_info(&sb->dev, "Opening wide generic port at 0x295\n"); pci_write_config_word(sb, 0x64, 0x295); pci_write_config_byte(sb, 0x48, enable | BIT(2)); } } pci_dev_put(sb); } #endif } static int __init hwmon_init(void) { int err; hwmon_pci_quirks(); err = class_register(&hwmon_class); if (err) { pr_err("couldn't register hwmon sysfs class\n"); return err; } return 0; } static void __exit hwmon_exit(void) { class_unregister(&hwmon_class); } subsys_initcall(hwmon_init); module_exit(hwmon_exit); MODULE_AUTHOR("Mark M. Hoffman <mhoffman@lightlink.com>"); MODULE_DESCRIPTION("hardware monitoring sysfs/class support"); MODULE_LICENSE("GPL"); |
14 14 11 11 11 11 11 6 5 4 5 5 5 5 81 82 80 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2006, Johannes Berg <johannes@sipsolutions.net> */ /* just for IFNAMSIZ */ #include <linux/if.h> #include <linux/slab.h> #include <linux/export.h> #include "led.h" void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { if (!atomic_read(&local->assoc_led_active)) return; if (associated) led_trigger_event(&local->assoc_led, LED_FULL); else led_trigger_event(&local->assoc_led, LED_OFF); } void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { if (!atomic_read(&local->radio_led_active)) return; if (enabled) led_trigger_event(&local->radio_led, LED_FULL); else led_trigger_event(&local->radio_led, LED_OFF); } void ieee80211_alloc_led_names(struct ieee80211_local *local) { local->rx_led.name = kasprintf(GFP_KERNEL, "%srx", wiphy_name(local->hw.wiphy)); local->tx_led.name = kasprintf(GFP_KERNEL, "%stx", wiphy_name(local->hw.wiphy)); local->assoc_led.name = kasprintf(GFP_KERNEL, "%sassoc", wiphy_name(local->hw.wiphy)); local->radio_led.name = kasprintf(GFP_KERNEL, "%sradio", wiphy_name(local->hw.wiphy)); } void ieee80211_free_led_names(struct ieee80211_local *local) { kfree(local->rx_led.name); kfree(local->tx_led.name); kfree(local->assoc_led.name); kfree(local->radio_led.name); } static int ieee80211_tx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_inc(&local->tx_led_active); return 0; } static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_dec(&local->tx_led_active); } static int ieee80211_rx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); atomic_inc(&local->rx_led_active); return 0; } static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); atomic_dec(&local->rx_led_active); } static int ieee80211_assoc_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_inc(&local->assoc_led_active); return 0; } static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_dec(&local->assoc_led_active); } static int ieee80211_radio_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_inc(&local->radio_led_active); return 0; } static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_dec(&local->radio_led_active); } static int ieee80211_tpt_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_inc(&local->tpt_led_active); return 0; } static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_dec(&local->tpt_led_active); } void ieee80211_led_init(struct ieee80211_local *local) { atomic_set(&local->rx_led_active, 0); local->rx_led.activate = ieee80211_rx_led_activate; local->rx_led.deactivate = ieee80211_rx_led_deactivate; if (local->rx_led.name && led_trigger_register(&local->rx_led)) { kfree(local->rx_led.name); local->rx_led.name = NULL; } atomic_set(&local->tx_led_active, 0); local->tx_led.activate = ieee80211_tx_led_activate; local->tx_led.deactivate = ieee80211_tx_led_deactivate; if (local->tx_led.name && led_trigger_register(&local->tx_led)) { kfree(local->tx_led.name); local->tx_led.name = NULL; } atomic_set(&local->assoc_led_active, 0); local->assoc_led.activate = ieee80211_assoc_led_activate; local->assoc_led.deactivate = ieee80211_assoc_led_deactivate; if (local->assoc_led.name && led_trigger_register(&local->assoc_led)) { kfree(local->assoc_led.name); local->assoc_led.name = NULL; } atomic_set(&local->radio_led_active, 0); local->radio_led.activate = ieee80211_radio_led_activate; local->radio_led.deactivate = ieee80211_radio_led_deactivate; if (local->radio_led.name && led_trigger_register(&local->radio_led)) { kfree(local->radio_led.name); local->radio_led.name = NULL; } atomic_set(&local->tpt_led_active, 0); if (local->tpt_led_trigger) { local->tpt_led.activate = ieee80211_tpt_led_activate; local->tpt_led.deactivate = ieee80211_tpt_led_deactivate; if (led_trigger_register(&local->tpt_led)) { kfree(local->tpt_led_trigger); local->tpt_led_trigger = NULL; } } } void ieee80211_led_exit(struct ieee80211_local *local) { if (local->radio_led.name) led_trigger_unregister(&local->radio_led); if (local->assoc_led.name) led_trigger_unregister(&local->assoc_led); if (local->tx_led.name) led_trigger_unregister(&local->tx_led); if (local->rx_led.name) led_trigger_unregister(&local->rx_led); if (local->tpt_led_trigger) { led_trigger_unregister(&local->tpt_led); kfree(local->tpt_led_trigger); } } const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->radio_led.name; } EXPORT_SYMBOL(__ieee80211_get_radio_led_name); const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->assoc_led.name; } EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->tx_led.name; } EXPORT_SYMBOL(__ieee80211_get_tx_led_name); const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->rx_led.name; } EXPORT_SYMBOL(__ieee80211_get_rx_led_name); static unsigned long tpt_trig_traffic(struct ieee80211_local *local, struct tpt_led_trigger *tpt_trig) { unsigned long traffic, delta; traffic = tpt_trig->tx_bytes + tpt_trig->rx_bytes; delta = traffic - tpt_trig->prev_traffic; tpt_trig->prev_traffic = traffic; return DIV_ROUND_UP(delta, 1024 / 8); } static void tpt_trig_timer(struct timer_list *t) { struct tpt_led_trigger *tpt_trig = from_timer(tpt_trig, t, timer); struct ieee80211_local *local = tpt_trig->local; unsigned long on, off, tpt; int i; if (!tpt_trig->running) return; mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); tpt = tpt_trig_traffic(local, tpt_trig); /* default to just solid on */ on = 1; off = 0; for (i = tpt_trig->blink_table_len - 1; i >= 0; i--) { if (tpt_trig->blink_table[i].throughput < 0 || tpt > tpt_trig->blink_table[i].throughput) { off = tpt_trig->blink_table[i].blink_time / 2; on = tpt_trig->blink_table[i].blink_time - off; break; } } led_trigger_blink(&local->tpt_led, on, off); } const char * __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len) { struct ieee80211_local *local = hw_to_local(hw); struct tpt_led_trigger *tpt_trig; if (WARN_ON(local->tpt_led_trigger)) return NULL; tpt_trig = kzalloc(sizeof(struct tpt_led_trigger), GFP_KERNEL); if (!tpt_trig) return NULL; snprintf(tpt_trig->name, sizeof(tpt_trig->name), "%stpt", wiphy_name(local->hw.wiphy)); local->tpt_led.name = tpt_trig->name; tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; tpt_trig->want = flags; tpt_trig->local = local; timer_setup(&tpt_trig->timer, tpt_trig_timer, 0); local->tpt_led_trigger = tpt_trig; return tpt_trig->name; } EXPORT_SYMBOL(__ieee80211_create_tpt_led_trigger); static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (tpt_trig->running) return; /* reset traffic */ tpt_trig_traffic(local, tpt_trig); tpt_trig->running = true; tpt_trig_timer(&tpt_trig->timer); mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); } static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (!tpt_trig->running) return; tpt_trig->running = false; del_timer_sync(&tpt_trig->timer); led_trigger_event(&local->tpt_led, LED_OFF); } void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; bool allowed; WARN_ON(types_on & types_off); if (!tpt_trig) return; tpt_trig->active &= ~types_off; tpt_trig->active |= types_on; /* * Regardless of wanted state, we shouldn't blink when * the radio is disabled -- this can happen due to some * code ordering issues with __ieee80211_recalc_idle() * being called before the radio is started. */ allowed = tpt_trig->active & IEEE80211_TPT_LEDTRIG_FL_RADIO; if (!allowed || !(tpt_trig->active & tpt_trig->want)) ieee80211_stop_tpt_led_trig(local); else ieee80211_start_tpt_led_trig(local); } |
3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 | // SPDX-License-Identifier: GPL-2.0+ /* * HID driver for gaming keys on Logitech gaming keyboards (such as the G15) * * Copyright (c) 2019 Hans de Goede <hdegoede@redhat.com> */ #include <linux/device.h> #include <linux/hid.h> #include <linux/leds.h> #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/usb.h> #include <linux/wait.h> #include "hid-ids.h" #define LG_G15_TRANSFER_BUF_SIZE 20 #define LG_G15_FEATURE_REPORT 0x02 #define LG_G510_FEATURE_M_KEYS_LEDS 0x04 #define LG_G510_FEATURE_BACKLIGHT_RGB 0x05 #define LG_G510_FEATURE_POWER_ON_RGB 0x06 enum lg_g15_model { LG_G15, LG_G15_V2, LG_G510, LG_G510_USB_AUDIO, LG_Z10, }; enum lg_g15_led_type { LG_G15_KBD_BRIGHTNESS, LG_G15_LCD_BRIGHTNESS, LG_G15_BRIGHTNESS_MAX, LG_G15_MACRO_PRESET1 = 2, LG_G15_MACRO_PRESET2, LG_G15_MACRO_PRESET3, LG_G15_MACRO_RECORD, LG_G15_LED_MAX }; struct lg_g15_led { struct led_classdev cdev; enum led_brightness brightness; enum lg_g15_led_type led; u8 red, green, blue; }; struct lg_g15_data { /* Must be first for proper dma alignment */ u8 transfer_buf[LG_G15_TRANSFER_BUF_SIZE]; /* Protects the transfer_buf and led brightness */ struct mutex mutex; struct work_struct work; struct input_dev *input; struct hid_device *hdev; enum lg_g15_model model; struct lg_g15_led leds[LG_G15_LED_MAX]; bool game_mode_enabled; }; /******** G15 and G15 v2 LED functions ********/ static int lg_g15_update_led_brightness(struct lg_g15_data *g15) { int ret; ret = hid_hw_raw_request(g15->hdev, LG_G15_FEATURE_REPORT, g15->transfer_buf, 4, HID_FEATURE_REPORT, HID_REQ_GET_REPORT); if (ret != 4) { hid_err(g15->hdev, "Error getting LED brightness: %d\n", ret); return (ret < 0) ? ret : -EIO; } g15->leds[LG_G15_KBD_BRIGHTNESS].brightness = g15->transfer_buf[1]; g15->leds[LG_G15_LCD_BRIGHTNESS].brightness = g15->transfer_buf[2]; g15->leds[LG_G15_MACRO_PRESET1].brightness = !(g15->transfer_buf[3] & 0x01); g15->leds[LG_G15_MACRO_PRESET2].brightness = !(g15->transfer_buf[3] & 0x02); g15->leds[LG_G15_MACRO_PRESET3].brightness = !(g15->transfer_buf[3] & 0x04); g15->leds[LG_G15_MACRO_RECORD].brightness = !(g15->transfer_buf[3] & 0x08); return 0; } static enum led_brightness lg_g15_led_get(struct led_classdev *led_cdev) { struct lg_g15_led *g15_led = container_of(led_cdev, struct lg_g15_led, cdev); struct lg_g15_data *g15 = dev_get_drvdata(led_cdev->dev->parent); enum led_brightness brightness; mutex_lock(&g15->mutex); lg_g15_update_led_brightness(g15); brightness = g15->leds[g15_led->led].brightness; mutex_unlock(&g15->mutex); return brightness; } static int lg_g15_led_set(struct led_classdev *led_cdev, enum led_brightness brightness) { struct lg_g15_led *g15_led = container_of(led_cdev, struct lg_g15_led, cdev); struct lg_g15_data *g15 = dev_get_drvdata(led_cdev->dev->parent); u8 val, mask = 0; int i, ret; /* Ignore LED off on unregister / keyboard unplug */ if (led_cdev->flags & LED_UNREGISTERING) return 0; mutex_lock(&g15->mutex); g15->transfer_buf[0] = LG_G15_FEATURE_REPORT; g15->transfer_buf[3] = 0; if (g15_led->led < LG_G15_BRIGHTNESS_MAX) { g15->transfer_buf[1] = g15_led->led + 1; g15->transfer_buf[2] = brightness << (g15_led->led * 4); } else { for (i = LG_G15_MACRO_PRESET1; i < LG_G15_LED_MAX; i++) { if (i == g15_led->led) val = brightness; else val = g15->leds[i].brightness; if (val) mask |= 1 << (i - LG_G15_MACRO_PRESET1); } g15->transfer_buf[1] = 0x04; g15->transfer_buf[2] = ~mask; } ret = hid_hw_raw_request(g15->hdev, LG_G15_FEATURE_REPORT, g15->transfer_buf, 4, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); if (ret == 4) { /* Success */ g15_led->brightness = brightness; ret = 0; } else { hid_err(g15->hdev, "Error setting LED brightness: %d\n", ret); ret = (ret < 0) ? ret : -EIO; } mutex_unlock(&g15->mutex); return ret; } static void lg_g15_leds_changed_work(struct work_struct *work) { struct lg_g15_data *g15 = container_of(work, struct lg_g15_data, work); enum led_brightness old_brightness[LG_G15_BRIGHTNESS_MAX]; enum led_brightness brightness[LG_G15_BRIGHTNESS_MAX]; int i, ret; mutex_lock(&g15->mutex); for (i = 0; i < LG_G15_BRIGHTNESS_MAX; i++) old_brightness[i] = g15->leds[i].brightness; ret = lg_g15_update_led_brightness(g15); for (i = 0; i < LG_G15_BRIGHTNESS_MAX; i++) brightness[i] = g15->leds[i].brightness; mutex_unlock(&g15->mutex); if (ret) return; for (i = 0; i < LG_G15_BRIGHTNESS_MAX; i++) { if (brightness[i] == old_brightness[i]) continue; led_classdev_notify_brightness_hw_changed(&g15->leds[i].cdev, brightness[i]); } } /******** G510 LED functions ********/ static int lg_g510_get_initial_led_brightness(struct lg_g15_data *g15, int i) { int ret, high; ret = hid_hw_raw_request(g15->hdev, LG_G510_FEATURE_BACKLIGHT_RGB + i, g15->transfer_buf, 4, HID_FEATURE_REPORT, HID_REQ_GET_REPORT); if (ret != 4) { hid_err(g15->hdev, "Error getting LED brightness: %d\n", ret); return (ret < 0) ? ret : -EIO; } high = max3(g15->transfer_buf[1], g15->transfer_buf[2], g15->transfer_buf[3]); if (high) { g15->leds[i].red = DIV_ROUND_CLOSEST(g15->transfer_buf[1] * 255, high); g15->leds[i].green = DIV_ROUND_CLOSEST(g15->transfer_buf[2] * 255, high); g15->leds[i].blue = DIV_ROUND_CLOSEST(g15->transfer_buf[3] * 255, high); g15->leds[i].brightness = high; } else { g15->leds[i].red = 255; g15->leds[i].green = 255; g15->leds[i].blue = 255; g15->leds[i].brightness = 0; } return 0; } /* Must be called with g15->mutex locked */ static int lg_g510_kbd_led_write(struct lg_g15_data *g15, struct lg_g15_led *g15_led, enum led_brightness brightness) { int ret; g15->transfer_buf[0] = 5 + g15_led->led; g15->transfer_buf[1] = DIV_ROUND_CLOSEST(g15_led->red * brightness, 255); g15->transfer_buf[2] = DIV_ROUND_CLOSEST(g15_led->green * brightness, 255); g15->transfer_buf[3] = DIV_ROUND_CLOSEST(g15_led->blue * brightness, 255); ret = hid_hw_raw_request(g15->hdev, LG_G510_FEATURE_BACKLIGHT_RGB + g15_led->led, g15->transfer_buf, 4, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); if (ret == 4) { /* Success */ g15_led->brightness = brightness; ret = 0; } else { hid_err(g15->hdev, "Error setting LED brightness: %d\n", ret); ret = (ret < 0) ? ret : -EIO; } return ret; } static int lg_g510_kbd_led_set(struct led_classdev *led_cdev, enum led_brightness brightness) { struct lg_g15_led *g15_led = container_of(led_cdev, struct lg_g15_led, cdev); struct lg_g15_data *g15 = dev_get_drvdata(led_cdev->dev->parent); int ret; /* Ignore LED off on unregister / keyboard unplug */ if (led_cdev->flags & LED_UNREGISTERING) return 0; mutex_lock(&g15->mutex); ret = lg_g510_kbd_led_write(g15, g15_led, brightness); mutex_unlock(&g15->mutex); return ret; } static enum led_brightness lg_g510_kbd_led_get(struct led_classdev *led_cdev) { struct lg_g15_led *g15_led = container_of(led_cdev, struct lg_g15_led, cdev); return g15_led->brightness; } static ssize_t color_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct lg_g15_led *g15_led = container_of(led_cdev, struct lg_g15_led, cdev); struct lg_g15_data *g15 = dev_get_drvdata(led_cdev->dev->parent); unsigned long value; int ret; if (count < 7 || (count == 8 && buf[7] != '\n') || count > 8) return -EINVAL; if (buf[0] != '#') return -EINVAL; ret = kstrtoul(buf + 1, 16, &value); if (ret) return ret; mutex_lock(&g15->mutex); g15_led->red = (value & 0xff0000) >> 16; g15_led->green = (value & 0x00ff00) >> 8; g15_led->blue = (value & 0x0000ff); ret = lg_g510_kbd_led_write(g15, g15_led, g15_led->brightness); mutex_unlock(&g15->mutex); return (ret < 0) ? ret : count; } static ssize_t color_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); struct lg_g15_led *g15_led = container_of(led_cdev, struct lg_g15_led, cdev); struct lg_g15_data *g15 = dev_get_drvdata(led_cdev->dev->parent); ssize_t ret; mutex_lock(&g15->mutex); ret = sprintf(buf, "#%02x%02x%02x\n", g15_led->red, g15_led->green, g15_led->blue); mutex_unlock(&g15->mutex); return ret; } static DEVICE_ATTR_RW(color); static struct attribute *lg_g510_kbd_led_attrs[] = { &dev_attr_color.attr, NULL, }; static const struct attribute_group lg_g510_kbd_led_group = { .attrs = lg_g510_kbd_led_attrs, }; static const struct attribute_group *lg_g510_kbd_led_groups[] = { &lg_g510_kbd_led_group, NULL, }; static void lg_g510_leds_sync_work(struct work_struct *work) { struct lg_g15_data *g15 = container_of(work, struct lg_g15_data, work); mutex_lock(&g15->mutex); lg_g510_kbd_led_write(g15, &g15->leds[LG_G15_KBD_BRIGHTNESS], g15->leds[LG_G15_KBD_BRIGHTNESS].brightness); mutex_unlock(&g15->mutex); } static int lg_g510_update_mkey_led_brightness(struct lg_g15_data *g15) { int ret; ret = hid_hw_raw_request(g15->hdev, LG_G510_FEATURE_M_KEYS_LEDS, g15->transfer_buf, 2, HID_FEATURE_REPORT, HID_REQ_GET_REPORT); if (ret != 2) { hid_err(g15->hdev, "Error getting LED brightness: %d\n", ret); ret = (ret < 0) ? ret : -EIO; } g15->leds[LG_G15_MACRO_PRESET1].brightness = !!(g15->transfer_buf[1] & 0x80); g15->leds[LG_G15_MACRO_PRESET2].brightness = !!(g15->transfer_buf[1] & 0x40); g15->leds[LG_G15_MACRO_PRESET3].brightness = !!(g15->transfer_buf[1] & 0x20); g15->leds[LG_G15_MACRO_RECORD].brightness = !!(g15->transfer_buf[1] & 0x10); return 0; } static enum led_brightness lg_g510_mkey_led_get(struct led_classdev *led_cdev) { struct lg_g15_led *g15_led = container_of(led_cdev, struct lg_g15_led, cdev); struct lg_g15_data *g15 = dev_get_drvdata(led_cdev->dev->parent); enum led_brightness brightness; mutex_lock(&g15->mutex); lg_g510_update_mkey_led_brightness(g15); brightness = g15->leds[g15_led->led].brightness; mutex_unlock(&g15->mutex); return brightness; } static int lg_g510_mkey_led_set(struct led_classdev *led_cdev, enum led_brightness brightness) { struct lg_g15_led *g15_led = container_of(led_cdev, struct lg_g15_led, cdev); struct lg_g15_data *g15 = dev_get_drvdata(led_cdev->dev->parent); u8 val, mask = 0; int i, ret; /* Ignore LED off on unregister / keyboard unplug */ if (led_cdev->flags & LED_UNREGISTERING) return 0; mutex_lock(&g15->mutex); for (i = LG_G15_MACRO_PRESET1; i < LG_G15_LED_MAX; i++) { if (i == g15_led->led) val = brightness; else val = g15->leds[i].brightness; if (val) mask |= 0x80 >> (i - LG_G15_MACRO_PRESET1); } g15->transfer_buf[0] = LG_G510_FEATURE_M_KEYS_LEDS; g15->transfer_buf[1] = mask; ret = hid_hw_raw_request(g15->hdev, LG_G510_FEATURE_M_KEYS_LEDS, g15->transfer_buf, 2, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); if (ret == 2) { /* Success */ g15_led->brightness = brightness; ret = 0; } else { hid_err(g15->hdev, "Error setting LED brightness: %d\n", ret); ret = (ret < 0) ? ret : -EIO; } mutex_unlock(&g15->mutex); return ret; } /******** Generic LED functions ********/ static int lg_g15_get_initial_led_brightness(struct lg_g15_data *g15) { int ret; switch (g15->model) { case LG_G15: case LG_G15_V2: return lg_g15_update_led_brightness(g15); case LG_G510: case LG_G510_USB_AUDIO: ret = lg_g510_get_initial_led_brightness(g15, 0); if (ret) return ret; ret = lg_g510_get_initial_led_brightness(g15, 1); if (ret) return ret; return lg_g510_update_mkey_led_brightness(g15); case LG_Z10: /* * Getting the LCD backlight brightness is not supported. * Reading Feature(2) fails with -EPIPE and this crashes * the LCD and touch keys part of the speakers. */ return 0; } return -EINVAL; /* Never reached */ } /******** Input functions ********/ /* On the G15 Mark I Logitech has been quite creative with which bit is what */ static void lg_g15_handle_lcd_menu_keys(struct lg_g15_data *g15, u8 *data) { int i, val; /* Most left (round/display) button below the LCD */ input_report_key(g15->input, KEY_KBD_LCD_MENU1, data[8] & 0x80); /* 4 other buttons below the LCD */ for (i = 0; i < 4; i++) { val = data[i + 2] & 0x80; input_report_key(g15->input, KEY_KBD_LCD_MENU2 + i, val); } } static int lg_g15_event(struct lg_g15_data *g15, u8 *data) { int i, val; /* G1 - G6 */ for (i = 0; i < 6; i++) { val = data[i + 1] & (1 << i); input_report_key(g15->input, KEY_MACRO1 + i, val); } /* G7 - G12 */ for (i = 0; i < 6; i++) { val = data[i + 2] & (1 << i); input_report_key(g15->input, KEY_MACRO7 + i, val); } /* G13 - G17 */ for (i = 0; i < 5; i++) { val = data[i + 1] & (4 << i); input_report_key(g15->input, KEY_MACRO13 + i, val); } /* G18 */ input_report_key(g15->input, KEY_MACRO18, data[8] & 0x40); /* M1 - M3 */ for (i = 0; i < 3; i++) { val = data[i + 6] & (1 << i); input_report_key(g15->input, KEY_MACRO_PRESET1 + i, val); } /* MR */ input_report_key(g15->input, KEY_MACRO_RECORD_START, data[7] & 0x40); lg_g15_handle_lcd_menu_keys(g15, data); /* Backlight cycle button pressed? */ if (data[1] & 0x80) schedule_work(&g15->work); input_sync(g15->input); return 0; } static int lg_g15_v2_event(struct lg_g15_data *g15, u8 *data) { int i, val; /* G1 - G6 */ for (i = 0; i < 6; i++) { val = data[1] & (1 << i); input_report_key(g15->input, KEY_MACRO1 + i, val); } /* M1 - M3 + MR */ input_report_key(g15->input, KEY_MACRO_PRESET1, data[1] & 0x40); input_report_key(g15->input, KEY_MACRO_PRESET2, data[1] & 0x80); input_report_key(g15->input, KEY_MACRO_PRESET3, data[2] & 0x20); input_report_key(g15->input, KEY_MACRO_RECORD_START, data[2] & 0x40); /* Round button to the left of the LCD */ input_report_key(g15->input, KEY_KBD_LCD_MENU1, data[2] & 0x80); /* 4 buttons below the LCD */ for (i = 0; i < 4; i++) { val = data[2] & (2 << i); input_report_key(g15->input, KEY_KBD_LCD_MENU2 + i, val); } /* Backlight cycle button pressed? */ if (data[2] & 0x01) schedule_work(&g15->work); input_sync(g15->input); return 0; } static int lg_g510_event(struct lg_g15_data *g15, u8 *data) { bool game_mode_enabled; int i, val; /* G1 - G18 */ for (i = 0; i < 18; i++) { val = data[i / 8 + 1] & (1 << (i % 8)); input_report_key(g15->input, KEY_MACRO1 + i, val); } /* Game mode on/off slider */ game_mode_enabled = data[3] & 0x04; if (game_mode_enabled != g15->game_mode_enabled) { if (game_mode_enabled) hid_info(g15->hdev, "Game Mode enabled, Windows (super) key is disabled\n"); else hid_info(g15->hdev, "Game Mode disabled\n"); g15->game_mode_enabled = game_mode_enabled; } /* M1 - M3 */ for (i = 0; i < 3; i++) { val = data[3] & (0x10 << i); input_report_key(g15->input, KEY_MACRO_PRESET1 + i, val); } /* MR */ input_report_key(g15->input, KEY_MACRO_RECORD_START, data[3] & 0x80); /* LCD menu keys */ for (i = 0; i < 5; i++) { val = data[4] & (1 << i); input_report_key(g15->input, KEY_KBD_LCD_MENU1 + i, val); } /* Headphone Mute */ input_report_key(g15->input, KEY_MUTE, data[4] & 0x20); /* Microphone Mute */ input_report_key(g15->input, KEY_F20, data[4] & 0x40); input_sync(g15->input); return 0; } static int lg_g510_leds_event(struct lg_g15_data *g15, u8 *data) { bool backlight_disabled; /* * The G510 ignores backlight updates when the backlight is turned off * through the light toggle button on the keyboard, to work around this * we queue a workitem to sync values when the backlight is turned on. */ backlight_disabled = data[1] & 0x04; if (!backlight_disabled) schedule_work(&g15->work); return 0; } static int lg_g15_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct lg_g15_data *g15 = hid_get_drvdata(hdev); if (!g15) return 0; switch (g15->model) { case LG_G15: if (data[0] == 0x02 && size == 9) return lg_g15_event(g15, data); break; case LG_G15_V2: if (data[0] == 0x02 && size == 5) return lg_g15_v2_event(g15, data); break; case LG_Z10: if (data[0] == 0x02 && size == 9) { lg_g15_handle_lcd_menu_keys(g15, data); input_sync(g15->input); } break; case LG_G510: case LG_G510_USB_AUDIO: if (data[0] == 0x03 && size == 5) return lg_g510_event(g15, data); if (data[0] == 0x04 && size == 2) return lg_g510_leds_event(g15, data); break; } return 0; } static int lg_g15_input_open(struct input_dev *dev) { struct hid_device *hdev = input_get_drvdata(dev); return hid_hw_open(hdev); } static void lg_g15_input_close(struct input_dev *dev) { struct hid_device *hdev = input_get_drvdata(dev); hid_hw_close(hdev); } static int lg_g15_register_led(struct lg_g15_data *g15, int i, const char *name) { g15->leds[i].led = i; g15->leds[i].cdev.name = name; switch (g15->model) { case LG_G15: case LG_G15_V2: g15->leds[i].cdev.brightness_get = lg_g15_led_get; fallthrough; case LG_Z10: g15->leds[i].cdev.brightness_set_blocking = lg_g15_led_set; if (i < LG_G15_BRIGHTNESS_MAX) { g15->leds[i].cdev.flags = LED_BRIGHT_HW_CHANGED; g15->leds[i].cdev.max_brightness = 2; } else { g15->leds[i].cdev.max_brightness = 1; } break; case LG_G510: case LG_G510_USB_AUDIO: switch (i) { case LG_G15_LCD_BRIGHTNESS: /* * The G510 does not have a separate LCD brightness, * but it does have a separate power-on (reset) value. */ g15->leds[i].cdev.name = "g15::power_on_backlight_val"; fallthrough; case LG_G15_KBD_BRIGHTNESS: g15->leds[i].cdev.brightness_set_blocking = lg_g510_kbd_led_set; g15->leds[i].cdev.brightness_get = lg_g510_kbd_led_get; g15->leds[i].cdev.max_brightness = 255; g15->leds[i].cdev.groups = lg_g510_kbd_led_groups; break; default: g15->leds[i].cdev.brightness_set_blocking = lg_g510_mkey_led_set; g15->leds[i].cdev.brightness_get = lg_g510_mkey_led_get; g15->leds[i].cdev.max_brightness = 1; } break; } return devm_led_classdev_register(&g15->hdev->dev, &g15->leds[i].cdev); } /* Common input device init code shared between keyboards and Z-10 speaker handling */ static void lg_g15_init_input_dev(struct hid_device *hdev, struct input_dev *input, const char *name) { int i; input->name = name; input->phys = hdev->phys; input->uniq = hdev->uniq; input->id.bustype = hdev->bus; input->id.vendor = hdev->vendor; input->id.product = hdev->product; input->id.version = hdev->version; input->dev.parent = &hdev->dev; input->open = lg_g15_input_open; input->close = lg_g15_input_close; /* Keys below the LCD, intended for controlling a menu on the LCD */ for (i = 0; i < 5; i++) input_set_capability(input, EV_KEY, KEY_KBD_LCD_MENU1 + i); } static int lg_g15_probe(struct hid_device *hdev, const struct hid_device_id *id) { static const char * const led_names[] = { "g15::kbd_backlight", "g15::lcd_backlight", "g15::macro_preset1", "g15::macro_preset2", "g15::macro_preset3", "g15::macro_record", }; u8 gkeys_settings_output_report = 0; u8 gkeys_settings_feature_report = 0; struct hid_report_enum *rep_enum; unsigned int connect_mask = 0; bool has_ff000000 = false; struct lg_g15_data *g15; struct input_dev *input; struct hid_report *rep; int ret, i, gkeys = 0; hdev->quirks |= HID_QUIRK_INPUT_PER_APP; ret = hid_parse(hdev); if (ret) return ret; /* * Some models have multiple interfaces, we want the interface with * the f000.0000 application input report. */ rep_enum = &hdev->report_enum[HID_INPUT_REPORT]; list_for_each_entry(rep, &rep_enum->report_list, list) { if (rep->application == 0xff000000) has_ff000000 = true; } if (!has_ff000000) return hid_hw_start(hdev, HID_CONNECT_DEFAULT); g15 = devm_kzalloc(&hdev->dev, sizeof(*g15), GFP_KERNEL); if (!g15) return -ENOMEM; mutex_init(&g15->mutex); input = devm_input_allocate_device(&hdev->dev); if (!input) return -ENOMEM; g15->hdev = hdev; g15->model = id->driver_data; g15->input = input; input_set_drvdata(input, hdev); hid_set_drvdata(hdev, (void *)g15); switch (g15->model) { case LG_G15: INIT_WORK(&g15->work, lg_g15_leds_changed_work); /* * The G15 and G15 v2 use a separate usb-device (on a builtin * hub) which emulates a keyboard for the F1 - F12 emulation * on the G-keys, which we disable, rendering the emulated kbd * non-functional, so we do not let hid-input connect. */ connect_mask = HID_CONNECT_HIDRAW; gkeys_settings_output_report = 0x02; gkeys = 18; break; case LG_G15_V2: INIT_WORK(&g15->work, lg_g15_leds_changed_work); connect_mask = HID_CONNECT_HIDRAW; gkeys_settings_output_report = 0x02; gkeys = 6; break; case LG_G510: case LG_G510_USB_AUDIO: INIT_WORK(&g15->work, lg_g510_leds_sync_work); connect_mask = HID_CONNECT_HIDINPUT | HID_CONNECT_HIDRAW; gkeys_settings_feature_report = 0x01; gkeys = 18; break; case LG_Z10: connect_mask = HID_CONNECT_HIDRAW; break; } ret = hid_hw_start(hdev, connect_mask); if (ret) return ret; /* Tell the keyboard to stop sending F1-F12 + 1-6 for G1 - G18 */ if (gkeys_settings_output_report) { g15->transfer_buf[0] = gkeys_settings_output_report; memset(g15->transfer_buf + 1, 0, gkeys); /* * The kbd ignores our output report if we do not queue * an URB on the USB input endpoint first... */ ret = hid_hw_open(hdev); if (ret) goto error_hw_stop; ret = hid_hw_output_report(hdev, g15->transfer_buf, gkeys + 1); hid_hw_close(hdev); } if (gkeys_settings_feature_report) { g15->transfer_buf[0] = gkeys_settings_feature_report; memset(g15->transfer_buf + 1, 0, gkeys); ret = hid_hw_raw_request(g15->hdev, gkeys_settings_feature_report, g15->transfer_buf, gkeys + 1, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); } if (ret < 0) { hid_err(hdev, "Error %d disabling keyboard emulation for the G-keys, falling back to generic hid-input driver\n", ret); hid_set_drvdata(hdev, NULL); return 0; } /* Get initial brightness levels */ ret = lg_g15_get_initial_led_brightness(g15); if (ret) goto error_hw_stop; if (g15->model == LG_Z10) { lg_g15_init_input_dev(hdev, g15->input, "Logitech Z-10 LCD Menu Keys"); ret = input_register_device(g15->input); if (ret) goto error_hw_stop; ret = lg_g15_register_led(g15, 1, "z-10::lcd_backlight"); if (ret) goto error_hw_stop; return 0; /* All done */ } /* Setup and register input device */ lg_g15_init_input_dev(hdev, input, "Logitech Gaming Keyboard Gaming Keys"); /* G-keys */ for (i = 0; i < gkeys; i++) input_set_capability(input, EV_KEY, KEY_MACRO1 + i); /* M1 - M3 and MR keys */ for (i = 0; i < 3; i++) input_set_capability(input, EV_KEY, KEY_MACRO_PRESET1 + i); input_set_capability(input, EV_KEY, KEY_MACRO_RECORD_START); /* * On the G510 only report headphone and mic mute keys when *not* using * the builtin USB audio device. When the builtin audio is used these * keys directly toggle mute (and the LEDs) on/off. */ if (g15->model == LG_G510) { input_set_capability(input, EV_KEY, KEY_MUTE); /* Userspace expects F20 for micmute */ input_set_capability(input, EV_KEY, KEY_F20); } ret = input_register_device(input); if (ret) goto error_hw_stop; /* Register LED devices */ for (i = 0; i < LG_G15_LED_MAX; i++) { ret = lg_g15_register_led(g15, i, led_names[i]); if (ret) goto error_hw_stop; } return 0; error_hw_stop: hid_hw_stop(hdev); return ret; } static const struct hid_device_id lg_g15_devices[] = { /* The G11 is a G15 without the LCD, treat it as a G15 */ { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G11), .driver_data = LG_G15 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G15_LCD), .driver_data = LG_G15 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G15_V2_LCD), .driver_data = LG_G15_V2 }, /* G510 without a headset plugged in */ { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G510), .driver_data = LG_G510 }, /* G510 with headset plugged in / with extra USB audio interface */ { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G510_USB_AUDIO), .driver_data = LG_G510_USB_AUDIO }, /* Z-10 speakers */ { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_Z_10_SPK), .driver_data = LG_Z10 }, { } }; MODULE_DEVICE_TABLE(hid, lg_g15_devices); static struct hid_driver lg_g15_driver = { .name = "lg-g15", .id_table = lg_g15_devices, .raw_event = lg_g15_raw_event, .probe = lg_g15_probe, }; module_hid_driver(lg_g15_driver); MODULE_AUTHOR("Hans de Goede <hdegoede@redhat.com>"); MODULE_LICENSE("GPL"); |
100 99 96 18 147 101 101 19 19 19 1 17 18 17 19 6 6 6 6 2 1 5 19 19 19 19 19 145 132 132 131 22 22 19 8 67 68 39 17 27 23 23 18 6 12 60 51 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 | /* * net/tipc/msg.c: TIPC message header routines * * Copyright (c) 2000-2006, 2014-2015, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/sock.h> #include "core.h" #include "msg.h" #include "addr.h" #include "name_table.h" #include "crypto.h" #define BUF_ALIGN(x) ALIGN(x, 4) #define MAX_FORWARD_SIZE 1024 #ifdef CONFIG_TIPC_CRYPTO #define BUF_HEADROOM ALIGN(((LL_MAX_HEADER + 48) + EHDR_MAX_SIZE), 16) #define BUF_OVERHEAD (BUF_HEADROOM + TIPC_AES_GCM_TAG_SIZE) #else #define BUF_HEADROOM (LL_MAX_HEADER + 48) #define BUF_OVERHEAD BUF_HEADROOM #endif const int one_page_mtu = PAGE_SIZE - SKB_DATA_ALIGN(BUF_OVERHEAD) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); /** * tipc_buf_acquire - creates a TIPC message buffer * @size: message size (including TIPC header) * @gfp: memory allocation flags * * Return: a new buffer with data pointers set to the specified size. * * NOTE: * Headroom is reserved to allow prepending of a data link header. * There may also be unrequested tailroom present at the buffer's end. */ struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp) { struct sk_buff *skb; skb = alloc_skb_fclone(BUF_OVERHEAD + size, gfp); if (skb) { skb_reserve(skb, BUF_HEADROOM); skb_put(skb, size); skb->next = NULL; } return skb; } void tipc_msg_init(u32 own_node, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 dnode) { memset(m, 0, hsize); msg_set_version(m); msg_set_user(m, user); msg_set_hdr_sz(m, hsize); msg_set_size(m, hsize); msg_set_prevnode(m, own_node); msg_set_type(m, type); if (hsize > SHORT_H_SIZE) { msg_set_orignode(m, own_node); msg_set_destnode(m, dnode); } } struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode) { struct tipc_msg *msg; struct sk_buff *buf; buf = tipc_buf_acquire(hdr_sz + data_sz, GFP_ATOMIC); if (unlikely(!buf)) return NULL; msg = buf_msg(buf); tipc_msg_init(onode, msg, user, type, hdr_sz, dnode); msg_set_size(msg, hdr_sz + data_sz); msg_set_origport(msg, oport); msg_set_destport(msg, dport); msg_set_errcode(msg, errcode); return buf; } /* tipc_buf_append(): Append a buffer to the fragment list of another buffer * @*headbuf: in: NULL for first frag, otherwise value returned from prev call * out: set when successful non-complete reassembly, otherwise NULL * @*buf: in: the buffer to append. Always defined * out: head buf after successful complete reassembly, otherwise NULL * Returns 1 when reassembly complete, otherwise 0 */ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) { struct sk_buff *head = *headbuf; struct sk_buff *frag = *buf; struct sk_buff *tail = NULL; struct tipc_msg *msg; u32 fragid; int delta; bool headstolen; if (!frag) goto err; msg = buf_msg(frag); fragid = msg_type(msg); frag->next = NULL; skb_pull(frag, msg_hdr_sz(msg)); if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; if (skb_has_frag_list(frag) && __skb_linearize(frag)) goto err; *buf = NULL; frag = skb_unshare(frag, GFP_ATOMIC); if (unlikely(!frag)) goto err; head = *headbuf = frag; TIPC_SKB_CB(head)->tail = NULL; return 0; } if (!head) goto err; /* Either the input skb ownership is transferred to headskb * or the input skb is freed, clear the reference to avoid * bad access on error path. */ *buf = NULL; if (skb_try_coalesce(head, frag, &headstolen, &delta)) { kfree_skb_partial(frag, headstolen); } else { tail = TIPC_SKB_CB(head)->tail; if (!skb_has_frag_list(head)) skb_shinfo(head)->frag_list = frag; else tail->next = frag; head->truesize += frag->truesize; head->data_len += frag->len; head->len += frag->len; TIPC_SKB_CB(head)->tail = frag; } if (fragid == LAST_FRAGMENT) { TIPC_SKB_CB(head)->validated = 0; if (unlikely(!tipc_msg_validate(&head))) goto err; *buf = head; TIPC_SKB_CB(head)->tail = NULL; *headbuf = NULL; return 1; } return 0; err: kfree_skb(*buf); kfree_skb(*headbuf); *buf = *headbuf = NULL; return 0; } /** * tipc_msg_append(): Append data to tail of an existing buffer queue * @_hdr: header to be used * @m: the data to be appended * @mss: max allowable size of buffer * @dlen: size of data to be appended * @txq: queue to append to * * Return: the number of 1k blocks appended or errno value */ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq) { struct sk_buff *skb; int accounted, total, curr; int mlen, cpy, rem = dlen; struct tipc_msg *hdr; skb = skb_peek_tail(txq); accounted = skb ? msg_blocks(buf_msg(skb)) : 0; total = accounted; do { if (!skb || skb->len >= mss) { skb = tipc_buf_acquire(mss, GFP_KERNEL); if (unlikely(!skb)) return -ENOMEM; skb_orphan(skb); skb_trim(skb, MIN_H_SIZE); hdr = buf_msg(skb); skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE); msg_set_hdr_sz(hdr, MIN_H_SIZE); msg_set_size(hdr, MIN_H_SIZE); __skb_queue_tail(txq, skb); total += 1; } hdr = buf_msg(skb); curr = msg_blocks(hdr); mlen = msg_size(hdr); cpy = min_t(size_t, rem, mss - mlen); if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter)) return -EFAULT; msg_set_size(hdr, mlen + cpy); skb_put(skb, cpy); rem -= cpy; total += msg_blocks(hdr) - curr; } while (rem > 0); return total - accounted; } /* tipc_msg_validate - validate basic format of received message * * This routine ensures a TIPC message has an acceptable header, and at least * as much data as the header indicates it should. The routine also ensures * that the entire message header is stored in the main fragment of the message * buffer, to simplify future access to message header fields. * * Note: Having extra info present in the message header or data areas is OK. * TIPC will ignore the excess, under the assumption that it is optional info * introduced by a later release of the protocol. */ bool tipc_msg_validate(struct sk_buff **_skb) { struct sk_buff *skb = *_skb; struct tipc_msg *hdr; int msz, hsz; /* Ensure that flow control ratio condition is satisfied */ if (unlikely(skb->truesize / buf_roundup_len(skb) >= 4)) { skb = skb_copy_expand(skb, BUF_HEADROOM, 0, GFP_ATOMIC); if (!skb) return false; kfree_skb(*_skb); *_skb = skb; } if (unlikely(TIPC_SKB_CB(skb)->validated)) return true; if (unlikely(!pskb_may_pull(skb, MIN_H_SIZE))) return false; hsz = msg_hdr_sz(buf_msg(skb)); if (unlikely(hsz < MIN_H_SIZE) || (hsz > MAX_H_SIZE)) return false; if (unlikely(!pskb_may_pull(skb, hsz))) return false; hdr = buf_msg(skb); if (unlikely(msg_version(hdr) != TIPC_VERSION)) return false; msz = msg_size(hdr); if (unlikely(msz < hsz)) return false; if (unlikely((msz - hsz) > TIPC_MAX_USER_MSG_SIZE)) return false; if (unlikely(skb->len < msz)) return false; TIPC_SKB_CB(skb)->validated = 1; return true; } /** * tipc_msg_fragment - build a fragment skb list for TIPC message * * @skb: TIPC message skb * @hdr: internal msg header to be put on the top of the fragments * @pktmax: max size of a fragment incl. the header * @frags: returned fragment skb list * * Return: 0 if the fragmentation is successful, otherwise: -EINVAL * or -ENOMEM */ int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags) { int pktno, nof_fragms, dsz, dmax, eat; struct tipc_msg *_hdr; struct sk_buff *_skb; u8 *data; /* Non-linear buffer? */ if (skb_linearize(skb)) return -ENOMEM; data = (u8 *)skb->data; dsz = msg_size(buf_msg(skb)); dmax = pktmax - INT_H_SIZE; if (dsz <= dmax || !dmax) return -EINVAL; nof_fragms = dsz / dmax + 1; for (pktno = 1; pktno <= nof_fragms; pktno++) { if (pktno < nof_fragms) eat = dmax; else eat = dsz % dmax; /* Allocate a new fragment */ _skb = tipc_buf_acquire(INT_H_SIZE + eat, GFP_ATOMIC); if (!_skb) goto error; skb_orphan(_skb); __skb_queue_tail(frags, _skb); /* Copy header & data to the fragment */ skb_copy_to_linear_data(_skb, hdr, INT_H_SIZE); skb_copy_to_linear_data_offset(_skb, INT_H_SIZE, data, eat); data += eat; /* Update the fragment's header */ _hdr = buf_msg(_skb); msg_set_fragm_no(_hdr, pktno); msg_set_nof_fragms(_hdr, nof_fragms); msg_set_size(_hdr, INT_H_SIZE + eat); } return 0; error: __skb_queue_purge(frags); __skb_queue_head_init(frags); return -ENOMEM; } /** * tipc_msg_build - create buffer chain containing specified header and data * @mhdr: Message header, to be prepended to data * @m: User message * @offset: buffer offset for fragmented messages (FIXME) * @dsz: Total length of user data * @pktmax: Max packet size that can be used * @list: Buffer or chain of buffers to be returned to caller * * Note that the recursive call we are making here is safe, since it can * logically go only one further level down. * * Return: message data size or errno: -ENOMEM, -EFAULT */ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int pktmax, struct sk_buff_head *list) { int mhsz = msg_hdr_sz(mhdr); struct tipc_msg pkthdr; int msz = mhsz + dsz; int pktrem = pktmax; struct sk_buff *skb; int drem = dsz; int pktno = 1; char *pktpos; int pktsz; int rc; msg_set_size(mhdr, msz); /* No fragmentation needed? */ if (likely(msz <= pktmax)) { skb = tipc_buf_acquire(msz, GFP_KERNEL); /* Fall back to smaller MTU if node local message */ if (unlikely(!skb)) { if (pktmax != MAX_MSG_SIZE) return -ENOMEM; rc = tipc_msg_build(mhdr, m, offset, dsz, one_page_mtu, list); if (rc != dsz) return rc; if (tipc_msg_assemble(list)) return dsz; return -ENOMEM; } skb_orphan(skb); __skb_queue_tail(list, skb); skb_copy_to_linear_data(skb, mhdr, mhsz); pktpos = skb->data + mhsz; if (copy_from_iter_full(pktpos, dsz, &m->msg_iter)) return dsz; rc = -EFAULT; goto error; } /* Prepare reusable fragment header */ tipc_msg_init(msg_prevnode(mhdr), &pkthdr, MSG_FRAGMENTER, FIRST_FRAGMENT, INT_H_SIZE, msg_destnode(mhdr)); msg_set_size(&pkthdr, pktmax); msg_set_fragm_no(&pkthdr, pktno); msg_set_importance(&pkthdr, msg_importance(mhdr)); /* Prepare first fragment */ skb = tipc_buf_acquire(pktmax, GFP_KERNEL); if (!skb) return -ENOMEM; skb_orphan(skb); __skb_queue_tail(list, skb); pktpos = skb->data; skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE); pktpos += INT_H_SIZE; pktrem -= INT_H_SIZE; skb_copy_to_linear_data_offset(skb, INT_H_SIZE, mhdr, mhsz); pktpos += mhsz; pktrem -= mhsz; do { if (drem < pktrem) pktrem = drem; if (!copy_from_iter_full(pktpos, pktrem, &m->msg_iter)) { rc = -EFAULT; goto error; } drem -= pktrem; if (!drem) break; /* Prepare new fragment: */ if (drem < (pktmax - INT_H_SIZE)) pktsz = drem + INT_H_SIZE; else pktsz = pktmax; skb = tipc_buf_acquire(pktsz, GFP_KERNEL); if (!skb) { rc = -ENOMEM; goto error; } skb_orphan(skb); __skb_queue_tail(list, skb); msg_set_type(&pkthdr, FRAGMENT); msg_set_size(&pkthdr, pktsz); msg_set_fragm_no(&pkthdr, ++pktno); skb_copy_to_linear_data(skb, &pkthdr, INT_H_SIZE); pktpos = skb->data + INT_H_SIZE; pktrem = pktsz - INT_H_SIZE; } while (1); msg_set_type(buf_msg(skb), LAST_FRAGMENT); return dsz; error: __skb_queue_purge(list); __skb_queue_head_init(list); return rc; } /** * tipc_msg_bundle - Append contents of a buffer to tail of an existing one * @bskb: the bundle buffer to append to * @msg: message to be appended * @max: max allowable size for the bundle buffer * * Return: "true" if bundling has been performed, otherwise "false" */ static bool tipc_msg_bundle(struct sk_buff *bskb, struct tipc_msg *msg, u32 max) { struct tipc_msg *bmsg = buf_msg(bskb); u32 msz, bsz, offset, pad; msz = msg_size(msg); bsz = msg_size(bmsg); offset = BUF_ALIGN(bsz); pad = offset - bsz; if (unlikely(skb_tailroom(bskb) < (pad + msz))) return false; if (unlikely(max < (offset + msz))) return false; skb_put(bskb, pad + msz); skb_copy_to_linear_data_offset(bskb, offset, msg, msz); msg_set_size(bmsg, offset + msz); msg_set_msgcnt(bmsg, msg_msgcnt(bmsg) + 1); return true; } /** * tipc_msg_try_bundle - Try to bundle a new message to the last one * @tskb: the last/target message to which the new one will be appended * @skb: the new message skb pointer * @mss: max message size (header inclusive) * @dnode: destination node for the message * @new_bundle: if this call made a new bundle or not * * Return: "true" if the new message skb is potential for bundling this time or * later, in the case a bundling has been done this time, the skb is consumed * (the skb pointer = NULL). * Otherwise, "false" if the skb cannot be bundled at all. */ bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, u32 dnode, bool *new_bundle) { struct tipc_msg *msg, *inner, *outer; u32 tsz; /* First, check if the new buffer is suitable for bundling */ msg = buf_msg(*skb); if (msg_user(msg) == MSG_FRAGMENTER) return false; if (msg_user(msg) == TUNNEL_PROTOCOL) return false; if (msg_user(msg) == BCAST_PROTOCOL) return false; if (mss <= INT_H_SIZE + msg_size(msg)) return false; /* Ok, but the last/target buffer can be empty? */ if (unlikely(!tskb)) return true; /* Is it a bundle already? Try to bundle the new message to it */ if (msg_user(buf_msg(tskb)) == MSG_BUNDLER) { *new_bundle = false; goto bundle; } /* Make a new bundle of the two messages if possible */ tsz = msg_size(buf_msg(tskb)); if (unlikely(mss < BUF_ALIGN(INT_H_SIZE + tsz) + msg_size(msg))) return true; if (unlikely(pskb_expand_head(tskb, INT_H_SIZE, mss - tsz - INT_H_SIZE, GFP_ATOMIC))) return true; inner = buf_msg(tskb); skb_push(tskb, INT_H_SIZE); outer = buf_msg(tskb); tipc_msg_init(msg_prevnode(inner), outer, MSG_BUNDLER, 0, INT_H_SIZE, dnode); msg_set_importance(outer, msg_importance(inner)); msg_set_size(outer, INT_H_SIZE + tsz); msg_set_msgcnt(outer, 1); *new_bundle = true; bundle: if (likely(tipc_msg_bundle(tskb, msg, mss))) { consume_skb(*skb); *skb = NULL; } return true; } /** * tipc_msg_extract(): extract bundled inner packet from buffer * @skb: buffer to be extracted from. * @iskb: extracted inner buffer, to be returned * @pos: position in outer message of msg to be extracted. * Returns position of next msg. * Consumes outer buffer when last packet extracted * Return: true when there is an extracted buffer, otherwise false */ bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos) { struct tipc_msg *hdr, *ihdr; int imsz; *iskb = NULL; if (unlikely(skb_linearize(skb))) goto none; hdr = buf_msg(skb); if (unlikely(*pos > (msg_data_sz(hdr) - MIN_H_SIZE))) goto none; ihdr = (struct tipc_msg *)(msg_data(hdr) + *pos); imsz = msg_size(ihdr); if ((*pos + imsz) > msg_data_sz(hdr)) goto none; *iskb = tipc_buf_acquire(imsz, GFP_ATOMIC); if (!*iskb) goto none; skb_copy_to_linear_data(*iskb, ihdr, imsz); if (unlikely(!tipc_msg_validate(iskb))) goto none; *pos += BUF_ALIGN(imsz); return true; none: kfree_skb(skb); kfree_skb(*iskb); *iskb = NULL; return false; } /** * tipc_msg_reverse(): swap source and destination addresses and add error code * @own_node: originating node id for reversed message * @skb: buffer containing message to be reversed; will be consumed * @err: error code to be set in message, if any * Replaces consumed buffer with new one when successful * Return: true if success, otherwise false */ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) { struct sk_buff *_skb = *skb; struct tipc_msg *_hdr, *hdr; int hlen, dlen; if (skb_linearize(_skb)) goto exit; _hdr = buf_msg(_skb); dlen = min_t(uint, msg_data_sz(_hdr), MAX_FORWARD_SIZE); hlen = msg_hdr_sz(_hdr); if (msg_dest_droppable(_hdr)) goto exit; if (msg_errcode(_hdr)) goto exit; /* Never return SHORT header */ if (hlen == SHORT_H_SIZE) hlen = BASIC_H_SIZE; /* Don't return data along with SYN+, - sender has a clone */ if (msg_is_syn(_hdr) && err == TIPC_ERR_OVERLOAD) dlen = 0; /* Allocate new buffer to return */ *skb = tipc_buf_acquire(hlen + dlen, GFP_ATOMIC); if (!*skb) goto exit; memcpy((*skb)->data, _skb->data, msg_hdr_sz(_hdr)); memcpy((*skb)->data + hlen, msg_data(_hdr), dlen); /* Build reverse header in new buffer */ hdr = buf_msg(*skb); msg_set_hdr_sz(hdr, hlen); msg_set_errcode(hdr, err); msg_set_non_seq(hdr, 0); msg_set_origport(hdr, msg_destport(_hdr)); msg_set_destport(hdr, msg_origport(_hdr)); msg_set_destnode(hdr, msg_prevnode(_hdr)); msg_set_prevnode(hdr, own_node); msg_set_orignode(hdr, own_node); msg_set_size(hdr, hlen + dlen); skb_orphan(_skb); kfree_skb(_skb); return true; exit: kfree_skb(_skb); *skb = NULL; return false; } bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy) { struct sk_buff *skb, *_skb; skb_queue_walk(msg, skb) { _skb = skb_clone(skb, GFP_ATOMIC); if (!_skb) { __skb_queue_purge(cpy); pr_err_ratelimited("Failed to clone buffer chain\n"); return false; } __skb_queue_tail(cpy, _skb); } return true; } /** * tipc_msg_lookup_dest(): try to find new destination for named message * @net: pointer to associated network namespace * @skb: the buffer containing the message. * @err: error code to be used by caller if lookup fails * Does not consume buffer * Return: true if a destination is found, false otherwise */ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) { struct tipc_msg *msg = buf_msg(skb); u32 scope = msg_lookup_scope(msg); u32 self = tipc_own_addr(net); u32 inst = msg_nameinst(msg); struct tipc_socket_addr sk; struct tipc_uaddr ua; if (!msg_isdata(msg)) return false; if (!msg_named(msg)) return false; if (msg_errcode(msg)) return false; *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, scope, msg_nametype(msg), inst, inst); sk.node = tipc_scope2node(net, scope); if (!tipc_nametbl_lookup_anycast(net, &ua, &sk)) return false; msg_incr_reroute_cnt(msg); if (sk.node != self) msg_set_prevnode(msg, self); msg_set_destnode(msg, sk.node); msg_set_destport(msg, sk.ref); *err = TIPC_OK; return true; } /* tipc_msg_assemble() - assemble chain of fragments into one message */ bool tipc_msg_assemble(struct sk_buff_head *list) { struct sk_buff *skb, *tmp = NULL; if (skb_queue_len(list) == 1) return true; while ((skb = __skb_dequeue(list))) { skb->next = NULL; if (tipc_buf_append(&tmp, &skb)) { __skb_queue_tail(list, skb); return true; } if (!tmp) break; } __skb_queue_purge(list); __skb_queue_head_init(list); pr_warn("Failed do assemble buffer\n"); return false; } /* tipc_msg_reassemble() - clone a buffer chain of fragments and * reassemble the clones into one message */ bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq) { struct sk_buff *skb, *_skb; struct sk_buff *frag = NULL; struct sk_buff *head = NULL; int hdr_len; /* Copy header if single buffer */ if (skb_queue_len(list) == 1) { skb = skb_peek(list); hdr_len = skb_headroom(skb) + msg_hdr_sz(buf_msg(skb)); _skb = __pskb_copy(skb, hdr_len, GFP_ATOMIC); if (!_skb) return false; __skb_queue_tail(rcvq, _skb); return true; } /* Clone all fragments and reassemble */ skb_queue_walk(list, skb) { frag = skb_clone(skb, GFP_ATOMIC); if (!frag) goto error; frag->next = NULL; if (tipc_buf_append(&head, &frag)) break; if (!head) goto error; } __skb_queue_tail(rcvq, frag); return true; error: pr_warn("Failed do clone local mcast rcv buffer\n"); kfree_skb(head); return false; } bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy) { struct sk_buff *skb, *_skb; skb_queue_walk(msg, skb) { _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) { __skb_queue_purge(cpy); return false; } msg_set_destnode(buf_msg(_skb), dst); __skb_queue_tail(cpy, _skb); } return true; } /* tipc_skb_queue_sorted(); sort pkt into list according to sequence number * @list: list to be appended to * @seqno: sequence number of buffer to add * @skb: buffer to add */ bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb) { struct sk_buff *_skb, *tmp; if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) { __skb_queue_head(list, skb); return true; } if (more(seqno, buf_seqno(skb_peek_tail(list)))) { __skb_queue_tail(list, skb); return true; } skb_queue_walk_safe(list, _skb, tmp) { if (more(seqno, buf_seqno(_skb))) continue; if (seqno == buf_seqno(_skb)) break; __skb_queue_before(list, _skb, skb); return true; } kfree_skb(skb); return false; } void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq) { if (tipc_msg_reverse(tipc_own_addr(net), &skb, err)) __skb_queue_tail(xmitq, skb); } |
151 2 148 148 4 4 4 4 152 150 147 1 28 118 157 2 7 2 154 28 123 161 161 2 4 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 | /* * Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved. * Copyright (c) 2010 Voltaire Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ #include <linux/export.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <rdma/rdma_netlink.h> #include <linux/module.h> #include "core_priv.h" static struct { const struct rdma_nl_cbs *cb_table; /* Synchronizes between ongoing netlink commands and netlink client * unregistration. */ struct rw_semaphore sem; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; bool rdma_nl_chk_listeners(unsigned int group) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(&init_net); return netlink_has_listeners(rnet->nl_sock, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); static bool is_nl_msg_valid(unsigned int type, unsigned int op) { static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, }; /* * This BUILD_BUG_ON is intended to catch addition of new * RDMA netlink protocol without updating the array above. */ BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6); if (type >= RDMA_NL_NUM_CLIENTS) return false; return op < max_num_ops[type]; } static const struct rdma_nl_cbs * get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op) { const struct rdma_nl_cbs *cb_table; /* * Currently only NLDEV client is supporting netlink commands in * non init_net net namespace. */ if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV) return NULL; cb_table = READ_ONCE(rdma_nl_types[type].cb_table); if (!cb_table) { /* * Didn't get valid reference of the table, attempt module * load once. */ up_read(&rdma_nl_types[type].sem); request_module("rdma-netlink-subsys-%u", type); down_read(&rdma_nl_types[type].sem); cb_table = READ_ONCE(rdma_nl_types[type].cb_table); } if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) return NULL; return cb_table; } void rdma_nl_register(unsigned int index, const struct rdma_nl_cbs cb_table[]) { if (WARN_ON(!is_nl_msg_valid(index, 0)) || WARN_ON(READ_ONCE(rdma_nl_types[index].cb_table))) return; /* Pairs with the READ_ONCE in is_nl_valid() */ smp_store_release(&rdma_nl_types[index].cb_table, cb_table); } EXPORT_SYMBOL(rdma_nl_register); void rdma_nl_unregister(unsigned int index) { down_write(&rdma_nl_types[index].sem); rdma_nl_types[index].cb_table = NULL; up_write(&rdma_nl_types[index].sem); } EXPORT_SYMBOL(rdma_nl_unregister); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, int len, int client, int op, int flags) { *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags); if (!*nlh) return NULL; return nlmsg_data(*nlh); } EXPORT_SYMBOL(ibnl_put_msg); int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, int len, void *data, int type) { if (nla_put(skb, type, len, data)) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } return 0; } EXPORT_SYMBOL(ibnl_put_attr); static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int type = nlh->nlmsg_type; unsigned int index = RDMA_NL_GET_CLIENT(type); unsigned int op = RDMA_NL_GET_OP(type); const struct rdma_nl_cbs *cb_table; int err = -EINVAL; if (!is_nl_msg_valid(index, op)) return -EINVAL; down_read(&rdma_nl_types[index].sem); cb_table = get_cb_table(skb, index, op); if (!cb_table) goto done; if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && !netlink_capable(skb, CAP_NET_ADMIN)) { err = -EPERM; goto done; } /* * LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't * mistakenly call the .dump() function. */ if (index == RDMA_NL_LS) { if (cb_table[op].doit) err = cb_table[op].doit(skb, nlh, extack); goto done; } /* FIXME: Convert IWCM to properly handle doit callbacks */ if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) { struct netlink_dump_control c = { .dump = cb_table[op].dump, }; if (c.dump) err = netlink_dump_start(skb->sk, skb, nlh, &c); goto done; } if (cb_table[op].doit) err = cb_table[op].doit(skb, nlh, extack); done: up_read(&rdma_nl_types[index].sem); return err; } /* * This function is similar to netlink_rcv_skb with one exception: * It calls to the callback for the netlink messages without NLM_F_REQUEST * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed * for that consumer only. */ static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { struct netlink_ext_ack extack = {}; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; /* * Generally speaking, the only requests are handled * by the kernel, but RDMA_NL_LS is different, because it * runs backward netlink scheme. Kernel initiates messages * and waits for reply with data to keep pathrecord cache * in sync. */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) && (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS)) goto ack; /* Skip control messages */ if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } return 0; } static void rdma_nl_rcv(struct sk_buff *skb) { rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg); } int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; err = netlink_unicast(rnet->nl_sock, skb, pid, MSG_DONTWAIT); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast); int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); int err; err = netlink_unicast(rnet->nl_sock, skb, pid, 0); return (err < 0) ? err : 0; } EXPORT_SYMBOL(rdma_nl_unicast_wait); int rdma_nl_multicast(struct net *net, struct sk_buff *skb, unsigned int group, gfp_t flags) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); return nlmsg_multicast(rnet->nl_sock, skb, 0, group, flags); } EXPORT_SYMBOL(rdma_nl_multicast); void rdma_nl_init(void) { int idx; for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) init_rwsem(&rdma_nl_types[idx].sem); } void rdma_nl_exit(void) { int idx; for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) WARN(rdma_nl_types[idx].cb_table, "Netlink client %d wasn't released prior to unloading %s\n", idx, KBUILD_MODNAME); } int rdma_nl_net_init(struct rdma_dev_net *rnet) { struct net *net = read_pnet(&rnet->net); struct netlink_kernel_cfg cfg = { .input = rdma_nl_rcv, }; struct sock *nls; nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg); if (!nls) return -ENOMEM; nls->sk_sndtimeo = 10 * HZ; rnet->nl_sock = nls; return 0; } void rdma_nl_net_exit(struct rdma_dev_net *rnet) { netlink_kernel_release(rnet->nl_sock); } MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA); |
1122 199 12 4125 19 659 44 74 859 57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | /* SPDX-License-Identifier: GPL-2.0 */ /* * NUMA memory policies for Linux. * Copyright 2003,2004 Andi Kleen SuSE Labs */ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 #include <linux/sched.h> #include <linux/mmzone.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <uapi/linux/mempolicy.h> struct mm_struct; #define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ #ifdef CONFIG_NUMA /* * Describe a memory policy. * * A mempolicy can be either associated with a process or with a VMA. * For VMA related allocations the VMA policy is preferred, otherwise * the process policy is used. Interrupts ignore the memory policy * of the current process. * * Locking policy for interleave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on * mmap_lock. * * Freeing policy: * Mempolicy objects are reference counted. A mempolicy will be freed when * mpol_put() decrements the reference count to zero. * * Duplicating policy objects: * mpol_dup() allocates a new mempolicy and copies the specified mempolicy * to the new storage. The reference count of the new object is initialized * to 1, representing the caller of mpol_dup(). */ struct mempolicy { atomic_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; }; /* * Support for managing mempolicy data objects (clone, copy, destroy) * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. */ extern void __mpol_put(struct mempolicy *pol); static inline void mpol_put(struct mempolicy *pol) { if (pol) __mpol_put(pol); } /* * Does mempolicy pol need explicit unref after use? * Currently only needed for shared policies. */ static inline int mpol_needs_cond_ref(struct mempolicy *pol) { return (pol && (pol->flags & MPOL_F_SHARED)); } static inline void mpol_cond_put(struct mempolicy *pol) { if (mpol_needs_cond_ref(pol)) __mpol_put(pol); } extern struct mempolicy *__mpol_dup(struct mempolicy *pol); static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) pol = __mpol_dup(pol); return pol; } static inline void mpol_get(struct mempolicy *pol) { if (pol) atomic_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) return true; return __mpol_equal(a, b); } /* * Tree of shared policies for a shared memory region. */ struct shared_policy { struct rb_root root; rwlock_t lock; }; struct sp_node { struct rb_node nd; pgoff_t start, end; struct mempolicy *policy; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone && k != ZONE_MOVABLE) policy_zone = k; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else struct mempolicy {}; static inline struct mempolicy *get_task_policy(struct task_struct *p) { return NULL; } static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } static inline void mpol_put(struct mempolicy *pol) { } static inline void mpol_cond_put(struct mempolicy *pol) { } static inline void mpol_get(struct mempolicy *pol) { } struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { } static inline void mpol_free_shared_policy(struct shared_policy *sp) { } static inline struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { *ilx = 0; return NULL; } static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { return 0; } static inline void numa_policy_init(void) { } static inline void numa_default_policy(void) { } static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return 0; } static inline void check_highest_zone(int k) { } #ifdef CONFIG_TMPFS static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } #endif static inline int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { return -1; /* no node preference */ } static inline void mpol_put_task_policy(struct task_struct *task) { } static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; } #endif /* CONFIG_NUMA */ #endif |
57 57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | #ifndef IOU_REQ_REF_H #define IOU_REQ_REF_H #include <linux/atomic.h> #include <linux/io_uring_types.h> /* * Shamelessly stolen from the mm implementation of page reference checking, * see commit f958d7b528b1 for details. */ #define req_ref_zero_or_close_to_overflow(req) \ ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) static inline bool req_ref_inc_not_zero(struct io_kiocb *req) { WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); return atomic_inc_not_zero(&req->refs); } static inline bool req_ref_put_and_test(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_REFCOUNT))) return true; WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); return atomic_dec_and_test(&req->refs); } static inline void req_ref_get(struct io_kiocb *req) { WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); atomic_inc(&req->refs); } static inline void req_ref_put(struct io_kiocb *req) { WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); atomic_dec(&req->refs); } static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { req->flags |= REQ_F_REFCOUNT; atomic_set(&req->refs, nr); } } static inline void io_req_set_refcount(struct io_kiocb *req) { __io_req_set_refcount(req, 1); } #endif |
25 8 8631 3 8631 6275 8216 8394 4267 563 561 563 549 8387 7591 8388 322 1681 7424 7582 8394 7583 5 5 45 36 45 2 8615 8625 45 8388 2 2 645 8223 8215 796 8165 3244 3178 4950 6105 4852 8 2243 609 4405 5993 4476 4495 5828 563 515 128 563 8269 6532 7614 8279 8390 8052 8369 6881 4940 7882 756 7335 171 8390 7 179 171 164 176 321 7998 642 8357 511 8382 180 1152 911 557 1573 1577 42 1155 447 479 1154 8385 7 8382 8153 536 8389 5833 5325 8391 8389 8401 8320 5828 7694 6318 8405 1577 1579 1582 1582 5 5 5 5 5 5 5 5 16 1578 1576 4620 77 4817 8390 4628 4815 4819 3 1549 148 148 8388 8386 8385 7 8389 536 8149 8404 8385 8 2 8388 4 5 534 16 16 8146 8385 22 8405 8390 4 4 4 4 6573 3791 2188 1011 1582 1577 3 1579 1577 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 | // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu.c - percpu memory allocator * * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> * * Copyright (C) 2017 Facebook Inc. * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> * * The percpu allocator handles both static and dynamic areas. Percpu * areas are allocated in chunks which are divided into units. There is * a 1-to-1 mapping for units to possible cpus. These units are grouped * based on NUMA properties of the machine. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * * Allocation is done by offsets into a unit's address space. Ie., an * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear * and even sparse. Access is handled by configuring percpu base * registers according to the cpu to unit mappings and offsetting the * base address using pcpu_unit_size. * * There is special consideration for the first chunk which must handle * the static percpu variables in the kernel image as allocation services * are not online yet. In short, the first chunk is structured like so: * * <Static | [Reserved] | Dynamic> * * The static data is copied from the original section managed by the * linker. The reserved section, if non-zero, primarily manages static * percpu variables from kernel modules. Finally, the dynamic section * takes care of normal allocations. * * The allocator organizes chunks into lists according to free size and * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT * flag should be passed. All memcg-aware allocations are sharing one set * of chunks and all unaccounted allocations and allocations performed * by processes belonging to the root memory cgroup are using the second set. * * The allocator tries to allocate from the fullest chunk first. Each chunk * is managed by a bitmap with metadata blocks. The allocation map is updated * on every allocation and free to reflect the current state while the boundary * map is only updated on allocation. Each metadata block contains * information to help mitigate the need to iterate over large portions * of the bitmap. The reverse mapping from page to chunk is stored in * the page's index. Lastly, units are lazily backed and grow in unison. * * There is a unique conversion that goes on here between bytes and bits. * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk * tracks the number of pages it is responsible for in nr_pages. Helper * functions are used to convert from between the bytes, bits, and blocks. * All hints are managed in bits unless explicitly stated. * * To use this allocator, arch code should do the following: * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> #include <linux/cpumask.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/pfn.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/kmemleak.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/memcontrol.h> #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/io.h> #define CREATE_TRACE_POINTS #include <trace/events/percpu.h> #include "percpu-internal.h" /* * The slots are sorted by the size of the biggest continuous free area. * 1-31 bytes share the same slot. */ #define PCPU_SLOT_BASE_SHIFT 5 /* chunks in slots below this are subject to being sidelined on failed alloc */ #define PCPU_SLOT_FAIL_THRESHOLD 3 #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ (void __percpu *)((unsigned long)(addr) - \ (unsigned long)pcpu_base_addr + \ (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ (void __force *)((unsigned long)(ptr) + \ (unsigned long)pcpu_base_addr - \ (unsigned long)__per_cpu_start) #endif #else /* CONFIG_SMP */ /* on UP, it's always identity mapped */ #define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) #define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) #endif /* CONFIG_SMP */ static int pcpu_unit_pages __ro_after_init; static int pcpu_unit_size __ro_after_init; static int pcpu_nr_units __ro_after_init; static int pcpu_atom_size __ro_after_init; int pcpu_nr_slots __ro_after_init; static int pcpu_free_slot __ro_after_init; int pcpu_sidelined_slot __ro_after_init; int pcpu_to_depopulate_slot __ro_after_init; static size_t pcpu_chunk_struct_size __ro_after_init; /* cpus with the lowest and highest unit addresses */ static unsigned int pcpu_low_unit_cpu __ro_after_init; static unsigned int pcpu_high_unit_cpu __ro_after_init; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __ro_after_init; static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ /* group information, used for vm allocation */ static int pcpu_nr_groups __ro_after_init; static const unsigned long *pcpu_group_offsets __ro_after_init; static const size_t *pcpu_group_sizes __ro_after_init; /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different * ways and thus often doesn't live in the vmalloc area. */ struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first * chunk and serves it for reserved allocations. When the reserved * region doesn't exist, the following variable is NULL. */ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ /* * The number of empty populated pages, protected by pcpu_lock. * The reserved chunk doesn't contribute to the count. */ int pcpu_nr_empty_pop_pages; /* * The number of populated pages in use by the allocator, protected by * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets * allocated/deallocated, it is allocated/deallocated in all units of a chunk * and increments/decrements this count by 1). */ static unsigned long pcpu_nr_populated; /* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one * empty chunk. */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); static bool pcpu_async_enabled __read_mostly; static bool pcpu_atomic_alloc_failed; static void pcpu_schedule_balance_work(void) { if (pcpu_async_enabled) schedule_work(&pcpu_balance_work); } /** * pcpu_addr_in_chunk - check if the address is served from this chunk * @chunk: chunk of interest * @addr: percpu address * * RETURNS: * True if the address is served from this chunk. */ static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) { void *start_addr, *end_addr; if (!chunk) return false; start_addr = chunk->base_addr + chunk->start_offset; end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - chunk->end_offset; return addr >= start_addr && addr < end_addr; } static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } static int pcpu_size_to_slot(int size) { if (size == pcpu_unit_size) return pcpu_free_slot; return __pcpu_size_to_slot(size); } static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { const struct pcpu_block_md *chunk_md = &chunk->chunk_md; if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk_md->contig_hint == 0) return 0; return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); } /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { page->index = (unsigned long)pcpu; } /* obtain pointer to a chunk from a page struct */ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { return (struct pcpu_chunk *)page->index; } static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) { return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) { return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { return (unsigned long)chunk->base_addr + pcpu_unit_page_offset(cpu, page_idx); } /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. */ static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) { return chunk->alloc_map + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); } static unsigned long pcpu_off_to_block_index(int off) { return off / PCPU_BITMAP_BLOCK_BITS; } static unsigned long pcpu_off_to_block_off(int off) { return off & (PCPU_BITMAP_BLOCK_BITS - 1); } static unsigned long pcpu_block_off_to_off(int index, int off) { return index * PCPU_BITMAP_BLOCK_BITS + off; } /** * pcpu_check_block_hint - check against the contig hint * @block: block of interest * @bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * * Check to see if the allocation can fit in the block's contig hint. * Note, a chunk uses the same hints as a block so this can also check against * the chunk's contig hint. */ static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits, size_t align) { int bit_off = ALIGN(block->contig_hint_start, align) - block->contig_hint_start; return bit_off + bits <= block->contig_hint; } /* * pcpu_next_hint - determine which hint to use * @block: block of interest * @alloc_bits: size of allocation * * This determines if we should scan based on the scan_hint or first_free. * In general, we want to scan from first_free to fulfill allocations by * first fit. However, if we know a scan_hint at position scan_hint_start * cannot fulfill an allocation, we can begin scanning from there knowing * the contig_hint will be our fallback. */ static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits) { /* * The three conditions below determine if we can skip past the * scan_hint. First, does the scan hint exist. Second, is the * contig_hint after the scan_hint (possibly not true iff * contig_hint == scan_hint). Third, is the allocation request * larger than the scan_hint. */ if (block->scan_hint && block->contig_hint_start > block->scan_hint_start && alloc_bits > block->scan_hint) return block->scan_hint_start + block->scan_hint; return block->first_free; } /** * pcpu_next_md_free_region - finds the next hint free area * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Helper function for pcpu_for_each_md_free_region. It checks * block->contig_hint and performs aggregation across blocks to find the * next hint. It modifies bit_off and bits in-place to be consumed in the * loop. */ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; return; } /* * This checks three things. First is there a contig_hint to * check. Second, have we checked this hint before by * comparing the block_off. Third, is this the same as the * right contig hint. In the last case, it spills over into * the next block and should be handled by the contig area * across blocks code. */ *bits = block->contig_hint; if (*bits && block->contig_hint_start >= block_off && *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) { *bit_off = pcpu_block_off_to_off(i, block->contig_hint_start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bits = block->right_free; *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; } } /** * pcpu_next_fit_region - finds fit areas for a given allocation request * @chunk: chunk of interest * @alloc_bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * @bit_off: chunk offset * @bits: size of free area * * Finds the next free region that is viable for use with a given size and * alignment. This only returns if there is a valid area to be used for this * allocation. block->first_free is returned if the allocation request fits * within the block to see if the request can be fulfilled prior to the contig * hint. */ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, int align, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (*bits >= alloc_bits) return; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; } /* check block->contig_hint */ *bits = ALIGN(block->contig_hint_start, align) - block->contig_hint_start; /* * This uses the block offset to determine if this has been * checked in the prior iteration. */ if (block->contig_hint && block->contig_hint_start >= block_off && block->contig_hint >= *bits + alloc_bits) { int start = pcpu_next_hint(block, alloc_bits); *bits += alloc_bits + block->contig_hint_start - start; *bit_off = pcpu_block_off_to_off(i, start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, align); *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off; *bit_off = pcpu_block_off_to_off(i, *bit_off); if (*bits >= alloc_bits) return; } /* no valid offsets were found - fail condition */ *bit_off = pcpu_chunk_map_bits(chunk); } /* * Metadata free area iterators. These perform aggregation of free areas * based on the metadata blocks and return the offset @bit_off and size in * bits of the free area @bits. pcpu_for_each_fit_region only returns when * a fit is found for the allocation request. */ #define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits) + 1, \ pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits), \ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits))) /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) return kzalloc(size, gfp); else return __vmalloc(size, gfp | __GFP_ZERO); } /** * pcpu_mem_free - free memory * @ptr: memory to free * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr) { kvfree(ptr); } static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, bool move_front) { if (chunk != pcpu_reserved_chunk) { if (move_front) list_move(&chunk->list, &pcpu_chunk_lists[slot]); else list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]); } } static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) { __pcpu_chunk_move(chunk, slot, true); } /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. * * CONTEXT: * pcpu_lock. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); /* leave isolated chunks in-place */ if (chunk->isolated) return; if (oslot != nslot) __pcpu_chunk_move(chunk, nslot, oslot < nslot); } static void pcpu_isolate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (!chunk->isolated) { chunk->isolated = true; pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages; } list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]); } static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (chunk->isolated) { chunk->isolated = false; pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages; pcpu_chunk_relocate(chunk, -1); } } /* * pcpu_update_empty_pages - update empty page counters * @chunk: chunk of interest * @nr: nr of empty pages * * This is used to keep track of the empty pages now based on the premise * a md_block covers a page. The hint update functions recognize if a block * is made full or broken to calculate deltas for keeping track of free pages. */ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk && !chunk->isolated) pcpu_nr_empty_pop_pages += nr; } /* * pcpu_region_overlap - determines if two regions overlap * @a: start of first region, inclusive * @b: end of first region, exclusive * @x: start of second region, inclusive * @y: end of second region, exclusive * * This is used to determine if the hint region [a, b) overlaps with the * allocated region [x, y). */ static inline bool pcpu_region_overlap(int a, int b, int x, int y) { return (a < y) && (x < b); } /** * pcpu_block_update - updates a block given a free area * @block: block of interest * @start: start offset in block * @end: end offset in block * * Updates a block given a known free area. The region [start, end) is * expected to be the entirety of the free area within a block. Chooses * the best starting offset if the contig hints are equal. */ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) { int contig = end - start; block->first_free = min(block->first_free, start); if (start == 0) block->left_free = contig; if (end == block->nr_bits) block->right_free = contig; if (contig > block->contig_hint) { /* promote the old contig_hint to be the new scan_hint */ if (start > block->contig_hint_start) { if (block->contig_hint > block->scan_hint) { block->scan_hint_start = block->contig_hint_start; block->scan_hint = block->contig_hint; } else if (start < block->scan_hint_start) { /* * The old contig_hint == scan_hint. But, the * new contig is larger so hold the invariant * scan_hint_start < contig_hint_start. */ block->scan_hint = 0; } } else { block->scan_hint = 0; } block->contig_hint_start = start; block->contig_hint = contig; } else if (contig == block->contig_hint) { if (block->contig_hint_start && (!start || __ffs(start) > __ffs(block->contig_hint_start))) { /* start has a better alignment so use it */ block->contig_hint_start = start; if (start < block->scan_hint_start && block->contig_hint > block->scan_hint) block->scan_hint = 0; } else if (start > block->scan_hint_start || block->contig_hint > block->scan_hint) { /* * Knowing contig == contig_hint, update the scan_hint * if it is farther than or larger than the current * scan_hint. */ block->scan_hint_start = start; block->scan_hint = contig; } } else { /* * The region is smaller than the contig_hint. So only update * the scan_hint if it is larger than or equal and farther than * the current scan_hint. */ if ((start < block->contig_hint_start && (contig > block->scan_hint || (contig == block->scan_hint && start > block->scan_hint_start)))) { block->scan_hint_start = start; block->scan_hint = contig; } } } /* * pcpu_block_update_scan - update a block given a free area from a scan * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Finding the final allocation spot first goes through pcpu_find_block_fit() * to find a block that can hold the allocation and then pcpu_alloc_area() * where a scan is used. When allocations require specific alignments, * we can inadvertently create holes which will not be seen in the alloc * or free paths. * * This takes a given free area hole and updates a block as it may change the * scan_hint. We need to scan backwards to ensure we don't miss free bits * from alignment. */ static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, int bits) { int s_off = pcpu_off_to_block_off(bit_off); int e_off = s_off + bits; int s_index, l_bit; struct pcpu_block_md *block; if (e_off > PCPU_BITMAP_BLOCK_BITS) return; s_index = pcpu_off_to_block_index(bit_off); block = chunk->md_blocks + s_index; /* scan backwards in case of alignment skipping free bits */ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off); s_off = (s_off == l_bit) ? 0 : l_bit + 1; pcpu_block_update(block, s_off, e_off); } /** * pcpu_chunk_refresh_hint - updates metadata about a chunk * @chunk: chunk of interest * @full_scan: if we should scan from the beginning * * Iterates over the metadata blocks to find the largest contig area. * A full scan can be avoided on the allocation path as this is triggered * if we broke the contig_hint. In doing so, the scan_hint will be before * the contig_hint or after if the scan_hint == contig_hint. This cannot * be prevented on freeing as we want to find the largest area possibly * spanning blocks. */ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits; /* promote scan_hint to contig_hint */ if (!full_scan && chunk_md->scan_hint) { bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint; chunk_md->contig_hint_start = chunk_md->scan_hint_start; chunk_md->contig_hint = chunk_md->scan_hint; chunk_md->scan_hint = 0; } else { bit_off = chunk_md->first_free; chunk_md->contig_hint = 0; } bits = 0; pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits); } /** * pcpu_block_refresh_hint * @chunk: chunk of interest * @index: index of the metadata block * * Scans over the block beginning at first_free and updates the block * metadata accordingly. */ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); unsigned int start, end; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { start = block->scan_hint_start + block->scan_hint; block->contig_hint_start = block->scan_hint_start; block->contig_hint = block->scan_hint; block->scan_hint = 0; } else { start = block->first_free; block->contig_hint = 0; } block->right_free = 0; /* iterate over free areas and update the contig hints */ for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS) pcpu_block_update(block, start, end); } /** * pcpu_block_update_hint_alloc - update hint on allocation path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. The metadata only has to be * refreshed by a full scan iff the chunk's contig hint is broken. Block level * scans are required if the block's contig hint is broken. */ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Update s_block. */ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; /* * block->first_free must be updated if the allocation takes its place. * If the allocation breaks the contig_hint, a scan is required to * restore this hint. */ if (s_off == s_block->first_free) s_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, s_index), PCPU_BITMAP_BLOCK_BITS, s_off + bits); if (pcpu_region_overlap(s_block->scan_hint_start, s_block->scan_hint_start + s_block->scan_hint, s_off, s_off + bits)) s_block->scan_hint = 0; if (pcpu_region_overlap(s_block->contig_hint_start, s_block->contig_hint_start + s_block->contig_hint, s_off, s_off + bits)) { /* block contig hint is broken - scan to fix it */ if (!s_off) s_block->left_free = 0; pcpu_block_refresh_hint(chunk, s_index); } else { /* update left and right contig manually */ s_block->left_free = min(s_block->left_free, s_off); if (s_index == e_index) s_block->right_free = min_t(int, s_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); else s_block->right_free = 0; } /* * Update e_block. */ if (s_index != e_index) { if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; /* * When the allocation is across blocks, the end is along * the left part of the e_block. */ e_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, e_off); if (e_off == PCPU_BITMAP_BLOCK_BITS) { /* reset the block */ e_block++; } else { if (e_off > e_block->scan_hint_start) e_block->scan_hint = 0; e_block->left_free = 0; if (e_off > e_block->contig_hint_start) { /* contig hint is broken - scan to fix it */ pcpu_block_refresh_hint(chunk, e_index); } else { e_block->right_free = min_t(int, e_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); } } /* update in-between md_blocks */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->scan_hint = 0; block->contig_hint = 0; block->left_free = 0; block->right_free = 0; } } /* * If the allocation is not atomic, some blocks may not be * populated with pages, while we account it here. The number * of pages will be added back with pcpu_chunk_populated() * when populating pages. */ if (nr_empty_pages) pcpu_update_empty_pages(chunk, -nr_empty_pages); if (pcpu_region_overlap(chunk_md->scan_hint_start, chunk_md->scan_hint_start + chunk_md->scan_hint, bit_off, bit_off + bits)) chunk_md->scan_hint = 0; /* * The only time a full chunk scan is required is if the chunk * contig hint is broken. Otherwise, it means a smaller space * was used and therefore the chunk contig hint is still correct. */ if (pcpu_region_overlap(chunk_md->contig_hint_start, chunk_md->contig_hint_start + chunk_md->contig_hint, bit_off, bit_off + bits)) pcpu_chunk_refresh_hint(chunk, false); } /** * pcpu_block_update_hint_free - updates the block hints on the free path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. This avoids a blind block * refresh by making use of the block contig hints. If this fails, it scans * forward and backward to determine the extent of the free area. This is * capped at the boundary of blocks. * * A chunk update is triggered if a page becomes free, a block becomes free, * or the free spans across blocks. This tradeoff is to minimize iterating * over the block metadata to update chunk_md->contig_hint. * chunk_md->contig_hint may be off by up to a page, but it will never be more * than the available space. If the contig hint is contained in one block, it * will be accurate. */ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) { int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ int start, end; /* start and end of the whole free area */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Check if the freed area aligns with the block->contig_hint. * If it does, then the scan to find the beginning/end of the * larger free area can be avoided. * * start and end refer to beginning and end of the free area * within each their respective blocks. This is not necessarily * the entire free area as it may span blocks past the beginning * or end of the block. */ start = s_off; if (s_off == s_block->contig_hint + s_block->contig_hint_start) { start = s_block->contig_hint_start; } else { /* * Scan backwards to find the extent of the free area. * find_last_bit returns the starting bit, so if the start bit * is returned, that means there was no last bit and the * remainder of the chunk is free. */ int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), start); start = (start == l_bit) ? 0 : l_bit + 1; } end = e_off; if (e_off == e_block->contig_hint_start) end = e_block->contig_hint_start + e_block->contig_hint; else end = find_next_bit(pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, end); /* update s_block */ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; if (!start && e_off == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(s_block, start, e_off); /* freeing in the same block */ if (s_index != e_index) { /* update e_block */ if (end == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(e_block, 0, end); /* reset md_blocks in the middle */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->first_free = 0; block->scan_hint = 0; block->contig_hint_start = 0; block->contig_hint = PCPU_BITMAP_BLOCK_BITS; block->left_free = PCPU_BITMAP_BLOCK_BITS; block->right_free = PCPU_BITMAP_BLOCK_BITS; } } if (nr_empty_pages) pcpu_update_empty_pages(chunk, nr_empty_pages); /* * Refresh chunk metadata when the free makes a block free or spans * across blocks. The contig_hint may be off by up to a page, but if * the contig_hint is contained in a block, it will be accurate with * the else condition below. */ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) pcpu_chunk_refresh_hint(chunk, true); else pcpu_block_update(&chunk->chunk_md, pcpu_block_off_to_off(s_index, start), end); } /** * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of area * @next_off: return value for the next offset to start searching * * For atomic allocations, check if the backing pages are populated. * * RETURNS: * Bool if the backing pages are populated. * next_index is to skip over unpopulated blocks in pcpu_find_block_fit. */ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { unsigned int start, end; start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); start = find_next_zero_bit(chunk->populated, end, start); if (start >= end) return true; end = find_next_bit(chunk->populated, end, start + 1); *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; return false; } /** * pcpu_find_block_fit - finds the block index to start searching * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE bytes) * @pop_only: use populated regions only * * Given a chunk and an allocation spec, find the offset to begin searching * for a free region. This iterates over the bitmap metadata blocks to * find an offset that will be guaranteed to fit the requirements. It is * not quite first fit as if the allocation does not fit in the contig hint * of a block or chunk, it is skipped. This errs on the side of caution * to prevent excess iteration. Poor alignment can cause the allocator to * skip over blocks and chunks that have valid free areas. * * RETURNS: * The offset in the bitmap to begin searching. * -1 if no offset is found. */ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, size_t align, bool pop_only) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, next_off; /* * This is an optimization to prevent scanning by assuming if the * allocation cannot fit in the global hint, there is memory pressure * and creating a new chunk would happen soon. */ if (!pcpu_check_block_hint(chunk_md, alloc_bits, align)) return -1; bit_off = pcpu_next_hint(chunk_md, alloc_bits); bits = 0; pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, &next_off)) break; bit_off = next_off; bits = 0; } if (bit_off == pcpu_chunk_map_bits(chunk)) return -1; return bit_off; } /* * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off() * @map: the address to base the search on * @size: the bitmap size in bits * @start: the bitnumber to start searching at * @nr: the number of zeroed bits we're looking for * @align_mask: alignment mask for zero area * @largest_off: offset of the largest area skipped * @largest_bits: size of the largest area skipped * * The @align_mask should be one less than a power of 2. * * This is a modified version of bitmap_find_next_zero_area_off() to remember * the largest area that was skipped. This is imperfect, but in general is * good enough. The largest remembered region is the largest failed region * seen. This does not include anything we possibly skipped due to alignment. * pcpu_block_update_scan() does scan backwards to try and recover what was * lost to alignment. While this can cause scanning to miss earlier possible * free areas, smaller allocations will eventually fill those holes. */ static unsigned long pcpu_find_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned long nr, unsigned long align_mask, unsigned long *largest_off, unsigned long *largest_bits) { unsigned long index, end, i, area_off, area_bits; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index, align_mask); area_off = index; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { area_bits = i - area_off; /* remember largest unused area with best alignment */ if (area_bits > *largest_bits || (area_bits == *largest_bits && *largest_off && (!area_off || __ffs(area_off) > __ffs(*largest_off)))) { *largest_off = area_off; *largest_bits = area_bits; } start = i + 1; goto again; } return index; } /** * pcpu_alloc_area - allocates an area from a pcpu_chunk * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE) * @start: bit_off to start searching * * This function takes in a @start offset to begin searching to fit an * allocation of @alloc_bits with alignment @align. It needs to scan * the allocation map because if it fits within the block's contig hint, * @start will be block->first_free. This is an attempt to fill the * allocation prior to breaking the contig hint. The allocation and * boundary maps are updated accordingly if it confirms a valid * free area. * * RETURNS: * Allocated addr offset in @chunk on success. * -1 if no matching area is found. */ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, size_t align, int start) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; size_t align_mask = (align) ? (align - 1) : 0; unsigned long area_off = 0, area_bits = 0; int bit_off, end, oslot; lockdep_assert_held(&pcpu_lock); oslot = pcpu_chunk_slot(chunk); /* * Search to find a fit. */ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, pcpu_chunk_map_bits(chunk)); bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits, align_mask, &area_off, &area_bits); if (bit_off >= end) return -1; if (area_bits) pcpu_block_update_scan(chunk, area_off, area_bits); /* update alloc map */ bitmap_set(chunk->alloc_map, bit_off, alloc_bits); /* update boundary map */ set_bit(bit_off, chunk->bound_map); bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1); set_bit(bit_off + alloc_bits, chunk->bound_map); chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; /* update first free bit */ if (bit_off == chunk_md->first_free) chunk_md->first_free = find_next_zero_bit( chunk->alloc_map, pcpu_chunk_map_bits(chunk), bit_off + alloc_bits); pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); pcpu_chunk_relocate(chunk, oslot); return bit_off * PCPU_MIN_ALLOC_SIZE; } /** * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest * @off: addr offset into chunk * * This function determines the size of an allocation to free using * the boundary bitmap and clears the allocation map. * * RETURNS: * Number of freed bytes. */ static int pcpu_free_area(struct pcpu_chunk *chunk, int off) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, end, oslot, freed; lockdep_assert_held(&pcpu_lock); pcpu_stats_area_dealloc(chunk); oslot = pcpu_chunk_slot(chunk); bit_off = off / PCPU_MIN_ALLOC_SIZE; /* find end index */ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), bit_off + 1); bits = end - bit_off; bitmap_clear(chunk->alloc_map, bit_off, bits); freed = bits * PCPU_MIN_ALLOC_SIZE; /* update metadata */ chunk->free_bytes += freed; /* update first free bit */ chunk_md->first_free = min(chunk_md->first_free, bit_off); pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); return freed; } static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) { block->scan_hint = 0; block->contig_hint = nr_bits; block->left_free = nr_bits; block->right_free = nr_bits; block->first_free = 0; block->nr_bits = nr_bits; } static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) { struct pcpu_block_md *md_block; /* init the chunk's block */ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk)); for (md_block = chunk->md_blocks; md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); md_block++) pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); } /** * pcpu_alloc_first_chunk - creates chunks that serve the first chunk * @tmp_addr: the start of the region served * @map_size: size of the region served * * This is responsible for creating the chunks that serve the first chunk. The * base_addr is page aligned down of @tmp_addr while the region end is page * aligned up. Offsets are kept track of to determine the region served. All * this is done to appease the bitmap allocator in avoiding partial blocks. * * RETURNS: * Chunk serving the region at @tmp_addr of @map_size. */ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size) { struct pcpu_chunk *chunk; unsigned long aligned_addr; int start_offset, offset_bits, region_size, region_bits; size_t alloc_size; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; start_offset = tmp_addr - aligned_addr; region_size = ALIGN(start_offset + map_size, PAGE_SIZE); /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); INIT_LIST_HEAD(&chunk->list); chunk->base_addr = (void *)aligned_addr; chunk->start_offset = start_offset; chunk->end_offset = region_size - chunk->start_offset - map_size; chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk->alloc_map) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk->bound_map) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk->md_blocks) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); #ifdef CONFIG_MEMCG_KMEM /* first chunk is free to use */ chunk->obj_cgroups = NULL; #endif pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; chunk->nr_empty_pop_pages = chunk->nr_pages; chunk->free_bytes = map_size; if (chunk->start_offset) { /* hide the beginning of the bitmap */ offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, 0, offset_bits); set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); chunk->chunk_md.first_free = offset_bits; pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } if (chunk->end_offset) { /* hide the end of the bitmap */ offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, chunk->bound_map); set_bit(region_bits, chunk->bound_map); pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); } return chunk; } static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp); if (!chunk) return NULL; INIT_LIST_HEAD(&chunk->list); chunk->nr_pages = pcpu_unit_pages; region_bits = pcpu_chunk_map_bits(chunk); chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), gfp); if (!chunk->alloc_map) goto alloc_map_fail; chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), gfp); if (!chunk->bound_map) goto bound_map_fail; chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), gfp); if (!chunk->md_blocks) goto md_blocks_fail; #ifdef CONFIG_MEMCG_KMEM if (!mem_cgroup_kmem_disabled()) { chunk->obj_cgroups = pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * sizeof(struct obj_cgroup *), gfp); if (!chunk->obj_cgroups) goto objcg_fail; } #endif pcpu_init_md_blocks(chunk); /* init metadata */ chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; #ifdef CONFIG_MEMCG_KMEM objcg_fail: pcpu_mem_free(chunk->md_blocks); #endif md_blocks_fail: pcpu_mem_free(chunk->bound_map); bound_map_fail: pcpu_mem_free(chunk->alloc_map); alloc_map_fail: pcpu_mem_free(chunk); return NULL; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; #ifdef CONFIG_MEMCG_KMEM pcpu_mem_free(chunk->obj_cgroups); #endif pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); } /** * pcpu_chunk_populated - post-population bookkeeping * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. */ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; pcpu_nr_populated += nr; pcpu_update_empty_pages(chunk, nr); } /** * pcpu_chunk_depopulated - post-depopulation bookkeeping * @chunk: pcpu_chunk which got depopulated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been depopulated from @chunk. * Update the bookkeeping information accordingly. Must be called after * each successful depopulation. */ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; pcpu_nr_populated -= nr; pcpu_update_empty_pages(chunk, -nr); } /* * Chunk management implementation. * * To allow different implementations, chunk alloc/free and * [de]population are implemented in a separate file which is pulled * into this file and compiled together. The following functions * should be implemented. * * pcpu_populate_chunk - populate the specified range of a chunk * pcpu_depopulate_chunk - depopulate the specified range of a chunk * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk * pcpu_create_chunk - create a new chunk * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); #ifdef CONFIG_NEED_PER_CPU_KM #include "percpu-km.c" #else #include "percpu-vm.c" #endif /** * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * * This is an internal function that handles all but static allocations. * Static percpu address values should never be passed into the allocator. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { /* is it in the dynamic region (first chunk)? */ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk; /* is it in the reserved region? */ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) return pcpu_reserved_chunk; /* * The address is relative to unit0 which might be unused and * thus unmapped. Offset the address to the unit space of the * current processor before looking it up in the vmalloc * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } #ifdef CONFIG_MEMCG_KMEM static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { struct obj_cgroup *objcg; if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; objcg = current_obj_cgroup(); if (!objcg) return true; if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) return false; *objcgp = objcg; return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { if (!objcg) return; if (likely(chunk && chunk->obj_cgroups)) { obj_cgroup_get(objcg); chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, pcpu_obj_full_size(size)); rcu_read_unlock(); } else { obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); } } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { struct obj_cgroup *objcg; if (unlikely(!chunk->obj_cgroups)) return; objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; if (!objcg) return; chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); } #else /* CONFIG_MEMCG_KMEM */ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } #endif /* CONFIG_MEMCG_KMEM */ /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * @gfp: allocation flags * * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN * then no warning will be triggered on invalid or failed allocation * requests. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { gfp_t pcpu_gfp; bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; static int warn_limit = 10; struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; size_t bits, bit_align; gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; do_warn = !(gfp & __GFP_NOWARN); /* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. * An allocation may have internal fragmentation from rounding up * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. */ if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) align = PCPU_MIN_ALLOC_SIZE; size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); bits = size >> PCPU_MIN_ALLOC_SHIFT; bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n", size, align); return NULL; } if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg))) return NULL; if (!is_atomic) { /* * pcpu_balance_workfn() allocates memory under this mutex, * and it may wait for memory reclaim. Allow current task * to become OOM victim, in case of memory pressure. */ if (gfp & __GFP_NOFAIL) { mutex_lock(&pcpu_alloc_mutex); } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } } spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; err = "alloc from reserved chunk failed"; goto fail_unlock; } restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) { list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot], list) { off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { if (slot < PCPU_SLOT_FAIL_THRESHOLD) pcpu_chunk_move(chunk, 0); continue; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) { pcpu_reintegrate_chunk(chunk); goto area_found; } } } spin_unlock_irqrestore(&pcpu_lock, flags); if (is_atomic) { err = "atomic alloc failed, no space left"; goto fail; } /* No space left. Create a new chunk. */ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) { chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); } else { spin_lock_irqsave(&pcpu_lock, flags); } goto restart; area_found: pcpu_stats_area_alloc(chunk, size); spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ if (!is_atomic) { unsigned int page_end, rs, re; rs = PFN_DOWN(off); page_end = PFN_UP(off + size); for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size, gfp); trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align, chunk->base_addr, off, ptr, pcpu_obj_full_size(size), gfp); pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); if (do_warn && warn_limit) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); if (!is_atomic) dump_stack(); if (!--warn_limit) pr_info("limit reached, disable warning\n"); } if (is_atomic) { /* see the flag handling in pcpu_balance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); } else { mutex_unlock(&pcpu_alloc_mutex); } pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } /** * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @gfp: allocation flags * * Allocate zero-filled percpu area of @size bytes aligned at @align. If * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can * be called from any context but is a lot more likely to fail. If @gfp * has __GFP_NOWARN then no warning will be triggered on invalid or failed * allocation requests. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) { return pcpu_alloc(size, align, false, gfp); } EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); /** * __alloc_percpu - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). */ void __percpu *__alloc_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); /** * __alloc_reserved_percpu - allocate reserved percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * * Allocate zero-filled percpu area of @size bytes aligned at @align * from reserved percpu area if arch has set it up; otherwise, * allocation is served from the same dynamic area. Might sleep. * Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, true, GFP_KERNEL); } /** * pcpu_balance_free - manage the amount of free chunks * @empty_only: free chunks only if there are no populated pages * * If empty_only is %false, reclaim all fully free chunks regardless of the * number of populated pages. Otherwise, only reclaim chunks that have no * populated pages. * * CONTEXT: * pcpu_lock (can be dropped temporarily) */ static void pcpu_balance_free(bool empty_only) { LIST_HEAD(to_free); struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot]; struct pcpu_chunk *chunk, *next; lockdep_assert_held(&pcpu_lock); /* * There's no reason to keep around multiple unused chunks and VM * areas can be scarce. Destroy all free chunks except for one. */ list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; if (!empty_only || chunk->nr_empty_pop_pages == 0) list_move(&chunk->list, &to_free); } if (list_empty(&to_free)) return; spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &to_free, list) { unsigned int rs, re; for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); cond_resched(); } spin_lock_irq(&pcpu_lock); } /** * pcpu_balance_populated - manage the amount of populated pages * * Maintain a certain amount of populated pages to satisfy atomic allocations. * It is possible that this is called when physical memory is scarce causing * OOM killer to be triggered. We should avoid doing so until an actual * allocation causes the failure as it is possible that requests can be * serviced from already backed regions. * * CONTEXT: * pcpu_lock (can be dropped temporarily) */ static void pcpu_balance_populated(void) { /* gfp flags passed to underlying allocators */ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; struct pcpu_chunk *chunk; int slot, nr_to_pop, ret; lockdep_assert_held(&pcpu_lock); /* * Ensure there are certain number of free populated pages for * atomic allocs. Fill up from the most packed so that atomic * allocs don't increase fragmentation. If atomic allocation * failed previously, always populate the maximum amount. This * should prevent atomic allocs larger than PAGE_SIZE from keeping * failing indefinitely; however, large atomic allocs are not * something we support properly and can be highly unreliable and * inefficient. */ retry_pop: if (pcpu_atomic_alloc_failed) { nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; /* best effort anyway, don't worry about synchronization */ pcpu_atomic_alloc_failed = false; } else { nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - pcpu_nr_empty_pop_pages, 0, PCPU_EMPTY_POP_PAGES_HIGH); } for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) { unsigned int nr_unpop = 0, rs, re; if (!nr_to_pop) break; list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) { nr_unpop = chunk->nr_pages - chunk->nr_populated; if (nr_unpop) break; } if (!nr_unpop) continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) { int nr = min_t(int, re - rs, nr_to_pop); spin_unlock_irq(&pcpu_lock); ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); cond_resched(); spin_lock_irq(&pcpu_lock); if (!ret) { nr_to_pop -= nr; pcpu_chunk_populated(chunk, rs, rs + nr); } else { nr_to_pop = 0; } if (!nr_to_pop) break; } } if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */ spin_unlock_irq(&pcpu_lock); chunk = pcpu_create_chunk(gfp); cond_resched(); spin_lock_irq(&pcpu_lock); if (chunk) { pcpu_chunk_relocate(chunk, -1); goto retry_pop; } } } /** * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages * * Scan over chunks in the depopulate list and try to release unused populated * pages back to the system. Depopulated chunks are sidelined to prevent * repopulating these pages unless required. Fully free chunks are reintegrated * and freed accordingly (1 is kept around). If we drop below the empty * populated pages threshold, reintegrate the chunk if it has empty free pages. * Each chunk is scanned in the reverse order to keep populated pages close to * the beginning of the chunk. * * CONTEXT: * pcpu_lock (can be dropped temporarily) * */ static void pcpu_reclaim_populated(void) { struct pcpu_chunk *chunk; struct pcpu_block_md *block; int freed_page_start, freed_page_end; int i, end; bool reintegrate; lockdep_assert_held(&pcpu_lock); /* * Once a chunk is isolated to the to_depopulate list, the chunk is no * longer discoverable to allocations whom may populate pages. The only * other accessor is the free path which only returns area back to the * allocator not touching the populated bitmap. */ while ((chunk = list_first_entry_or_null( &pcpu_chunk_lists[pcpu_to_depopulate_slot], struct pcpu_chunk, list))) { WARN_ON(chunk->immutable); /* * Scan chunk's pages in the reverse order to keep populated * pages close to the beginning of the chunk. */ freed_page_start = chunk->nr_pages; freed_page_end = 0; reintegrate = false; for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) { /* no more work to do */ if (chunk->nr_empty_pop_pages == 0) break; /* reintegrate chunk to prevent atomic alloc failures */ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) { reintegrate = true; break; } /* * If the page is empty and populated, start or * extend the (i, end) range. If i == 0, decrease * i and perform the depopulation to cover the last * (first) page in the chunk. */ block = chunk->md_blocks + i; if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS && test_bit(i, chunk->populated)) { if (end == -1) end = i; if (i > 0) continue; i--; } /* depopulate if there is an active range */ if (end == -1) continue; spin_unlock_irq(&pcpu_lock); pcpu_depopulate_chunk(chunk, i + 1, end + 1); cond_resched(); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, i + 1, end + 1); freed_page_start = min(freed_page_start, i + 1); freed_page_end = max(freed_page_end, end + 1); /* reset the range and continue */ end = -1; } /* batch tlb flush per chunk to amortize cost */ if (freed_page_start < freed_page_end) { spin_unlock_irq(&pcpu_lock); pcpu_post_unmap_tlb_flush(chunk, freed_page_start, freed_page_end); cond_resched(); spin_lock_irq(&pcpu_lock); } if (reintegrate || chunk->free_bytes == pcpu_unit_size) pcpu_reintegrate_chunk(chunk); else list_move_tail(&chunk->list, &pcpu_chunk_lists[pcpu_sidelined_slot]); } } /** * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * For each chunk type, manage the number of fully free chunks and the number of * populated pages. An important thing to consider is when pages are freed and * how they contribute to the global counts. */ static void pcpu_balance_workfn(struct work_struct *work) { /* * pcpu_balance_free() is called twice because the first time we may * trim pages in the active pcpu_nr_empty_pop_pages which may cause us * to grow other chunks. This then gives pcpu_reclaim_populated() time * to move fully free chunks to the active list to be freed if * appropriate. */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); pcpu_balance_free(false); pcpu_reclaim_populated(); pcpu_balance_populated(); pcpu_balance_free(true); spin_unlock_irq(&pcpu_lock); mutex_unlock(&pcpu_alloc_mutex); } /** * pcpu_alloc_size - the size of the dynamic percpu area * @ptr: pointer to the dynamic percpu area * * Returns the size of the @ptr allocation. This is undefined for statically * defined percpu variables as there is no corresponding chunk->bound_map. * * RETURNS: * The size of the dynamic percpu area. * * CONTEXT: * Can be called from atomic context. */ size_t pcpu_alloc_size(void __percpu *ptr) { struct pcpu_chunk *chunk; unsigned long bit_off, end; void *addr; if (!ptr) return 0; addr = __pcpu_ptr_to_addr(ptr); /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */ chunk = pcpu_chunk_addr_search(addr); bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE; end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), bit_off + 1); return (end - bit_off) * PCPU_MIN_ALLOC_SIZE; } /** * free_percpu - free percpu area * @ptr: pointer to area to free * * Free percpu area @ptr. * * CONTEXT: * Can be called from atomic context. */ void free_percpu(void __percpu *ptr) { void *addr; struct pcpu_chunk *chunk; unsigned long flags; int size, off; bool need_balance = false; if (!ptr) return; kmemleak_free_percpu(ptr); addr = __pcpu_ptr_to_addr(ptr); chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; spin_lock_irqsave(&pcpu_lock, flags); size = pcpu_free_area(chunk, off); pcpu_memcg_free_hook(chunk, off, size); /* * If there are more than one fully free chunks, wake up grim reaper. * If the chunk is isolated, it may be in the process of being * reclaimed. Let reclaim manage cleaning up of that chunk. */ if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list) if (pos != chunk) { need_balance = true; break; } } else if (pcpu_should_reclaim_chunk(chunk)) { pcpu_isolate_chunk(chunk); need_balance = true; } trace_percpu_free_percpu(chunk->base_addr, off, ptr); spin_unlock_irqrestore(&pcpu_lock, flags); if (need_balance) pcpu_schedule_balance_work(); } EXPORT_SYMBOL_GPL(free_percpu); bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) { #ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); unsigned int cpu; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); void *va = (void *)addr; if (va >= start && va < start + static_size) { if (can_addr) { *can_addr = (unsigned long) (va - start); *can_addr += (unsigned long) per_cpu_ptr(base, get_boot_cpu_id()); } return true; } } #endif /* on UP, can't distinguish from other static vars, always false */ return false; } /** * is_kernel_percpu_address - test whether address is from static percpu area * @addr: address to test * * Test whether @addr belongs to in-kernel static percpu area. Module * static percpu areas are not considered. For those, use * is_module_percpu_address(). * * RETURNS: * %true if @addr is from in-kernel static percpu area, %false otherwise. */ bool is_kernel_percpu_address(unsigned long addr) { return __is_kernel_percpu_address(addr, NULL); } /** * per_cpu_ptr_to_phys - convert translated percpu address to physical address * @addr: the address to be converted to physical address * * Given @addr which is dereferenceable address obtained via one of * percpu access macros, this function translates it into its physical * address. The caller is responsible for ensuring @addr stays valid * until this function finishes. * * percpu allocator has special setup for the first chunk, which currently * supports either embedding in linear address space or vmalloc mapping, * and, from the second one, the backing allocator (currently either vm or * km) provides translation. * * The addr can be translated simply without checking if it falls into the * first chunk. But the current code reflects better how percpu allocator * actually works, and the verification can discover both bugs in percpu * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current * code. * * RETURNS: * The physical address for @addr. */ phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; unsigned long first_low, first_high; unsigned int cpu; /* * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. * * The address check is against full chunk sizes. pcpu_base_addr * points to the beginning of the first chunk including the * static region. Assumes good intent as the first chunk may * not be full (ie. < pcpu_unit_pages in size). */ first_low = (unsigned long)pcpu_base_addr + pcpu_unit_page_offset(pcpu_low_unit_cpu, 0); first_high = (unsigned long)pcpu_base_addr + pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages); if ((unsigned long)addr >= first_low && (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); if (addr >= start && addr < start + pcpu_unit_size) { in_first_chunk = true; break; } } } if (in_first_chunk) { if (!is_vmalloc_addr(addr)) return __pa(addr); else return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr); } else return page_to_phys(pcpu_addr_to_page(addr)) + offset_in_page(addr); } /** * pcpu_alloc_alloc_info - allocate percpu allocation info * @nr_groups: the number of groups * @nr_units: the number of units * * Allocate ai which is large enough for @nr_groups groups containing * @nr_units units. The returned ai's groups[0].cpu_map points to the * cpu_map array which is long enough for @nr_units and filled with * NR_CPUS. It's the caller's responsibility to initialize cpu_map * pointer of other groups. * * RETURNS: * Pointer to the allocated pcpu_alloc_info on success, NULL on * failure. */ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units) { struct pcpu_alloc_info *ai; size_t base_size, ai_size; void *ptr; int unit; base_size = ALIGN(struct_size(ai, groups, nr_groups), __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; ptr += base_size; ai->groups[0].cpu_map = ptr; for (unit = 0; unit < nr_units; unit++) ai->groups[0].cpu_map[unit] = NR_CPUS; ai->nr_groups = nr_groups; ai->__ai_size = PFN_ALIGN(ai_size); return ai; } /** * pcpu_free_alloc_info - free percpu allocation info * @ai: pcpu_alloc_info to free * * Free @ai which was allocated by pcpu_alloc_alloc_info(). */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { memblock_free(ai, ai->__ai_size); } /** * pcpu_dump_alloc_info - print out information about pcpu_alloc_info * @lvl: loglevel * @ai: allocation info to dump * * Print out information about @ai using loglevel @lvl. */ static void pcpu_dump_alloc_info(const char *lvl, const struct pcpu_alloc_info *ai) { int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; int alloc = 0, alloc_end = 0; int group, v; int upa, apl; /* units per alloc, allocs per line */ v = ai->nr_groups; while (v /= 10) group_width++; v = num_possible_cpus(); while (v /= 10) cpu_width++; empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; upa = ai->alloc_size / ai->unit_size; width = upa * (cpu_width + 1) + group_width + 3; apl = rounddown_pow_of_two(max(60 / width, 1)); printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", lvl, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); for (group = 0; group < ai->nr_groups; group++) { const struct pcpu_group_info *gi = &ai->groups[group]; int unit = 0, unit_end = 0; BUG_ON(gi->nr_units % upa); for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { pr_cont("\n"); printk("%spcpu-alloc: ", lvl); } pr_cont("[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) pr_cont("%0*d ", cpu_width, gi->cpu_map[unit]); else pr_cont("%s ", empty_str); } } pr_cont("\n"); } /** * pcpu_setup_first_chunk - initialize the first percpu chunk * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * percpu area. This function is to be called from arch percpu area * setup path. * * @ai contains all information necessary to initialize the first * chunk and prime the dynamic percpu allocator. * * @ai->static_size is the size of static percpu area. * * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu * static areas on architectures where the addressing model has * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * * @ai->dyn_size determines the number of bytes available for dynamic * allocation in the first chunk. The area between @ai->static_size + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE * and equal to or larger than @ai->static_size + @ai->reserved_size + * @ai->dyn_size. * * @ai->atom_size is the allocation atom size and used as alignment * for vm areas. * * @ai->alloc_size is the allocation size and always multiple of * @ai->atom_size. This is larger than @ai->atom_size if * @ai->unit_size is larger than @ai->atom_size. * * @ai->nr_groups and @ai->groups describe virtual memory layout of * percpu areas. Units which should be colocated are put into the * same group. Dynamic VM areas will be allocated according to these * groupings. If @ai->nr_groups is zero, a single group containing * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. * * The first chunk will always contain a static and a dynamic region. * However, the static region is not managed by any chunk. If the first * chunk also contains a reserved region, it is served by two chunks - * one for the reserved region and one for the dynamic region. They * share the same vm, but use offset regions in the area allocation map. * The chunk serving the dynamic region is circulated in the chunk slots * and available for dynamic allocation like any other chunk. */ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; unsigned long tmp_addr; size_t alloc_size; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ pr_emerg("failed to initialize, %s\n", #cond); \ pr_emerg("cpu_possible_mask=%*pb\n", \ cpumask_pr_args(cpu_possible_mask)); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ } \ } while (0) /* sanity checks */ PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); #endif PCPU_SETUP_BUG_ON(!base_addr); PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ alloc_size = ai->nr_groups * sizeof(group_offsets[0]); group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!group_offsets) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = ai->nr_groups * sizeof(group_sizes[0]); group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!group_sizes) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = nr_cpu_ids * sizeof(unit_map[0]); unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!unit_map) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = nr_cpu_ids * sizeof(unit_off[0]); unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!unit_off) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; pcpu_low_unit_cpu = NR_CPUS; pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; group_offsets[group] = gi->base_offset; group_sizes[group] = gi->nr_units * ai->unit_size; for (i = 0; i < gi->nr_units; i++) { cpu = gi->cpu_map[i]; if (cpu == NR_CPUS) continue; PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids); PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; /* determine low/high unit_cpu */ if (pcpu_low_unit_cpu == NR_CPUS || unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) pcpu_low_unit_cpu = cpu; if (pcpu_high_unit_cpu == NR_CPUS || unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; for_each_possible_cpu(cpu) PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); /* we're done parsing the input, undefine BUG macro and dump config */ #undef PCPU_SETUP_BUG_ON pcpu_dump_alloc_info(KERN_DEBUG, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; pcpu_group_sizes = group_sizes; pcpu_unit_map = unit_map; pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_atom_size = ai->atom_size; pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated, BITS_TO_LONGS(pcpu_unit_pages)); pcpu_stats_save_ai(ai); /* * Allocate chunk slots. The slots after the active slots are: * sidelined_slot - isolated, depopulated chunks * free_slot - fully free chunks * to_depopulate_slot - isolated, chunks to depopulate */ pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1; pcpu_free_slot = pcpu_sidelined_slot + 1; pcpu_to_depopulate_slot = pcpu_free_slot + 1; pcpu_nr_slots = pcpu_to_depopulate_slot + 1; pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]), SMP_CACHE_BYTES); if (!pcpu_chunk_lists) panic("%s: Failed to allocate %zu bytes\n", __func__, pcpu_nr_slots * sizeof(pcpu_chunk_lists[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_chunk_lists[i]); /* * The end of the static region needs to be aligned with the * minimum allocation size as this offsets the reserved and * dynamic region. The first chunk ends page aligned by * expanding the dynamic region, therefore the dynamic region * can be shrunk to compensate while still staying above the * configured sizes. */ static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE); dyn_size = ai->dyn_size - (static_size - ai->static_size); /* * Initialize first chunk: * This chunk is broken up into 3 parts: * < static | [reserved] | dynamic > * - static - there is no backing chunk because these allocations can * never be freed. * - reserved (pcpu_reserved_chunk) - exists primarily to serve * allocations from module load. * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first * chunk. */ tmp_addr = (unsigned long)base_addr + static_size; if (ai->reserved_size) pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr, ai->reserved_size); tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size); pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* include all regions of the first chunk */ pcpu_nr_populated += PFN_DOWN(size_sum); pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(base_addr); /* we're done */ pcpu_base_addr = base_addr; } #ifdef CONFIG_SMP const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; static int __init percpu_alloc_setup(char *str) { if (!str) return -EINVAL; if (0) /* nada */; #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK else if (!strcmp(str, "embed")) pcpu_chosen_fc = PCPU_FC_EMBED; #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else pr_warn("unknown allocator %s specified\n", str); return 0; } early_param("percpu_alloc", percpu_alloc_setup); /* * pcpu_embed_first_chunk() is used by the generic percpu setup. * Build it if needed by the arch config or the generic setup is going * to be used. */ #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) #define BUILD_EMBED_FIRST_CHUNK #endif /* build pcpu_page_first_chunk() iff needed by the arch config */ #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) #define BUILD_PAGE_FIRST_CHUNK #endif /* pcpu_build_alloc_info() is used by both embed and page first chunk */ #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) /** * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * * This function determines grouping of units, their mappings to cpus * and other parameters considering needed percpu size, allocation * atom size and distances between CPUs. * * Groups are always multiples of atom size and CPUs which are of * LOCAL_DISTANCE both ways are grouped together and share space for * units in the same group. The returned configuration is guaranteed * to have CPUs on different nodes on different groups and >=75% usage * of allocated virtual address space. * * RETURNS: * On success, pointer to the new allocation_info is returned. On * failure, ERR_PTR value is returned. */ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; static struct cpumask mask __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, best_upa; /* units_per_alloc */ int last_allocs, group, unit; unsigned int cpu, tcpu; struct pcpu_alloc_info *ai; unsigned int *cpu_map; /* this function may be called multiple times */ memset(group_map, 0, sizeof(group_map)); memset(group_cnt, 0, sizeof(group_cnt)); cpumask_clear(&mask); /* calculate size_sum and ensure dyn_size is enough for early alloc */ size_sum = PFN_ALIGN(static_size + reserved_size + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); dyn_size = size_sum - static_size - reserved_size; /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest * which can accommodate 4k aligned segments which are equal to * or larger than min_unit_size. */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); /* determine the maximum # of units that can fit in an allocation */ alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || (offset_in_page(alloc_size / upa))) upa--; max_upa = upa; cpumask_copy(&mask, cpu_possible_mask); /* group cpus according to their proximity */ for (group = 0; !cpumask_empty(&mask); group++) { /* pop the group's first cpu */ cpu = cpumask_first(&mask); group_map[cpu] = group; group_cnt[group]++; cpumask_clear_cpu(cpu, &mask); for_each_cpu(tcpu, &mask) { if (!cpu_distance_fn || (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE && cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) { group_map[tcpu] = group; group_cnt[group]++; cpumask_clear_cpu(tcpu, &mask); } } } nr_groups = group; /* * Wasted space is caused by a ratio imbalance of upa to group_cnt. * Expand the unit_size until we use >= 75% of the units allocated. * Related to atom_size, which could be much larger than the unit_size. */ last_allocs = INT_MAX; best_upa = 0; for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0; if (alloc_size % upa || (offset_in_page(alloc_size / upa))) continue; for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; } /* * Don't accept if wastage is over 1/3. The * greater-than comparison ensures upa==1 always * passes the following check. */ if (wasted > num_possible_cpus() / 3) continue; /* and then don't consume more memory */ if (allocs > last_allocs) break; last_allocs = allocs; best_upa = upa; } BUG_ON(!best_upa); upa = best_upa; /* allocate and fill alloc_info */ for (group = 0; group < nr_groups; group++) nr_units += roundup(group_cnt[group], upa); ai = pcpu_alloc_alloc_info(nr_groups, nr_units); if (!ai) return ERR_PTR(-ENOMEM); cpu_map = ai->groups[0].cpu_map; for (group = 0; group < nr_groups; group++) { ai->groups[group].cpu_map = cpu_map; cpu_map += roundup(group_cnt[group], upa); } ai->static_size = static_size; ai->reserved_size = reserved_size; ai->dyn_size = dyn_size; ai->unit_size = alloc_size / upa; ai->atom_size = atom_size; ai->alloc_size = alloc_size; for (group = 0, unit = 0; group < nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; /* * Initialize base_offset as if all groups are located * back-to-back. The caller should update this to * reflect actual allocation. */ gi->base_offset = unit * ai->unit_size; for_each_possible_cpu(cpu) if (group_map[cpu] == group) gi->cpu_map[gi->nr_units++] = cpu; gi->nr_units = roundup(gi->nr_units, upa); unit += gi->nr_units; } BUG_ON(unit != nr_units); return ai; } static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NUMA int node = NUMA_NO_NODE; void *ptr; if (cpu_to_nd_fn) node = cpu_to_nd_fn(cpu); if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) { ptr = memblock_alloc_from(size, align, goal); pr_info("cpu %d has no node %d or node-local memory\n", cpu, node); pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n", cpu, size, (u64)__pa(ptr)); } else { ptr = memblock_alloc_try_nid(size, align, goal, MEMBLOCK_ALLOC_ACCESSIBLE, node); pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n", cpu, size, node, (u64)__pa(ptr)); } return ptr; #else return memblock_alloc_from(size, align, goal); #endif } static void __init pcpu_fc_free(void *ptr, size_t size) { memblock_free(ptr, size); } #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ #if defined(BUILD_EMBED_FIRST_CHUNK) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated * by calling pcpu_fc_alloc and used as-is without being mapped into * vmalloc area. Allocations are always whole multiples of @atom_size * aligned to @atom_size. * * This enables the first chunk to piggy back on the linear physical * mapping which often uses larger page size. Please note that this * can result in very sparse cpu->unit mapping on NUMA machines thus * requiring large vmalloc address space. Don't use this allocator if * vmalloc space is not orders of magnitude larger than distances * between node memory addresses (ie. 32bit NUMA machines). * * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned using pcpu_fc_free. * * RETURNS: * 0 on success, -errno on failure. */ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; size_t size_sum, areas_size; unsigned long max_distance; int group, i, highest_group, rc = 0; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); areas = memblock_alloc(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; } /* allocate, copy and determine base address & max_distance */ highest_group = 0; for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; unsigned int cpu = NR_CPUS; void *ptr; for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) cpu = gi->cpu_map[i]; BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); if (!ptr) { rc = -ENOMEM; goto out_free_areas; } /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(ptr)); areas[group] = ptr; base = min(ptr, base); if (ptr > areas[highest_group]) highest_group = group; } max_distance = areas[highest_group] - base; max_distance += ai->unit_size * ai->groups[highest_group].nr_units; /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; goto out_free_areas; #endif } /* * Copy data and free unused parts. This should happen after all * allocations are complete; otherwise, we may end up with * overlapping groups. */ for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; void *ptr = areas[group]; for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { /* unused unit, free whole */ pcpu_fc_free(ptr, ai->unit_size); continue; } /* copy and return the unused part */ memcpy(ptr, __per_cpu_load, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } /* base address is now known, determine group base offsets */ for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; } pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); pcpu_setup_first_chunk(ai, base); goto out_free; out_free_areas: for (group = 0; group < ai->nr_groups; group++) if (areas[group]) pcpu_fc_free(areas[group], ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); if (areas) memblock_free(areas, areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK #include <asm/pgalloc.h> #ifndef P4D_TABLE_SIZE #define P4D_TABLE_SIZE PAGE_SIZE #endif #ifndef PUD_TABLE_SIZE #define PUD_TABLE_SIZE PAGE_SIZE #endif #ifndef PMD_TABLE_SIZE #define PMD_TABLE_SIZE PAGE_SIZE #endif #ifndef PTE_TABLE_SIZE #define PTE_TABLE_SIZE PAGE_SIZE #endif void __init __weak pcpu_populate_pte(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; pmd_t *pmd; if (pgd_none(*pgd)) { p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); if (!p4d) goto err_alloc; pgd_populate(&init_mm, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); if (!pud) goto err_alloc; p4d_populate(&init_mm, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); if (!pmd) goto err_alloc; pud_populate(&init_mm, pud, pmd); } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) { pte_t *new; new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); if (!new) goto err_alloc; pmd_populate_kernel(&init_mm, pmd, new); } return; err_alloc: panic("%s: Failed to allocate memory\n", __func__); } /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * * This is a helper to ease setting up page-remapped first percpu * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. * * RETURNS: * 0 on success, -errno on failure. */ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; int unit, i, j, rc = 0; int upa; int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); upa = ai->alloc_size/ai->unit_size; nr_g0_units = roundup(num_possible_cpus(), upa); if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) { pcpu_free_alloc_info(ai); return -EINVAL; } unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); if (!pages) panic("%s: Failed to allocate %zu bytes\n", __func__, pages_size); /* allocate pages */ j = 0; for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { void *ptr; ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(ptr)); pages[j++] = virt_to_page(ptr); } } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (rc < 0) panic("failed to map percpu area, err=%d\n", rc); flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) pcpu_fc_free(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: memblock_free(pages, pages_size); pcpu_free_alloc_info(ai); return rc; } #endif /* BUILD_PAGE_FIRST_CHUNK */ #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA /* * Generic SMP percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is * important because many archs have addressing restrictions and might * fail if the percpu area is located far away from the previous * location. As an added bonus, in non-NUMA cases, embedding is * generally a good idea TLB-wise because percpu area can piggy back * on the physical linear memory mapping which uses large page * mappings on applicable archs. */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, NULL); if (rc < 0) panic("Failed to initialize percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ #else /* CONFIG_SMP */ /* * UP percpu area setup. * * UP always uses km-based percpu allocator with identity mapping. * Static percpu variables are indistinguishable from the usual static * variables and don't require any special preparation. */ void __init setup_per_cpu_areas(void) { const size_t unit_size = roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, PERCPU_DYNAMIC_RESERVE)); struct pcpu_alloc_info *ai; void *fc; ai = pcpu_alloc_alloc_info(1, 1); fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(fc)); ai->dyn_size = unit_size; ai->unit_size = unit_size; ai->atom_size = unit_size; ai->alloc_size = unit_size; ai->groups[0].nr_units = 1; ai->groups[0].cpu_map[0] = 0; pcpu_setup_first_chunk(ai, fc); pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ /* * pcpu_nr_pages - calculate total number of populated backing pages * * This reflects the number of pages populated to back chunks. Metadata is * excluded in the number exposed in meminfo as the number of backing pages * scales with the number of cpus and can quickly outweigh the memory used for * metadata. It also keeps this calculation nice and simple. * * RETURNS: * Total number of populated backing pages in use by the allocator. */ unsigned long pcpu_nr_pages(void) { return pcpu_nr_populated * pcpu_nr_units; } /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up * and running. */ static int __init percpu_enable_async(void) { pcpu_async_enabled = true; return 0; } subsys_initcall(percpu_enable_async); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 | // SPDX-License-Identifier: GPL-2.0 /* net/atm/svc.c - ATM SVC sockets */ /* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/string.h> #include <linux/net.h> /* struct socket, struct proto_ops */ #include <linux/errno.h> /* error codes */ #include <linux/kernel.h> /* printk */ #include <linux/skbuff.h> #include <linux/wait.h> #include <linux/sched/signal.h> #include <linux/fcntl.h> /* O_NONBLOCK */ #include <linux/init.h> #include <linux/atm.h> /* ATM stuff */ #include <linux/atmsap.h> #include <linux/atmsvc.h> #include <linux/atmdev.h> #include <linux/bitops.h> #include <net/sock.h> /* for sock_no_* */ #include <linux/uaccess.h> #include <linux/export.h> #include "resources.h" #include "common.h" /* common for PVCs and SVCs */ #include "signaling.h" #include "addr.h" #ifdef CONFIG_COMPAT /* It actually takes struct sockaddr_atmsvc, not struct atm_iobuf */ #define COMPAT_ATM_ADDPARTY _IOW('a', ATMIOC_SPECIAL + 4, struct compat_atm_iobuf) #endif static int svc_create(struct net *net, struct socket *sock, int protocol, int kern); /* * Note: since all this is still nicely synchronized with the signaling demon, * there's no need to protect sleep loops with clis. If signaling is * moved into the kernel, that would change. */ static int svc_shutdown(struct socket *sock, int how) { return 0; } static void svc_disconnect(struct atm_vcc *vcc) { DEFINE_WAIT(wait); struct sk_buff *skb; struct sock *sk = sk_atm(vcc); pr_debug("%p\n", vcc); if (test_bit(ATM_VF_REGIS, &vcc->flags)) { sigd_enq(vcc, as_close, NULL, NULL, NULL); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); } /* beware - socket is still in use by atmsigd until the last as_indicate has been answered */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { atm_return(vcc, skb->truesize); pr_debug("LISTEN REL\n"); sigd_enq2(NULL, as_reject, vcc, NULL, NULL, &vcc->qos, 0); dev_kfree_skb(skb); } clear_bit(ATM_VF_REGIS, &vcc->flags); /* ... may retry later */ } static int svc_release(struct socket *sock) { struct sock *sk = sock->sk; struct atm_vcc *vcc; if (sk) { vcc = ATM_SD(sock); pr_debug("%p\n", vcc); clear_bit(ATM_VF_READY, &vcc->flags); /* * VCC pointer is used as a reference, * so we must not free it (thereby subjecting it to re-use) * before all pending connections are closed */ svc_disconnect(vcc); vcc_release(sock); } return 0; } static int svc_bind(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct sockaddr_atmsvc *addr; struct atm_vcc *vcc; int error; if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) return -EINVAL; lock_sock(sk); if (sock->state == SS_CONNECTED) { error = -EISCONN; goto out; } if (sock->state != SS_UNCONNECTED) { error = -EINVAL; goto out; } vcc = ATM_SD(sock); addr = (struct sockaddr_atmsvc *) sockaddr; if (addr->sas_family != AF_ATMSVC) { error = -EAFNOSUPPORT; goto out; } clear_bit(ATM_VF_BOUND, &vcc->flags); /* failing rebind will kill old binding */ /* @@@ check memory (de)allocation on rebind */ if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { error = -EBADFD; goto out; } vcc->local = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */ if (!sigd) { error = -EUNATCH; goto out; } if (!sk->sk_err) set_bit(ATM_VF_BOUND, &vcc->flags); error = -sk->sk_err; out: release_sock(sk); return error; } static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len, int flags) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct sockaddr_atmsvc *addr; struct atm_vcc *vcc = ATM_SD(sock); int error; pr_debug("%p\n", vcc); lock_sock(sk); if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) { error = -EINVAL; goto out; } switch (sock->state) { default: error = -EINVAL; goto out; case SS_CONNECTED: error = -EISCONN; goto out; case SS_CONNECTING: if (test_bit(ATM_VF_WAITING, &vcc->flags)) { error = -EALREADY; goto out; } sock->state = SS_UNCONNECTED; if (sk->sk_err) { error = -sk->sk_err; goto out; } break; case SS_UNCONNECTED: addr = (struct sockaddr_atmsvc *) sockaddr; if (addr->sas_family != AF_ATMSVC) { error = -EAFNOSUPPORT; goto out; } if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { error = -EBADFD; goto out; } if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS || vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) { error = -EINVAL; goto out; } if (!vcc->qos.txtp.traffic_class && !vcc->qos.rxtp.traffic_class) { error = -EINVAL; goto out; } vcc->remote = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote); if (flags & O_NONBLOCK) { sock->state = SS_CONNECTING; error = -EINPROGRESS; goto out; } error = 0; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { schedule(); if (!signal_pending(current)) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); continue; } pr_debug("*ABORT*\n"); /* * This is tricky: * Kernel ---close--> Demon * Kernel <--close--- Demon * or * Kernel ---close--> Demon * Kernel <--error--- Demon * or * Kernel ---close--> Demon * Kernel <--okay---- Demon * Kernel <--close--- Demon */ sigd_enq(vcc, as_close, NULL, NULL, NULL); while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); schedule(); } if (!sk->sk_err) while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); schedule(); } clear_bit(ATM_VF_REGIS, &vcc->flags); clear_bit(ATM_VF_RELEASED, &vcc->flags); clear_bit(ATM_VF_CLOSE, &vcc->flags); /* we're gone now but may connect later */ error = -EINTR; break; } finish_wait(sk_sleep(sk), &wait); if (error) goto out; if (!sigd) { error = -EUNATCH; goto out; } if (sk->sk_err) { error = -sk->sk_err; goto out; } } vcc->qos.txtp.max_pcr = SELECT_TOP_PCR(vcc->qos.txtp); vcc->qos.txtp.pcr = 0; vcc->qos.txtp.min_pcr = 0; error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci); if (!error) sock->state = SS_CONNECTED; else (void)svc_disconnect(vcc); out: release_sock(sk); return error; } static int svc_listen(struct socket *sock, int backlog) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int error; pr_debug("%p\n", vcc); lock_sock(sk); /* let server handle listen on unbound sockets */ if (test_bit(ATM_VF_SESSION, &vcc->flags)) { error = -EINVAL; goto out; } if (test_bit(ATM_VF_LISTEN, &vcc->flags)) { error = -EADDRINUSE; goto out; } set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { error = -EUNATCH; goto out; } set_bit(ATM_VF_LISTEN, &vcc->flags); vcc_insert_socket(sk); sk->sk_max_ack_backlog = backlog > 0 ? backlog : ATM_BACKLOG_DEFAULT; error = -sk->sk_err; out: release_sock(sk); return error; } static int svc_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; struct sk_buff *skb; struct atmsvc_msg *msg; struct atm_vcc *old_vcc = ATM_SD(sock); struct atm_vcc *new_vcc; int error; lock_sock(sk); error = svc_create(sock_net(sk), newsock, 0, kern); if (error) goto out; new_vcc = ATM_SD(newsock); pr_debug("%p -> %p\n", old_vcc, new_vcc); while (1) { DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (!(skb = skb_dequeue(&sk->sk_receive_queue)) && sigd) { if (test_bit(ATM_VF_RELEASED, &old_vcc->flags)) break; if (test_bit(ATM_VF_CLOSE, &old_vcc->flags)) { error = -sk->sk_err; break; } if (flags & O_NONBLOCK) { error = -EAGAIN; break; } release_sock(sk); schedule(); lock_sock(sk); if (signal_pending(current)) { error = -ERESTARTSYS; break; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } finish_wait(sk_sleep(sk), &wait); if (error) goto out; if (!skb) { error = -EUNATCH; goto out; } msg = (struct atmsvc_msg *)skb->data; new_vcc->qos = msg->qos; set_bit(ATM_VF_HASQOS, &new_vcc->flags); new_vcc->remote = msg->svc; new_vcc->local = msg->local; new_vcc->sap = msg->sap; error = vcc_connect(newsock, msg->pvc.sap_addr.itf, msg->pvc.sap_addr.vpi, msg->pvc.sap_addr.vci); dev_kfree_skb(skb); sk_acceptq_removed(sk); if (error) { sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL, &old_vcc->qos, error); error = error == -EAGAIN ? -EBUSY : error; goto out; } /* wait should be short, so we ignore the non-blocking flag */ set_bit(ATM_VF_WAITING, &new_vcc->flags); sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL); for (;;) { prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd) break; release_sock(sk); schedule(); lock_sock(sk); } finish_wait(sk_sleep(sk_atm(new_vcc)), &wait); if (!sigd) { error = -EUNATCH; goto out; } if (!sk_atm(new_vcc)->sk_err) break; if (sk_atm(new_vcc)->sk_err != ERESTARTSYS) { error = -sk_atm(new_vcc)->sk_err; goto out; } } newsock->state = SS_CONNECTED; out: release_sock(sk); return error; } static int svc_getname(struct socket *sock, struct sockaddr *sockaddr, int peer) { struct sockaddr_atmsvc *addr; addr = (struct sockaddr_atmsvc *) sockaddr; memcpy(addr, peer ? &ATM_SD(sock)->remote : &ATM_SD(sock)->local, sizeof(struct sockaddr_atmsvc)); return sizeof(struct sockaddr_atmsvc); } int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) { struct sock *sk = sk_atm(vcc); DEFINE_WAIT(wait); set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) { break; } schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) return -EUNATCH; return -sk->sk_err; } static int svc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int value, error = 0; lock_sock(sk); switch (optname) { case SO_ATMSAP: if (level != SOL_ATM || optlen != sizeof(struct atm_sap)) { error = -EINVAL; goto out; } if (copy_from_sockptr(&vcc->sap, optval, optlen)) { error = -EFAULT; goto out; } set_bit(ATM_VF_HASSAP, &vcc->flags); break; case SO_MULTIPOINT: if (level != SOL_ATM || optlen != sizeof(int)) { error = -EINVAL; goto out; } if (copy_from_sockptr(&value, optval, sizeof(int))) { error = -EFAULT; goto out; } if (value == 1) set_bit(ATM_VF_SESSION, &vcc->flags); else if (value == 0) clear_bit(ATM_VF_SESSION, &vcc->flags); else error = -EINVAL; break; default: error = vcc_setsockopt(sock, level, optname, optval, optlen); } out: release_sock(sk); return error; } static int svc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int error = 0, len; lock_sock(sk); if (!__SO_LEVEL_MATCH(optname, level) || optname != SO_ATMSAP) { error = vcc_getsockopt(sock, level, optname, optval, optlen); goto out; } if (get_user(len, optlen)) { error = -EFAULT; goto out; } if (len != sizeof(struct atm_sap)) { error = -EINVAL; goto out; } if (copy_to_user(optval, &ATM_SD(sock)->sap, sizeof(struct atm_sap))) { error = -EFAULT; goto out; } out: release_sock(sk); return error; } static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len, int flags) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int error; lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_addparty, NULL, NULL, (struct sockaddr_atmsvc *) sockaddr); if (flags & O_NONBLOCK) { error = -EINPROGRESS; goto out; } pr_debug("added wait queue\n"); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); error = -xchg(&sk->sk_err_soft, 0); out: release_sock(sk); return error; } static int svc_dropparty(struct socket *sock, int ep_ref) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int error; lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { error = -EUNATCH; goto out; } error = -xchg(&sk->sk_err_soft, 0); out: release_sock(sk); return error; } static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int error, ep_ref; struct sockaddr_atmsvc sa; struct atm_vcc *vcc = ATM_SD(sock); switch (cmd) { case ATM_ADDPARTY: if (!test_bit(ATM_VF_SESSION, &vcc->flags)) return -EINVAL; if (copy_from_user(&sa, (void __user *) arg, sizeof(sa))) return -EFAULT; error = svc_addparty(sock, (struct sockaddr *)&sa, sizeof(sa), 0); break; case ATM_DROPPARTY: if (!test_bit(ATM_VF_SESSION, &vcc->flags)) return -EINVAL; if (copy_from_user(&ep_ref, (void __user *) arg, sizeof(int))) return -EFAULT; error = svc_dropparty(sock, ep_ref); break; default: error = vcc_ioctl(sock, cmd, arg); } return error; } #ifdef CONFIG_COMPAT static int svc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* The definition of ATM_ADDPARTY uses the size of struct atm_iobuf. But actually it takes a struct sockaddr_atmsvc, which doesn't need compat handling. So all we have to do is fix up cmd... */ if (cmd == COMPAT_ATM_ADDPARTY) cmd = ATM_ADDPARTY; if (cmd == ATM_ADDPARTY || cmd == ATM_DROPPARTY) return svc_ioctl(sock, cmd, arg); else return vcc_compat_ioctl(sock, cmd, arg); } #endif /* CONFIG_COMPAT */ static const struct proto_ops svc_proto_ops = { .family = PF_ATMSVC, .owner = THIS_MODULE, .release = svc_release, .bind = svc_bind, .connect = svc_connect, .socketpair = sock_no_socketpair, .accept = svc_accept, .getname = svc_getname, .poll = vcc_poll, .ioctl = svc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = svc_compat_ioctl, #endif .gettstamp = sock_gettstamp, .listen = svc_listen, .shutdown = svc_shutdown, .setsockopt = svc_setsockopt, .getsockopt = svc_getsockopt, .sendmsg = vcc_sendmsg, .recvmsg = vcc_recvmsg, .mmap = sock_no_mmap, }; static int svc_create(struct net *net, struct socket *sock, int protocol, int kern) { int error; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; sock->ops = &svc_proto_ops; error = vcc_create(net, sock, protocol, AF_ATMSVC, kern); if (error) return error; ATM_SD(sock)->local.sas_family = AF_ATMSVC; ATM_SD(sock)->remote.sas_family = AF_ATMSVC; return 0; } static const struct net_proto_family svc_family_ops = { .family = PF_ATMSVC, .create = svc_create, .owner = THIS_MODULE, }; /* * Initialize the ATM SVC protocol family */ int __init atmsvc_init(void) { return sock_register(&svc_family_ops); } void atmsvc_exit(void) { sock_unregister(PF_ATMSVC); } |
11 4 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | /* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_DRM_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _DRM_TRACE_H_ #include <linux/stringify.h> #include <linux/types.h> #include <linux/tracepoint.h> struct drm_file; #undef TRACE_SYSTEM #define TRACE_SYSTEM drm #define TRACE_INCLUDE_FILE drm_trace TRACE_EVENT(drm_vblank_event, TP_PROTO(int crtc, unsigned int seq, ktime_t time, bool high_prec), TP_ARGS(crtc, seq, time, high_prec), TP_STRUCT__entry( __field(int, crtc) __field(unsigned int, seq) __field(ktime_t, time) __field(bool, high_prec) ), TP_fast_assign( __entry->crtc = crtc; __entry->seq = seq; __entry->time = time; __entry->high_prec = high_prec; ), TP_printk("crtc=%d, seq=%u, time=%lld, high-prec=%s", __entry->crtc, __entry->seq, __entry->time, __entry->high_prec ? "true" : "false") ); TRACE_EVENT(drm_vblank_event_queued, TP_PROTO(struct drm_file *file, int crtc, unsigned int seq), TP_ARGS(file, crtc, seq), TP_STRUCT__entry( __field(struct drm_file *, file) __field(int, crtc) __field(unsigned int, seq) ), TP_fast_assign( __entry->file = file; __entry->crtc = crtc; __entry->seq = seq; ), TP_printk("file=%p, crtc=%d, seq=%u", __entry->file, __entry->crtc, \ __entry->seq) ); TRACE_EVENT(drm_vblank_event_delivered, TP_PROTO(struct drm_file *file, int crtc, unsigned int seq), TP_ARGS(file, crtc, seq), TP_STRUCT__entry( __field(struct drm_file *, file) __field(int, crtc) __field(unsigned int, seq) ), TP_fast_assign( __entry->file = file; __entry->crtc = crtc; __entry->seq = seq; ), TP_printk("file=%p, crtc=%d, seq=%u", __entry->file, __entry->crtc, \ __entry->seq) ); #endif /* _DRM_TRACE_H_ */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../drivers/gpu/drm #include <trace/define_trace.h> |
14 14 159 146 14 179 182 8 1 1 1 1 8 3 5 5 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 | // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/namespace.c * Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc. */ #include <linux/ipc.h> #include <linux/msg.h> #include <linux/ipc_namespace.h> #include <linux/rcupdate.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> #include "util.h" /* * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. */ static void free_ipc(struct work_struct *unused); static DECLARE_WORK(free_ipc_work, free_ipc); static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES); } static void dec_ipc_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES); } static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; again: ucounts = inc_ipc_namespaces(user_ns); if (!ucounts) { /* * IPC namespaces are freed asynchronously, by free_ipc_work. * If frees were pending, flush_work will wait, and * return true. Fail the allocation if no frees are pending. */ if (flush_work(&free_ipc_work)) goto again; goto fail; } err = -ENOMEM; ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; err = ns_alloc_inum(&ns->ns); if (err) goto fail_free; ns->ns.ops = &ipcns_operations; refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; err = mq_init_ns(ns); if (err) goto fail_put; err = -ENOMEM; if (!setup_mq_sysctls(ns)) goto fail_put; if (!setup_ipc_sysctls(ns)) goto fail_mq; err = msg_init_ns(ns); if (err) goto fail_put; sem_init_ns(ns); shm_init_ns(ns); return ns; fail_mq: retire_mq_sysctls(ns); fail_put: put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); fail_free: kfree(ns); fail_dec: dec_ipc_namespaces(ucounts); fail: return ERR_PTR(err); } struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); return create_ipc_ns(user_ns, ns); } /* * free_ipcs - free all ipcs of one type * @ns: the namespace to remove the ipcs from * @ids: the table of ipcs to free * @free: the function called to free each individual ipc * * Called for each kind of ipc when an ipc_namespace exits. */ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)) { struct kern_ipc_perm *perm; int next_id; int total, in_use; down_write(&ids->rwsem); in_use = ids->in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; rcu_read_lock(); ipc_lock_object(perm); free(ns, perm); total++; } up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) { /* * Caller needs to wait for an RCU grace period to have passed * after making the mount point inaccessible to new accesses. */ mntput(ns->mq_mnt); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); retire_mq_sysctls(ns); retire_ipc_sysctls(ns); dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } static LLIST_HEAD(free_ipc_list); static void free_ipc(struct work_struct *unused) { struct llist_node *node = llist_del_all(&free_ipc_list); struct ipc_namespace *n, *t; llist_for_each_entry_safe(n, t, node, mnt_llist) mnt_make_shortterm(n->mq_mnt); /* Wait for any last users to have gone away. */ synchronize_rcu(); llist_for_each_entry_safe(n, t, node, mnt_llist) free_ipc_ns(n); } /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put * * If this is the last task in the namespace exiting, and * it is dropping the refcount to 0, then it can race with * a task in another ipc namespace but in a mounts namespace * which has this ipcns's mqueuefs mounted, doing some action * with one of the mqueuefs files. That can raise the refcount. * So dropping the refcount, and raising the refcount when * accessing it through the VFS, are protected with mq_lock. * * (Clearly, a task raising the refcount on its own ipc_ns * needn't take mq_lock since it can't race with the last task * in the ipcns exiting). */ void put_ipc_ns(struct ipc_namespace *ns) { if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); } } static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) { return container_of(ns, struct ipc_namespace, ns); } static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) ns = get_ipc_ns(nsproxy->ipc_ns); task_unlock(task); return ns ? &ns->ns : NULL; } static void ipcns_put(struct ns_common *ns) { return put_ipc_ns(to_ipc_ns(ns)); } static int ipcns_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct ipc_namespace *ns = to_ipc_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_ipc_ns(nsproxy->ipc_ns); nsproxy->ipc_ns = get_ipc_ns(ns); return 0; } static struct user_namespace *ipcns_owner(struct ns_common *ns) { return to_ipc_ns(ns)->user_ns; } const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, .owner = ipcns_owner, }; |
16 12 10 6 13 13 1 1 5 5 5 5 13 13 13 6 8 1 1 13 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_health.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", .len = 2, .type = XFS_DIR3_FT_DIR, }; const struct xfs_name xfs_name_dot = { .name = (const unsigned char *)".", .len = 1, .type = XFS_DIR3_FT_DIR, }; /* * Convert inode mode to directory entry filetype */ unsigned char xfs_mode_to_ftype( int mode) { switch (mode & S_IFMT) { case S_IFREG: return XFS_DIR3_FT_REG_FILE; case S_IFDIR: return XFS_DIR3_FT_DIR; case S_IFCHR: return XFS_DIR3_FT_CHRDEV; case S_IFBLK: return XFS_DIR3_FT_BLKDEV; case S_IFIFO: return XFS_DIR3_FT_FIFO; case S_IFSOCK: return XFS_DIR3_FT_SOCK; case S_IFLNK: return XFS_DIR3_FT_SYMLINK; default: return XFS_DIR3_FT_UNKNOWN; } } /* * ASCII case-insensitive (ie. A-Z) support for directories that was * used in IRIX. */ xfs_dahash_t xfs_ascii_ci_hashname( const struct xfs_name *name) { xfs_dahash_t hash; int i; for (i = 0, hash = 0; i < name->len; i++) hash = xfs_ascii_ci_xfrm(name->name[i]) ^ rol32(hash, 7); return hash; } enum xfs_dacmp xfs_ascii_ci_compname( struct xfs_da_args *args, const unsigned char *name, int len) { enum xfs_dacmp result; int i; if (args->namelen != len) return XFS_CMP_DIFFERENT; result = XFS_CMP_EXACT; for (i = 0; i < len; i++) { if (args->name[i] == name[i]) continue; if (xfs_ascii_ci_xfrm(args->name[i]) != xfs_ascii_ci_xfrm(name[i])) return XFS_CMP_DIFFERENT; result = XFS_CMP_CASE; } return result; } int xfs_da_mount( struct xfs_mount *mp) { struct xfs_da_geometry *dageo; ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry), GFP_KERNEL | __GFP_RETRY_MAYFAIL); mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { kfree(mp->m_dir_geo); kfree(mp->m_attr_geo); return -ENOMEM; } /* set up directory geometry */ dageo = mp->m_dir_geo; dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; if (xfs_has_crc(mp)) { dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr); dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr); dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr); dageo->data_entry_offset = sizeof(struct xfs_dir3_data_hdr); } else { dageo->node_hdr_size = sizeof(struct xfs_da_node_hdr); dageo->leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr); dageo->free_hdr_size = sizeof(struct xfs_dir2_free_hdr); dageo->data_entry_offset = sizeof(struct xfs_dir2_data_hdr); } dageo->leaf_max_ents = (dageo->blksize - dageo->leaf_hdr_size) / sizeof(struct xfs_dir2_leaf_entry); dageo->free_max_bests = (dageo->blksize - dageo->free_hdr_size) / sizeof(xfs_dir2_data_off_t); dageo->data_first_offset = dageo->data_entry_offset + xfs_dir2_data_entsize(mp, 1) + xfs_dir2_data_entsize(mp, 2); /* * Now we've set up the block conversion variables, we can calculate the * segment block constants using the geometry structure. */ dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >> mp->m_sb.sb_blocklog; dageo->magicpct = (dageo->blksize * 37) / 100; /* set up attribute geometry - single fsb only */ dageo = mp->m_attr_geo; dageo->blklog = mp->m_sb.sb_blocklog; dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = 1 << dageo->blklog; dageo->fsbcount = 1; dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size; dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) / (uint)sizeof(xfs_da_node_entry_t); if (xfs_has_large_extent_counts(mp)) dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE; else dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL; dageo->magicpct = (dageo->blksize * 37) / 100; return 0; } void xfs_da_unmount( struct xfs_mount *mp) { kfree(mp->m_dir_geo); kfree(mp->m_attr_geo); } /* * Return 1 if directory contains only "." and "..". */ int xfs_dir_isempty( xfs_inode_t *dp) { xfs_dir2_sf_hdr_t *sfp; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. */ return 1; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; sfp = dp->i_df.if_data; return !sfp->count; } /* * Validate a given inode number. */ int xfs_dir_ino_validate( xfs_mount_t *mp, xfs_ino_t ino) { bool ino_ok = xfs_verify_dir_ino(mp, ino); if (XFS_IS_CORRUPT(mp, !ino_ok) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); return -EFSCORRUPTED; } return 0; } /* * Initialize a directory with its "." and ".." entries. */ int xfs_dir_init( xfs_trans_t *tp, xfs_inode_t *dp, xfs_inode_t *pdp) { struct xfs_da_args *args; int error; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); if (error) return error; args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->dp = dp; args->trans = tp; error = xfs_dir2_sf_create(args, pdp->i_ino); kfree(args); return error; } /* * Enter a name in a directory, or check for available space. * If inum is 0, only the available space test is performed. */ int xfs_dir_createname( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (inum) { rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) return rval; XFS_STATS_INC(dp->i_mount, xs_dir_create); } args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; if (!inum) args->op_flags |= XFS_DA_OP_JUSTCHECK; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_addname(args); goto out_free; } rval = xfs_dir2_isblock(args, &v); if (rval) goto out_free; if (v) { rval = xfs_dir2_block_addname(args); goto out_free; } rval = xfs_dir2_isleaf(args, &v); if (rval) goto out_free; if (v) rval = xfs_dir2_leaf_addname(args); else rval = xfs_dir2_node_addname(args); out_free: kfree(args); return rval; } /* * If doing a CI lookup and case-insensitive match, dup actual name into * args.value. Return EEXIST for success (ie. name found) or an error. */ int xfs_dir_cilookup_result( struct xfs_da_args *args, const unsigned char *name, int len) { if (args->cmpresult == XFS_CMP_DIFFERENT) return -ENOENT; if (args->cmpresult != XFS_CMP_CASE || !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; args->value = kmalloc(len, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; memcpy(args->value, name, len); args->valuelen = len; return -EEXIST; } /* * Lookup a name in a directory, give back the inode number. * If ci_name is not NULL, returns the actual name in ci_name if it differs * to name, or ci_name->name is set to NULL for an exact match. */ int xfs_dir_lookup( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, xfs_ino_t *inum, /* out: inode number */ struct xfs_name *ci_name) /* out: actual name if CI match */ { struct xfs_da_args *args; int rval; bool v; int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->dp = dp; args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_OKNOENT; if (ci_name) args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_lookup(args); goto out_check_rval; } rval = xfs_dir2_isblock(args, &v); if (rval) goto out_free; if (v) { rval = xfs_dir2_block_lookup(args); goto out_check_rval; } rval = xfs_dir2_isleaf(args, &v); if (rval) goto out_free; if (v) rval = xfs_dir2_leaf_lookup(args); else rval = xfs_dir2_node_lookup(args); out_check_rval: if (rval == -EEXIST) rval = 0; if (!rval) { *inum = args->inumber; if (ci_name) { ci_name->name = args->value; ci_name->len = args->valuelen; } } out_free: xfs_iunlock(dp, lock_mode); kfree(args); return rval; } /* * Remove an entry from a directory. */ int xfs_dir_removename( struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = ino; args->dp = dp; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_removename(args); goto out_free; } rval = xfs_dir2_isblock(args, &v); if (rval) goto out_free; if (v) { rval = xfs_dir2_block_removename(args); goto out_free; } rval = xfs_dir2_isleaf(args, &v); if (rval) goto out_free; if (v) rval = xfs_dir2_leaf_removename(args); else rval = xfs_dir2_node_removename(args); out_free: kfree(args); return rval; } /* * Replace the inode number of a directory entry. */ int xfs_dir_replace( struct xfs_trans *tp, struct xfs_inode *dp, const struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_extlen_t total) /* bmap's total block count */ { struct xfs_da_args *args; int rval; bool v; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) return rval; args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; args->filetype = name->type; args->hashval = xfs_dir2_hashname(dp->i_mount, name); args->inumber = inum; args->dp = dp; args->total = total; args->whichfork = XFS_DATA_FORK; args->trans = tp; if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_replace(args); goto out_free; } rval = xfs_dir2_isblock(args, &v); if (rval) goto out_free; if (v) { rval = xfs_dir2_block_replace(args); goto out_free; } rval = xfs_dir2_isleaf(args, &v); if (rval) goto out_free; if (v) rval = xfs_dir2_leaf_replace(args); else rval = xfs_dir2_node_replace(args); out_free: kfree(args); return rval; } /* * See if this entry can be added to the directory without allocating space. */ int xfs_dir_canenter( xfs_trans_t *tp, xfs_inode_t *dp, struct xfs_name *name) /* name of entry to add */ { return xfs_dir_createname(tp, dp, name, 0, 0); } /* * Utility routines. */ /* * Add a block to the directory. * * This routine is for data and free blocks, not leaf/node blocks which are * handled by xfs_da_grow_inode. */ int xfs_dir2_grow_inode( struct xfs_da_args *args, int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ xfs_dir2_db_t *dbp) /* out: block number added */ { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; xfs_fileoff_t bno; /* directory offset of new block */ int count; /* count of filesystem blocks */ int error; trace_xfs_dir2_grow_inode(args, space); /* * Set lowest possible block in the space requested. */ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); count = args->geo->fsbcount; error = xfs_da_grow_inode_int(args, &bno, count); if (error) return error; *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno); /* * Update file's size if this is the data space and it grew. */ if (space == XFS_DIR2_DATA_SPACE) { xfs_fsize_t size; /* directory file (data) size */ size = XFS_FSB_TO_B(mp, bno + count); if (size > dp->i_disk_size) { dp->i_disk_size = size; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); } } return 0; } /* * See if the directory is a single-block form directory. */ int xfs_dir2_isblock( struct xfs_da_args *args, bool *isblock) { struct xfs_mount *mp = args->dp->i_mount; xfs_fileoff_t eof; int error; error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); if (error) return error; *isblock = false; if (XFS_FSB_TO_B(mp, eof) != args->geo->blksize) return 0; *isblock = true; if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) { xfs_da_mark_sick(args); return -EFSCORRUPTED; } return 0; } /* * See if the directory is a single-leaf form directory. */ int xfs_dir2_isleaf( struct xfs_da_args *args, bool *isleaf) { xfs_fileoff_t eof; int error; error = xfs_bmap_last_offset(args->dp, &eof, XFS_DATA_FORK); if (error) return error; *isleaf = false; if (eof != args->geo->leafblk + args->geo->fsbcount) return 0; *isleaf = true; return 0; } /* * Remove the given block from the directory. * This routine is used for data and free blocks, leaf/node are done * by xfs_da_shrink_inode. */ int xfs_dir2_shrink_inode( struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp) { xfs_fileoff_t bno; /* directory file offset */ xfs_dablk_t da; /* directory file offset */ int done; /* bunmap is finished */ struct xfs_inode *dp; int error; struct xfs_mount *mp; struct xfs_trans *tp; trace_xfs_dir2_shrink_inode(args, db); dp = args->dp; mp = dp->i_mount; tp = args->trans; da = xfs_dir2_db_to_da(args->geo, db); /* Unmap the fsblock(s). */ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0, &done); if (error) { /* * ENOSPC actually can happen if we're in a removename with no * space reservation, and the resulting block removal would * cause a bmap btree split or conversion from extents to btree. * This can only happen for un-fragmented directory blocks, * since you need to be punching out the middle of an extent. * In this case we need to leave the block in the file, and not * binval it. So the block has to be in a consistent empty * state and appropriately logged. We don't free up the buffer, * the caller can tell it hasn't happened since it got an error * back. */ return error; } ASSERT(done); /* * Invalidate the buffer from the transaction. */ xfs_trans_binval(tp, bp); /* * If it's not a data block, we're done. */ if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)) return 0; /* * If the block isn't the last one in the directory, we're done. */ if (dp->i_disk_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { /* * This can't really happen unless there's kernel corruption. */ return error; } if (db == args->geo->datablk) ASSERT(bno == 0); else ASSERT(bno > 0); /* * Set the size to the new last block. */ dp->i_disk_size = XFS_FSB_TO_B(mp, bno); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } /* Returns true if the directory entry name is valid. */ bool xfs_dir2_namecheck( const void *name, size_t length) { /* * MAXNAMELEN includes the trailing null, but (name/length) leave it * out, so use >= for the length check. */ if (length >= MAXNAMELEN) return false; /* There shouldn't be any slashes or nulls here */ return !memchr(name, '/', length) && !memchr(name, 0, length); } xfs_dahash_t xfs_dir2_hashname( struct xfs_mount *mp, const struct xfs_name *name) { if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); return xfs_da_hashname(name->name, name->len); } enum xfs_dacmp xfs_dir2_compname( struct xfs_da_args *args, const unsigned char *name, int len) { if (unlikely(xfs_has_asciici(args->dp->i_mount))) return xfs_ascii_ci_compname(args, name, len); return xfs_da_compname(args, name, len); } |
1 1 5 1 1 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 | // SPDX-License-Identifier: GPL-2.0-only /* * Authors: * (C) 2020 Alexander Aring <alex.aring@gmail.com> */ #include <linux/rpl_iptunnel.h> #include <net/dst_cache.h> #include <net/ip6_route.h> #include <net/lwtunnel.h> #include <net/ipv6.h> #include <net/rpl.h> struct rpl_iptunnel_encap { DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh); }; struct rpl_lwt { struct dst_cache cache; struct rpl_iptunnel_encap tuninfo; }; static inline struct rpl_lwt *rpl_lwt_lwtunnel(struct lwtunnel_state *lwt) { return (struct rpl_lwt *)lwt->data; } static inline struct rpl_iptunnel_encap * rpl_encap_lwtunnel(struct lwtunnel_state *lwt) { return &rpl_lwt_lwtunnel(lwt)->tuninfo; } static const struct nla_policy rpl_iptunnel_policy[RPL_IPTUNNEL_MAX + 1] = { [RPL_IPTUNNEL_SRH] = { .type = NLA_BINARY }, }; static bool rpl_validate_srh(struct net *net, struct ipv6_rpl_sr_hdr *srh, size_t seglen) { int err; if ((srh->hdrlen << 3) != seglen) return false; /* check at least one segment and seglen fit with segments_left */ if (!srh->segments_left || (srh->segments_left * sizeof(struct in6_addr)) != seglen) return false; if (srh->cmpri || srh->cmpre) return false; err = ipv6_chk_rpl_srh_loop(net, srh->rpl_segaddr, srh->segments_left); if (err) return false; if (ipv6_addr_type(&srh->rpl_segaddr[srh->segments_left - 1]) & IPV6_ADDR_MULTICAST) return false; return true; } static int rpl_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[RPL_IPTUNNEL_MAX + 1]; struct lwtunnel_state *newts; struct ipv6_rpl_sr_hdr *srh; struct rpl_lwt *rlwt; int err, srh_len; if (family != AF_INET6) return -EINVAL; err = nla_parse_nested(tb, RPL_IPTUNNEL_MAX, nla, rpl_iptunnel_policy, extack); if (err < 0) return err; if (!tb[RPL_IPTUNNEL_SRH]) return -EINVAL; srh = nla_data(tb[RPL_IPTUNNEL_SRH]); srh_len = nla_len(tb[RPL_IPTUNNEL_SRH]); if (srh_len < sizeof(*srh)) return -EINVAL; /* verify that SRH is consistent */ if (!rpl_validate_srh(net, srh, srh_len - sizeof(*srh))) return -EINVAL; newts = lwtunnel_state_alloc(srh_len + sizeof(*rlwt)); if (!newts) return -ENOMEM; rlwt = rpl_lwt_lwtunnel(newts); err = dst_cache_init(&rlwt->cache, GFP_ATOMIC); if (err) { kfree(newts); return err; } memcpy(&rlwt->tuninfo.srh, srh, srh_len); newts->type = LWTUNNEL_ENCAP_RPL; newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; *ts = newts; return 0; } static void rpl_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&rpl_lwt_lwtunnel(lwt)->cache); } static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, const struct ipv6_rpl_sr_hdr *srh) { struct ipv6_rpl_sr_hdr *isrh, *csrh; const struct ipv6hdr *oldhdr; struct ipv6hdr *hdr; unsigned char *buf; size_t hdrlen; int err; oldhdr = ipv6_hdr(skb); buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC); if (!buf) return -ENOMEM; isrh = (struct ipv6_rpl_sr_hdr *)buf; csrh = (struct ipv6_rpl_sr_hdr *)(buf + ((srh->hdrlen + 1) << 3)); memcpy(isrh, srh, sizeof(*isrh)); memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1], (srh->segments_left - 1) * 16); isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr->daddr; ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0], isrh->segments_left - 1); hdrlen = ((csrh->hdrlen + 1) << 3); err = skb_cow_head(skb, hdrlen + skb->mac_len); if (unlikely(err)) { kfree(buf); return err; } skb_pull(skb, sizeof(struct ipv6hdr)); skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(struct ipv6hdr)); skb_push(skb, sizeof(struct ipv6hdr) + hdrlen); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); memmove(hdr, oldhdr, sizeof(*hdr)); isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, csrh, hdrlen); isrh->nexthdr = hdr->nexthdr; hdr->nexthdr = NEXTHDR_ROUTING; hdr->daddr = srh->rpl_segaddr[0]; ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen); kfree(buf); return 0; } static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) { struct dst_entry *dst = skb_dst(skb); struct rpl_iptunnel_encap *tinfo; if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; tinfo = rpl_encap_lwtunnel(dst->lwtstate); return rpl_do_srh_inline(skb, rlwt, tinfo->srh); } static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct rpl_lwt *rlwt; int err; rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); err = rpl_do_srh(skb, rlwt); if (unlikely(err)) goto drop; preempt_disable(); dst = dst_cache_get(&rlwt->cache); preempt_enable(); if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.daddr = hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; dst_release(dst); goto drop; } preempt_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); preempt_enable(); } skb_dst_drop(skb); skb_dst_set(skb, dst); err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) goto drop; return dst_output(net, sk, skb); drop: kfree_skb(skb); return err; } static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct rpl_lwt *rlwt; int err; rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); err = rpl_do_srh(skb, rlwt); if (unlikely(err)) { kfree_skb(skb); return err; } preempt_disable(); dst = dst_cache_get(&rlwt->cache); preempt_enable(); if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); if (!dst->error) { preempt_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); preempt_enable(); } } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) return err; return dst_input(skb); } static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype, struct rpl_iptunnel_encap *tuninfo) { struct rpl_iptunnel_encap *data; struct nlattr *nla; int len; len = RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh); nla = nla_reserve(skb, attrtype, len); if (!nla) return -EMSGSIZE; data = nla_data(nla); memcpy(data, tuninfo->srh, len); return 0; } static int rpl_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate); if (nla_put_rpl_srh(skb, RPL_IPTUNNEL_SRH, tuninfo)) return -EMSGSIZE; return 0; } static int rpl_encap_nlsize(struct lwtunnel_state *lwtstate) { struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate); return nla_total_size(RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh)); } static int rpl_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct rpl_iptunnel_encap *a_hdr = rpl_encap_lwtunnel(a); struct rpl_iptunnel_encap *b_hdr = rpl_encap_lwtunnel(b); int len = RPL_IPTUNNEL_SRH_SIZE(a_hdr->srh); if (len != RPL_IPTUNNEL_SRH_SIZE(b_hdr->srh)) return 1; return memcmp(a_hdr, b_hdr, len); } static const struct lwtunnel_encap_ops rpl_ops = { .build_state = rpl_build_state, .destroy_state = rpl_destroy_state, .output = rpl_output, .input = rpl_input, .fill_encap = rpl_fill_encap_info, .get_encap_size = rpl_encap_nlsize, .cmp_encap = rpl_encap_cmp, .owner = THIS_MODULE, }; int __init rpl_init(void) { int err; err = lwtunnel_encap_add_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL); if (err) goto out; pr_info("RPL Segment Routing with IPv6\n"); return 0; out: return err; } void rpl_exit(void) { lwtunnel_encap_del_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL); } |
47 47 113 80 53 27 3 3 195 9 9 184 40 124 18 127 7 7 5 118 3 124 10 10 9 33 25 20 22 22 84 23 63 19 50 17 6 13 13 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * * Robust futex support started by Ingo Molnar * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * * PI-futex support started by Ingo Molnar and Thomas Gleixner * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> * Copyright (C) IBM Corporation, 2009 * Thanks to Thomas Gleixner for conceptual design and careful reviews. * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!" */ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/plist.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they * reside in the same cacheline. */ static struct { struct futex_hash_bucket *queues; unsigned long hashsize; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_queues (__futex_data.queues) #define futex_hashsize (__futex_data.hashsize) /* * Fault injections for futexes. */ #ifdef CONFIG_FAIL_FUTEX static struct { struct fault_attr attr; bool ignore_private; } fail_futex = { .attr = FAULT_ATTR_INITIALIZER, .ignore_private = false, }; static int __init setup_fail_futex(char *str) { return setup_fault_attr(&fail_futex.attr, str); } __setup("fail_futex=", setup_fail_futex); bool should_fail_futex(bool fshared) { if (fail_futex.ignore_private && !fshared) return false; return should_fail(&fail_futex.attr, 1); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_futex_debugfs(void) { umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_futex", NULL, &fail_futex.attr); if (IS_ERR(dir)) return PTR_ERR(dir); debugfs_create_bool("ignore-private", mode, dir, &fail_futex.ignore_private); return 0; } late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ #endif /* CONFIG_FAIL_FUTEX */ /** * futex_hash - Return the hash bucket in the global hash * @key: Pointer to the futex key for which the hash is calculated * * We hash on the keys returned from get_futex_key (see below) and return the * corresponding hash bucket in the global hash. */ struct futex_hash_bucket *futex_hash(union futex_key *key) { u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); return &futex_queues[hash & (futex_hashsize - 1)]; } /** * futex_setup_timer - set up the sleeping hrtimer. * @time: ptr to the given timeout value * @timeout: the hrtimer_sleeper structure to be set up * @flags: futex flags * @range_ns: optional range in ns * * Return: Initialized hrtimer_sleeper structure or NULL if no timeout * value given */ struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns) { if (!time) return NULL; hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). */ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); return timeout; } /* * Generate a machine wide unique identifier for this inode. * * This relies on u64 not wrapping in the life-time of the machine; which with * 1ns resolution means almost 585 years. * * This further relies on the fact that a well formed program will not unmap * the file while it has a (shared) futex waiting on it. This mapping will have * a file reference which pins the mount and inode. * * If for some reason an inode gets evicted and read back in again, it will get * a new sequence number and will _NOT_ match, even though it is the exact same * file. * * It is important that futex_match() will never have a false-positive, esp. * for PI futexes that can mess up the state. The above argues that false-negatives * are only possible for malformed programs. */ static u64 get_inode_sequence_number(struct inode *inode) { static atomic64_t i_seq; u64 old; /* Does the inode already have a sequence number? */ old = atomic64_read(&inode->i_sequence); if (likely(old)) return old; for (;;) { u64 new = atomic64_add_return(1, &i_seq); if (WARN_ON_ONCE(!new)) continue; old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); if (old) return old; return new; } } /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) * * Return: a negative error code or 0 * * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: * * ( inode->i_sequence, page->index, offset_within_page ) * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page; struct folio *folio; struct address_space *mapping; int err, ro = 0; bool fshared; fshared = flags & FLAGS_SHARED; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; if (unlikely((address % sizeof(u32)) != 0)) return -EINVAL; address -= key->both.offset; if (unlikely(!access_ok(uaddr, sizeof(u32)))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs * virtual address, we dont even have to find the underlying vma. * Note : We do have to check 'uaddr' is a valid user address, * but access_ok() should be faster than find_vma() */ if (!fshared) { /* * On no-MMU, shared futexes are treated as private, therefore * we must not include the current process in the key. Since * there is only one address space, the address is a unique key * on its own. */ if (IS_ENABLED(CONFIG_MMU)) key->private.mm = mm; else key->private.mm = NULL; key->private.address = address; return 0; } again: /* Ignore any VERIFY_READ mapping (futex common case) */ if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } if (err < 0) return err; else err = 0; /* * The treatment of mapping from this point on is critical. The folio * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * * Strictly speaking the folio lock is not needed in all cases being * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and * folio lock will be acquired only if it is unavoidable * * Mapping checks require the folio so it is looked up now. For * anonymous pages, it does not matter if the folio is split * in the future as the key is based on the address. For * filesystem-backed pages, the precise page is required as the * index of the page determines the key. */ folio = page_folio(page); mapping = READ_ONCE(folio->mapping); /* * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for folio->mapping. */ if (unlikely(!mapping)) { int shmem_swizzled; /* * Folio lock is required to identify which special case above * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ folio_lock(folio); shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; folio_unlock(folio); folio_put(folio); if (shmem_swizzled) goto again; return -EFAULT; } /* * Private mappings are handled in a simple way. * * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { struct inode *inode; /* * The associated futex object in this case is the inode and * the folio->mapping must be traversed. Ordinarily this should * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the * mapping->host can be safely accessed as being a valid inode. */ rcu_read_lock(); if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); folio_put(folio); goto again; } inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = folio->index + folio_page_idx(folio, page); rcu_read_unlock(); } out: folio_put(folio); return err; } /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. */ int fault_in_user_writeable(u32 __user *uaddr) { struct mm_struct *mm = current->mm; int ret; mmap_read_lock(mm); ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); return ret < 0 ? ret : 0; } /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) { struct futex_q *this; plist_for_each_entry(this, &hb->chain, list) { if (futex_match(&this->key, key)) return this; } return NULL; } int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) { int ret; pagefault_disable(); ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); return ret; } int futex_get_value_locked(u32 *dest, u32 __user *from) { int ret; pagefault_disable(); ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; } /** * wait_for_owner_exiting - Block until the owner has exited * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. */ void wait_for_owner_exiting(int ret, struct task_struct *exiting) { if (ret != -EBUSY) { WARN_ON_ONCE(exiting); return; } if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) return; mutex_lock(&exiting->futex_exit_mutex); /* * No point in doing state checking here. If the waiter got here * while the task was in exec()->exec_futex_release() then it can * have any FUTEX_STATE_* value when the waiter has acquired the * mutex. OK, if running, EXITING or DEAD if it reached exit() * already. Highly unlikely and not a problem. Just one more round * through the futex maze. */ mutex_unlock(&exiting->futex_exit_mutex); put_task_struct(exiting); } /** * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be NULL and must be held by the caller. */ void __futex_unqueue(struct futex_q *q) { struct futex_hash_bucket *hb; if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) return; lockdep_assert_held(q->lock_ptr); hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); futex_hb_waiters_dec(hb); } /* The key must be already stored in q->key. */ struct futex_hash_bucket *futex_q_lock(struct futex_q *q) __acquires(&hb->lock) { struct futex_hash_bucket *hb; hb = futex_hash(&q->key); /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is * waiting for the spinlock. This is safe as all futex_q_lock() * users end up calling futex_queue(). Similarly, for housekeeping, * decrement the counter at futex_q_unlock() when some error has * occurred and we don't end up adding the task to the list. */ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ q->lock_ptr = &hb->lock; spin_lock(&hb->lock); return hb; } void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); futex_hb_waiters_dec(hb); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; /* * The priority used to register this element is * - either the real thread-priority for the real-time threads * (i.e. threads with a priority lower than MAX_RT_PRIO) * - or MAX_RT_PRIO for non-RT threads. * Thus, all RT-threads are woken first in priority order, and * the others are woken last, in FIFO order. */ prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = current; } /** * futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must * be paired with exactly one earlier call to futex_queue(). * * Return: * - 1 - if the futex_q was still queued (and we removed unqueued it); * - 0 - if the futex_q was already removed by the waking thread */ int futex_unqueue(struct futex_q *q) { spinlock_t *lock_ptr; int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ retry: /* * q->lock_ptr can change between this read and the following spin_lock. * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and * optimizing lock_ptr out of the logic below. */ lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* * q->lock_ptr can change between reading it and * spin_lock(), causing us to take the wrong lock. This * corrects the race condition. * * Reasoning goes like this: if we have the wrong lock, * q->lock_ptr must have changed (maybe several times) * between reading it and the spin_lock(). It can * change again after the spin_lock() but only if it was * already changed before the spin_lock(). It cannot, * however, change back to the original value. Therefore * we can detect whether we acquired the correct lock. */ if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } __futex_unqueue(q); BUG_ON(q->pi_state); spin_unlock(lock_ptr); ret = 1; } return ret; } /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. */ void futex_unqueue_pi(struct futex_q *q) { /* * If the lock was not acquired (due to timeout or signal) then the * rt_waiter is removed before futex_q is. If this is observed by * an unlocker after dropping the rtmutex wait lock and before * acquiring the hash bucket lock, then the unlocker dequeues the * futex_q from the hash bucket list to guarantee consistent state * vs. userspace. Therefore the dequeue here must be conditional. */ if (!plist_node_empty(&q->list)) __futex_unqueue(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); q->pi_state = NULL; } /* Constants for the pending_op argument of handle_futex_death */ #define HANDLE_DEATH_PENDING true #define HANDLE_DEATH_LIST false /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; pid_t owner; int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) return -1; retry: if (get_user(uval, uaddr)) return -1; /* * Special case for regular (non PI) futexes. The unlock path in * user space has two race scenarios: * * 1. The unlock path releases the user space futex value and * before it can execute the futex() syscall to wake up * waiters it is killed. * * 2. A woken up waiter is killed before it can acquire the * futex in user space. * * In the second case, the wake up notification could be generated * by the unlock path in user space after setting the futex value * to zero or by the kernel after setting the OWNER_DIED bit below. * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. * * In both cases the following conditions are met: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and * trying to set the OWNER_DIED bit. If the futex value is zero, * the rest of the user space mutex state is consistent, so a woken * waiter will just take over the uncontended futex. Setting the * OWNER_DIED bit would create inconsistent state and malfunction * of the user space owner died handling. Otherwise, the OWNER_DIED * bit is already set, and the woken waiter is expected to deal with * this. */ owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); return 0; } if (owner != task_pid_vnr(curr)) return 0; /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically * via cmpxchg, and if the value had FUTEX_WAITERS * set, wake up a waiter (if any). (We have to do a * futex_wake() even if OWNER_DIED is already set - * to handle the rare but possible case of recursive * thread-death.) The rest of the cleanup is done in * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; /* * We are not holding a lock here, but we want to have * the pagefault_disable/enable() protection because * we want to handle the fault gracefully. If the * access fails we try to fault in the futex with R/W * verification via get_user_pages. get_user() above * does not guarantee R/W access. If that fails we * give up and leave the futex locked. */ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { switch (err) { case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; case -EAGAIN: cond_resched(); goto retry; default: WARN_ON_ONCE(1); return err; } } if (nval != uval) goto retry; /* * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, unsigned int *pi) { unsigned long uentry; if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; unsigned long futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: */ if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { handle_futex_death((void __user *)pending + futex_offset, curr, pip, HANDLE_DEATH_PENDING); } } #ifdef CONFIG_COMPAT static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); void __user *uaddr = compat_ptr(base + futex_offset); return uaddr; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; *entry = compat_ptr((*uentry) & ~1); *pi = (unsigned int)(*uentry) & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: */ if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); if (handle_futex_death(uaddr, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; uentry = next_uentry; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } #endif #ifdef CONFIG_FUTEX_PI /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = futex_hash(&key); /* * We can race against put_pi_state() removing itself from the * list (a waiter going away). put_pi_state() will first * decrement the reference count and then modify the list, so * its possible to see the list entry but fail this reference * acquire. * * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. */ if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); continue; } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } WARN_ON(pi_state->owner != curr); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); } #else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif static void futex_cleanup(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); tsk->robust_list = NULL; } #ifdef CONFIG_COMPAT if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); tsk->compat_robust_list = NULL; } #endif if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); } /** * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD * @tsk: task to set the state on * * Set the futex exit state of the task lockless. The futex waiter code * observes that state when a task is exiting and loops until the task has * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can * take it over. If not, the problem is pushed back to user space. If the * futex exit code did not run yet, then an already queued waiter might * block forever, but there is nothing which can be done about that. */ void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ if (tsk->futex_state == FUTEX_STATE_EXITING) mutex_unlock(&tsk->futex_exit_mutex); tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) { /* * Prevent various race issues against a concurrent incoming waiter * including live locks by forcing the waiter to block on * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ mutex_lock(&tsk->futex_exit_mutex); /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. * * This ensures that all subsequent checks of tsk->futex_state in * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with * tsk->pi_lock held. * * It guarantees also that a pi_state which was queued right before * the state change under tsk->pi_lock by a concurrent waiter must * be observed in exit_pi_state_list(). */ raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); } static void futex_cleanup_end(struct task_struct *tsk, int state) { /* * Lockless store. The only side effect is that an observer might * take another loop until it becomes visible. */ tsk->futex_state = state; /* * Drop the exit protection. This unblocks waiters which observed * FUTEX_STATE_EXITING to reevaluate the state. */ mutex_unlock(&tsk->futex_exit_mutex); } void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. */ futex_cleanup_begin(tsk); futex_cleanup(tsk); /* * Reset the state to FUTEX_STATE_OK. The task is alive and about * exec a new binary. */ futex_cleanup_end(tsk, FUTEX_STATE_OK); } void futex_exit_release(struct task_struct *tsk) { futex_cleanup_begin(tsk); futex_cleanup(tsk); futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } static int __init futex_init(void) { unsigned int futex_shift; unsigned long i; #ifdef CONFIG_BASE_SMALL futex_hashsize = 16; #else futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), futex_hashsize, 0, 0, &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; for (i = 0; i < futex_hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } return 0; } core_initcall(futex_init); |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 7 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_quota.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_error.h" #include "xfs_health.h" /* * Lock order: * * ip->i_lock * qi->qi_tree_lock * dquot->q_qlock (xfs_dqlock() and friends) * dquot->q_flush (xfs_dqflock() and friends) * qi->qi_lru_lock * * If two dquots need to be locked the order is user before group/project, * otherwise by the lowest id first, see xfs_dqlock2. */ struct kmem_cache *xfs_dqtrx_cache; static struct kmem_cache *xfs_dquot_cache; static struct lock_class_key xfs_dquot_group_class; static struct lock_class_key xfs_dquot_project_class; /* Record observations of quota corruption with the health tracking system. */ static void xfs_dquot_mark_sick( struct xfs_dquot *dqp) { struct xfs_mount *mp = dqp->q_mount; switch (dqp->q_type) { case XFS_DQTYPE_USER: xfs_fs_mark_sick(mp, XFS_SICK_FS_UQUOTA); break; case XFS_DQTYPE_GROUP: xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA); break; case XFS_DQTYPE_PROJ: xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA); break; default: ASSERT(0); break; } } /* * This is called to free all the memory associated with a dquot */ void xfs_qm_dqdestroy( struct xfs_dquot *dqp) { ASSERT(list_empty(&dqp->q_lru)); kvfree(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); kmem_cache_free(xfs_dquot_cache, dqp); } /* * If default limits are in force, push them into the dquot now. * We overwrite the dquot limits only if they are zero and this * is not the root dquot. */ void xfs_qm_adjust_dqlimits( struct xfs_dquot *dq) { struct xfs_mount *mp = dq->q_mount; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_def_quota *defq; int prealloc = 0; ASSERT(dq->q_id); defq = xfs_get_defquota(q, xfs_dquot_type(dq)); if (!dq->q_blk.softlimit) { dq->q_blk.softlimit = defq->blk.soft; prealloc = 1; } if (!dq->q_blk.hardlimit) { dq->q_blk.hardlimit = defq->blk.hard; prealloc = 1; } if (!dq->q_ino.softlimit) dq->q_ino.softlimit = defq->ino.soft; if (!dq->q_ino.hardlimit) dq->q_ino.hardlimit = defq->ino.hard; if (!dq->q_rtb.softlimit) dq->q_rtb.softlimit = defq->rtb.soft; if (!dq->q_rtb.hardlimit) dq->q_rtb.hardlimit = defq->rtb.hard; if (prealloc) xfs_dquot_set_prealloc_limits(dq); } /* Set the expiration time of a quota's grace period. */ time64_t xfs_dquot_set_timeout( struct xfs_mount *mp, time64_t timeout) { struct xfs_quotainfo *qi = mp->m_quotainfo; return clamp_t(time64_t, timeout, qi->qi_expiry_min, qi->qi_expiry_max); } /* Set the length of the default grace period. */ time64_t xfs_dquot_set_grace_period( time64_t grace) { return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX); } /* * Determine if this quota counter is over either limit and set the quota * timers as appropriate. */ static inline void xfs_qm_adjust_res_timer( struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim) { ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit); if ((res->softlimit && res->count > res->softlimit) || (res->hardlimit && res->count > res->hardlimit)) { if (res->timer == 0) res->timer = xfs_dquot_set_timeout(mp, ktime_get_real_seconds() + qlim->time); } else { res->timer = 0; } } /* * Check the limits and timers of a dquot and start or reset timers * if necessary. * This gets called even when quota enforcement is OFF, which makes our * life a little less complicated. (We just don't reject any quota * reservations in that case, when enforcement is off). * We also return 0 as the values of the timers in Q_GETQUOTA calls, when * enforcement's off. * In contrast, warnings are a little different in that they don't * 'automatically' get started when limits get exceeded. They do * get reset to zero, however, when we find the count to be under * the soft limit (they are only ever set non-zero via userspace). */ void xfs_qm_adjust_dqtimers( struct xfs_dquot *dq) { struct xfs_mount *mp = dq->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; struct xfs_def_quota *defq; ASSERT(dq->q_id); defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk); xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino); xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb); } /* * initialize a buffer full of dquots and log the whole thing */ void xfs_qm_init_dquot_blk( struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_buf *bp) { struct xfs_mount *mp = tp->t_mountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dqblk *d; xfs_dqid_t curid; unsigned int qflag; unsigned int blftype; int i; ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); switch (type) { case XFS_DQTYPE_USER: qflag = XFS_UQUOTA_CHKD; blftype = XFS_BLF_UDQUOT_BUF; break; case XFS_DQTYPE_PROJ: qflag = XFS_PQUOTA_CHKD; blftype = XFS_BLF_PDQUOT_BUF; break; case XFS_DQTYPE_GROUP: qflag = XFS_GQUOTA_CHKD; blftype = XFS_BLF_GDQUOT_BUF; break; default: ASSERT(0); return; } d = bp->b_addr; /* * ID of the first dquot in the block - id's are zero based. */ curid = id - (id % q->qi_dqperchunk); memset(d, 0, BBTOB(q->qi_dqchunklen)); for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_type = type; if (curid > 0 && xfs_has_bigtime(mp)) d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME; if (xfs_has_crc(mp)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } } xfs_trans_dquot_buf(tp, bp, blftype); /* * quotacheck uses delayed writes to update all the dquots on disk in an * efficient manner instead of logging the individual dquot changes as * they are made. However if we log the buffer allocated here and crash * after quotacheck while the logged initialisation is still in the * active region of the log, log recovery can replay the dquot buffer * initialisation over the top of the checked dquots and corrupt quota * accounting. * * To avoid this problem, quotacheck cannot log the initialised buffer. * We must still dirty the buffer and write it back before the * allocation transaction clears the log. Therefore, mark the buffer as * ordered instead of logging it directly. This is safe for quotacheck * because it detects and repairs allocated but initialized dquot blocks * in the quota inodes. */ if (!(mp->m_qflags & qflag)) xfs_trans_ordered_buf(tp, bp); else xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } /* * Initialize the dynamic speculative preallocation thresholds. The lo/hi * watermarks correspond to the soft and hard limits by default. If a soft limit * is not specified, we use 95% of the hard limit. */ void xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { uint64_t space; dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit; dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit; if (!dqp->q_prealloc_lo_wmark) { dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; do_div(dqp->q_prealloc_lo_wmark, 100); dqp->q_prealloc_lo_wmark *= 95; } space = dqp->q_prealloc_hi_wmark; do_div(space, 100); dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; } /* * Ensure that the given in-core dquot has a buffer on disk backing it, and * return the buffer locked and held. This is called when the bmapi finds a * hole. */ STATIC int xfs_dquot_disk_alloc( struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_bmbt_irec map; struct xfs_trans *tp; struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; xfs_dqtype_t qtype = xfs_dquot_type(dqp); struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); int nmaps = 1; int error; trace_xfs_dqalloc(dqp); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); if (error) return error; xfs_ilock(quotip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, quotip, 0); if (!xfs_this_quota_on(dqp->q_mount, qtype)) { /* * Return if this type of quotas is turned off while we didn't * have an inode lock */ error = -ESRCH; goto err_cancel; } error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, quotip, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto err_cancel; /* Create the block mapping. */ error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map, &nmaps); if (error) goto err_cancel; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT(nmaps == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); /* * Keep track of the blkno to save a lookup later */ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); /* now we can just get the buffer (there's nothing to read yet) */ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp); if (error) goto err_cancel; bp->b_ops = &xfs_dquot_buf_ops; /* * Make a chunk of dquots out of this buffer and log * the entire thing. */ xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* * Hold the buffer and join it to the dfops so that we'll still own * the buffer when we return to the caller. The buffer disposal on * error must be paid attention to very carefully, as it has been * broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota * code when allocating a new dquot record" in 2005, and the later * conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep * the buffer locked across the _defer_finish call. We can now do * this correctly with xfs_defer_bjoin. * * Above, we allocated a disk block for the dquot information and used * get_buf to initialize the dquot. If the _defer_finish fails, the old * transaction is gone but the new buffer is not joined or held to any * transaction, so we must _buf_relse it. * * If everything succeeds, the caller of this function is returned a * buffer that is locked and held to the transaction. The caller * is responsible for unlocking any buffer passed back, either * manually or by committing the transaction. On error, the buffer is * released and not passed back. * * Keep the quota inode ILOCKed until after the transaction commit to * maintain the atomicity of bmap/rmap updates. */ xfs_trans_bhold(tp, bp); error = xfs_trans_commit(tp); xfs_iunlock(quotip, XFS_ILOCK_EXCL); if (error) { xfs_buf_relse(bp); return error; } *bpp = bp; return 0; err_cancel: xfs_trans_cancel(tp); xfs_iunlock(quotip, XFS_ILOCK_EXCL); return error; } /* * Read in the in-core dquot's on-disk metadata and return the buffer. * Returns ENOENT to signal a hole. */ STATIC int xfs_dquot_disk_read( struct xfs_mount *mp, struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_bmbt_irec map; struct xfs_buf *bp; xfs_dqtype_t qtype = xfs_dquot_type(dqp); struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); uint lock_mode; int nmaps = 1; int error; lock_mode = xfs_ilock_data_map_shared(quotip); if (!xfs_this_quota_on(mp, qtype)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. */ xfs_iunlock(quotip, lock_mode); return -ESRCH; } /* * Find the block map; no allocations yet */ error = xfs_bmapi_read(quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, lock_mode); if (error) return error; ASSERT(nmaps == 1); ASSERT(map.br_blockcount >= 1); ASSERT(map.br_startblock != DELAYSTARTBLOCK); if (map.br_startblock == HOLESTARTBLOCK) return -ENOENT; trace_xfs_dqtobp_read(dqp); /* * store the blkno etc so that we don't have to do the * mapping all the time */ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); if (xfs_metadata_is_sick(error)) xfs_dquot_mark_sick(dqp); if (error) { ASSERT(bp == NULL); return error; } ASSERT(xfs_buf_islocked(bp)); xfs_buf_set_ref(bp, XFS_DQUOT_REF); *bpp = bp; return 0; } /* Allocate and initialize everything we need for an incore dquot. */ STATIC struct xfs_dquot * xfs_dquot_alloc( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type) { struct xfs_dquot *dqp; dqp = kmem_cache_zalloc(xfs_dquot_cache, GFP_KERNEL | __GFP_NOFAIL); dqp->q_type = type; dqp->q_id = id; dqp->q_mount = mp; INIT_LIST_HEAD(&dqp->q_lru); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; /* * Offset of dquot in the (fixed sized) dquot chunk. */ dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * sizeof(struct xfs_dqblk); /* * Because we want to use a counting completion, complete * the flush completion once to allow a single access to * the flush completion without blocking. */ init_completion(&dqp->q_flush); complete(&dqp->q_flush); /* * Make sure group quotas have a different lock class than user * quotas. */ switch (type) { case XFS_DQTYPE_USER: /* uses the default lock class */ break; case XFS_DQTYPE_GROUP: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); break; case XFS_DQTYPE_PROJ: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); break; default: ASSERT(0); break; } xfs_qm_dquot_logitem_init(dqp); XFS_STATS_INC(mp, xs_qm_dquot); return dqp; } /* Check the ondisk dquot's id and type match what the incore dquot expects. */ static bool xfs_dquot_check_type( struct xfs_dquot *dqp, struct xfs_disk_dquot *ddqp) { uint8_t ddqp_type; uint8_t dqp_type; ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK; dqp_type = xfs_dquot_type(dqp); if (be32_to_cpu(ddqp->d_id) != dqp->q_id) return false; /* * V5 filesystems always expect an exact type match. V4 filesystems * expect an exact match for user dquots and for non-root group and * project dquots. */ if (xfs_has_crc(dqp->q_mount) || dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0) return ddqp_type == dqp_type; /* * V4 filesystems support either group or project quotas, but not both * at the same time. The non-user quota file can be switched between * group and project quota uses depending on the mount options, which * means that we can encounter the other type when we try to load quota * defaults. Quotacheck will soon reset the entire quota file * (including the root dquot) anyway, but don't log scary corruption * reports to dmesg. */ return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ; } /* Copy the in-core quota fields in from the on-disk buffer. */ STATIC int xfs_dquot_from_disk( struct xfs_dquot *dqp, struct xfs_buf *bp) { struct xfs_dqblk *dqb = xfs_buf_offset(bp, dqp->q_bufoffset); struct xfs_disk_dquot *ddqp = &dqb->dd_diskdq; /* * Ensure that we got the type and ID we were looking for. * Everything else was checked by the dquot buffer verifier. */ if (!xfs_dquot_check_type(dqp, ddqp)) { xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, quota %u", __this_address, dqp->q_id); xfs_alert(bp->b_mount, "Unmount and run xfs_repair"); xfs_dquot_mark_sick(dqp); return -EFSCORRUPTED; } /* copy everything from disk dquot to the incore dquot */ dqp->q_type = ddqp->d_type; dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit); dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit); dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit); dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount); dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); /* * Reservation counters are defined as reservation plus current usage * to avoid having to add every time. */ dqp->q_blk.reserved = dqp->q_blk.count; dqp->q_ino.reserved = dqp->q_ino.count; dqp->q_rtb.reserved = dqp->q_rtb.count; /* initialize the dquot speculative prealloc thresholds */ xfs_dquot_set_prealloc_limits(dqp); return 0; } /* Copy the in-core quota fields into the on-disk buffer. */ void xfs_dquot_to_disk( struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp) { ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); ddqp->d_version = XFS_DQUOT_VERSION; ddqp->d_type = dqp->q_type; ddqp->d_id = cpu_to_be32(dqp->q_id); ddqp->d_pad0 = 0; ddqp->d_pad = 0; ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit); ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit); ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit); ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit); ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit); ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit); ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count); ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); ddqp->d_bwarns = 0; ddqp->d_iwarns = 0; ddqp->d_rtbwarns = 0; ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer); } /* * Read in the ondisk dquot using dqtobp() then copy it to an incore version, * and release the buffer immediately. If @can_alloc is true, fill any * holes in the on-disk metadata. */ static int xfs_qm_dqread( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **dqpp) { struct xfs_dquot *dqp; struct xfs_buf *bp; int error; dqp = xfs_dquot_alloc(mp, id, type); trace_xfs_dqread(dqp); /* Try to read the buffer, allocating if necessary. */ error = xfs_dquot_disk_read(mp, dqp, &bp); if (error == -ENOENT && can_alloc) error = xfs_dquot_disk_alloc(dqp, &bp); if (error) goto err; /* * At this point we should have a clean locked buffer. Copy the data * to the incore dquot and release the buffer since the incore dquot * has its own locking protocol so we needn't tie up the buffer any * further. */ ASSERT(xfs_buf_islocked(bp)); error = xfs_dquot_from_disk(dqp, bp); xfs_buf_relse(bp); if (error) goto err; *dqpp = dqp; return error; err: trace_xfs_dqread_fail(dqp); xfs_qm_dqdestroy(dqp); *dqpp = NULL; return error; } /* * Advance to the next id in the current chunk, or if at the * end of the chunk, skip ahead to first id in next allocated chunk * using the SEEK_DATA interface. */ static int xfs_dq_get_next_id( struct xfs_mount *mp, xfs_dqtype_t type, xfs_dqid_t *id) { struct xfs_inode *quotip = xfs_quota_inode(mp, type); xfs_dqid_t next_id = *id + 1; /* simple advance */ uint lock_flags; struct xfs_bmbt_irec got; struct xfs_iext_cursor cur; xfs_fsblock_t start; int error = 0; /* If we'd wrap past the max ID, stop */ if (next_id < *id) return -ENOENT; /* If new ID is within the current chunk, advancing it sufficed */ if (next_id % mp->m_quotainfo->qi_dqperchunk) { *id = next_id; return 0; } /* Nope, next_id is now past the current chunk, so find the next one */ start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk; lock_flags = xfs_ilock_data_map_shared(quotip); error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK); if (error) return error; if (xfs_iext_lookup_extent(quotip, "ip->i_df, start, &cur, &got)) { /* contiguous chunk, bump startoff for the id calculation */ if (got.br_startoff < start) got.br_startoff = start; *id = got.br_startoff * mp->m_quotainfo->qi_dqperchunk; } else { error = -ENOENT; } xfs_iunlock(quotip, lock_flags); return error; } /* * Look up the dquot in the in-core cache. If found, the dquot is returned * locked and ready to go. */ static struct xfs_dquot * xfs_qm_dqget_cache_lookup( struct xfs_mount *mp, struct xfs_quotainfo *qi, struct radix_tree_root *tree, xfs_dqid_t id) { struct xfs_dquot *dqp; restart: mutex_lock(&qi->qi_tree_lock); dqp = radix_tree_lookup(tree, id); if (!dqp) { mutex_unlock(&qi->qi_tree_lock); XFS_STATS_INC(mp, xs_qm_dqcachemisses); return NULL; } xfs_dqlock(dqp); if (dqp->q_flags & XFS_DQFLAG_FREEING) { xfs_dqunlock(dqp); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); delay(1); goto restart; } dqp->q_nrefs++; mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_hit(dqp); XFS_STATS_INC(mp, xs_qm_dqcachehits); return dqp; } /* * Try to insert a new dquot into the in-core cache. If an error occurs the * caller should throw away the dquot and start over. Otherwise, the dquot * is returned locked (and held by the cache) as if there had been a cache * hit. * * The insert needs to be done under memalloc_nofs context because the radix * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in * memory reclaim when freeing unused dquots, so we cannot have the radix tree * node allocation recursing into filesystem reclaim whilst we hold the * qi_tree_lock. */ static int xfs_qm_dqget_cache_insert( struct xfs_mount *mp, struct xfs_quotainfo *qi, struct radix_tree_root *tree, xfs_dqid_t id, struct xfs_dquot *dqp) { unsigned int nofs_flags; int error; nofs_flags = memalloc_nofs_save(); mutex_lock(&qi->qi_tree_lock); error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { /* Duplicate found! Caller must try again. */ trace_xfs_dqget_dup(dqp); goto out_unlock; } /* Return a locked dquot to the caller, with a reference taken. */ xfs_dqlock(dqp); dqp->q_nrefs = 1; qi->qi_dquots++; out_unlock: mutex_unlock(&qi->qi_tree_lock); memalloc_nofs_restore(nofs_flags); return error; } /* Check our input parameters. */ static int xfs_qm_dqget_checks( struct xfs_mount *mp, xfs_dqtype_t type) { switch (type) { case XFS_DQTYPE_USER: if (!XFS_IS_UQUOTA_ON(mp)) return -ESRCH; return 0; case XFS_DQTYPE_GROUP: if (!XFS_IS_GQUOTA_ON(mp)) return -ESRCH; return 0; case XFS_DQTYPE_PROJ: if (!XFS_IS_PQUOTA_ON(mp)) return -ESRCH; return 0; default: WARN_ON_ONCE(0); return -EINVAL; } } /* * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a * locked dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **O_dqpp) { struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; error = xfs_qm_dqget_checks(mp, type); if (error) return error; restart: dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); if (dqp) { *O_dqpp = dqp; return 0; } error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); if (error) return error; error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); if (error) { /* * Duplicate found. Just throw away the new dquot and start * over. */ xfs_qm_dqdestroy(dqp); XFS_STATS_INC(mp, xs_qm_dquot_dups); goto restart; } trace_xfs_dqget_miss(dqp); *O_dqpp = dqp; return 0; } /* * Given a dquot id and type, read and initialize a dquot from the on-disk * metadata. This function is only for use during quota initialization so * it ignores the dquot cache assuming that the dquot shrinker isn't set up. * The caller is responsible for _qm_dqdestroy'ing the returned dquot. */ int xfs_qm_dqget_uncached( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_dquot **dqpp) { int error; error = xfs_qm_dqget_checks(mp, type); if (error) return error; return xfs_qm_dqread(mp, id, type, 0, dqpp); } /* Return the quota id for a given inode and type. */ xfs_dqid_t xfs_qm_id_for_quotatype( struct xfs_inode *ip, xfs_dqtype_t type) { switch (type) { case XFS_DQTYPE_USER: return i_uid_read(VFS_I(ip)); case XFS_DQTYPE_GROUP: return i_gid_read(VFS_I(ip)); case XFS_DQTYPE_PROJ: return ip->i_projid; } ASSERT(0); return 0; } /* * Return the dquot for a given inode and type. If @can_alloc is true, then * allocate blocks if needed. The inode's ILOCK must be held and it must not * have already had an inode attached. */ int xfs_qm_dqget_inode( struct xfs_inode *ip, xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **O_dqpp) { struct xfs_mount *mp = ip->i_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; xfs_dqid_t id; int error; error = xfs_qm_dqget_checks(mp, type); if (error) return error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(xfs_inode_dquot(ip, type) == NULL); id = xfs_qm_id_for_quotatype(ip, type); restart: dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); if (dqp) { *O_dqpp = dqp; return 0; } /* * Dquot cache miss. We don't want to keep the inode lock across * a (potential) disk read. Also we don't want to deal with the lock * ordering between quotainode and this inode. OTOH, dropping the inode * lock here means dealing with a chown that can happen before * we re-acquire the lock. */ xfs_iunlock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); xfs_ilock(ip, XFS_ILOCK_EXCL); if (error) return error; /* * A dquot could be attached to this inode by now, since we had * dropped the ilock. */ if (xfs_this_quota_on(mp, type)) { struct xfs_dquot *dqp1; dqp1 = xfs_inode_dquot(ip, type); if (dqp1) { xfs_qm_dqdestroy(dqp); dqp = dqp1; xfs_dqlock(dqp); goto dqret; } } else { /* inode stays locked on return */ xfs_qm_dqdestroy(dqp); return -ESRCH; } error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); if (error) { /* * Duplicate found. Just throw away the new dquot and start * over. */ xfs_qm_dqdestroy(dqp); XFS_STATS_INC(mp, xs_qm_dquot_dups); goto restart; } dqret: xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); trace_xfs_dqget_miss(dqp); *O_dqpp = dqp; return 0; } /* * Starting at @id and progressing upwards, look for an initialized incore * dquot, lock it, and return it. */ int xfs_qm_dqget_next( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_dquot **dqpp) { struct xfs_dquot *dqp; int error = 0; *dqpp = NULL; for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) { error = xfs_qm_dqget(mp, id, type, false, &dqp); if (error == -ENOENT) continue; else if (error != 0) break; if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) { *dqpp = dqp; return 0; } xfs_qm_dqput(dqp); } return error; } /* * Release a reference to the dquot (decrement ref-count) and unlock it. * * If there is a group quota attached to this dquot, carefully release that * too without tripping over deadlocks'n'stuff. */ void xfs_qm_dqput( struct xfs_dquot *dqp) { ASSERT(dqp->q_nrefs > 0); ASSERT(XFS_DQ_IS_LOCKED(dqp)); trace_xfs_dqput(dqp); if (--dqp->q_nrefs == 0) { struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; trace_xfs_dqput_free(dqp); if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } xfs_dqunlock(dqp); } /* * Release a dquot. Flush it if dirty, then dqput() it. * dquot must not be locked. */ void xfs_qm_dqrele( struct xfs_dquot *dqp) { if (!dqp) return; trace_xfs_dqrele(dqp); xfs_dqlock(dqp); /* * We don't care to flush it if the dquot is dirty here. * That will create stutters that we want to avoid. * Instead we do a delayed write when we try to reclaim * a dirty dquot. Also xfs_sync will take part of the burden... */ xfs_qm_dqput(dqp); } /* * This is the dquot flushing I/O completion routine. It is called * from interrupt level when the buffer containing the dquot is * flushed to disk. It is responsible for removing the dquot logitem * from the AIL if it has not been re-logged, and unlocking the dquot's * flush lock. This behavior is very similar to that of inodes.. */ static void xfs_qm_dqflush_done( struct xfs_log_item *lip) { struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; struct xfs_dquot *dqp = qip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; xfs_lsn_t tail_lsn; /* * We only want to pull the item from the AIL if its * location in the log has not changed since we started the flush. * Thus, we only bother if the dquot's lsn has * not changed. First we check the lsn outside the lock * since it's cheaper, and then we recheck while * holding the lock before removing the dquot from the AIL. */ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && ((lip->li_lsn == qip->qli_flush_lsn) || test_bit(XFS_LI_FAILED, &lip->li_flags))) { spin_lock(&ailp->ail_lock); xfs_clear_li_failed(lip); if (lip->li_lsn == qip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); } else { spin_unlock(&ailp->ail_lock); } } /* * Release the dq's flush lock since we're done with it. */ xfs_dqfunlock(dqp); } void xfs_buf_dquot_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { list_del_init(&lip->li_bio_list); xfs_qm_dqflush_done(lip); } } void xfs_buf_dquot_io_fail( struct xfs_buf *bp) { struct xfs_log_item *lip; spin_lock(&bp->b_mount->m_ail->ail_lock); list_for_each_entry(lip, &bp->b_li_list, li_bio_list) xfs_set_li_failed(lip, bp); spin_unlock(&bp->b_mount->m_ail->ail_lock); } /* Check incore dquot for errors before we flush. */ static xfs_failaddr_t xfs_qm_dqflush_check( struct xfs_dquot *dqp) { xfs_dqtype_t type = xfs_dquot_type(dqp); if (type != XFS_DQTYPE_USER && type != XFS_DQTYPE_GROUP && type != XFS_DQTYPE_PROJ) return __this_address; if (dqp->q_id == 0) return NULL; if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit && !dqp->q_blk.timer) return __this_address; if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit && !dqp->q_ino.timer) return __this_address; if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit && !dqp->q_rtb.timer) return __this_address; /* bigtime flag should never be set on root dquots */ if (dqp->q_type & XFS_DQTYPE_BIGTIME) { if (!xfs_has_bigtime(dqp->q_mount)) return __this_address; if (dqp->q_id == 0) return __this_address; } return NULL; } /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. * The flush lock will not be unlocked until the dquot reaches the disk, * but the dquot is free to be unlocked and modified by the caller * in the interim. Dquot is still locked on return. This behavior is * identical to that of inodes. */ int xfs_qm_dqflush( struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_log_item *lip = &dqp->q_logitem.qli_item; struct xfs_buf *bp; struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); trace_xfs_dqflush(dqp); *bpp = NULL; xfs_qm_dqunpin_wait(dqp); /* * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, &bp, &xfs_dquot_buf_ops); if (error == -EAGAIN) goto out_unlock; if (xfs_metadata_is_sick(error)) xfs_dquot_mark_sick(dqp); if (error) goto out_abort; fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", dqp->q_id, fa); xfs_buf_relse(bp); xfs_dquot_mark_sick(dqp); error = -EFSCORRUPTED; goto out_abort; } /* Flush the incore dquot to the ondisk buffer. */ dqblk = xfs_buf_offset(bp, dqp->q_bufoffset); xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); /* * Clear the dirty field and remember the flush lsn for later use. */ dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, &dqp->q_logitem.qli_item.li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory * dquot here. This can't be done later in the write verifier as we * can't get access to the log item at that point in time. * * We also calculate the CRC here so that the on-disk dquot in the * buffer always has a valid CRC. This ensures there is no possibility * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_has_crc(mp)) { dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } /* * Attach the dquot to the buffer so that we can remove this dquot from * the AIL and release the flush lock once the dquot is synced to disk. */ bp->b_flags |= _XBF_DQUOTS; list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list); /* * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. */ if (xfs_buf_ispinned(bp)) { trace_xfs_dqflush_force(dqp); xfs_log_force(mp, 0); } trace_xfs_dqflush_done(dqp); *bpp = bp; return 0; out_abort: dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(lip, 0); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); out_unlock: xfs_dqfunlock(dqp); return error; } /* * Lock two xfs_dquot structures. * * To avoid deadlocks we always lock the quota structure with * the lowerd id first. */ void xfs_dqlock2( struct xfs_dquot *d1, struct xfs_dquot *d2) { if (d1 && d2) { ASSERT(d1 != d2); if (d1->q_id > d2->q_id) { mutex_lock(&d2->q_qlock); mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); } else { mutex_lock(&d1->q_qlock); mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); } } else if (d1) { mutex_lock(&d1->q_qlock); } else if (d2) { mutex_lock(&d2->q_qlock); } } int __init xfs_qm_init(void) { xfs_dquot_cache = kmem_cache_create("xfs_dquot", sizeof(struct xfs_dquot), 0, 0, NULL); if (!xfs_dquot_cache) goto out; xfs_dqtrx_cache = kmem_cache_create("xfs_dqtrx", sizeof(struct xfs_dquot_acct), 0, 0, NULL); if (!xfs_dqtrx_cache) goto out_free_dquot_cache; return 0; out_free_dquot_cache: kmem_cache_destroy(xfs_dquot_cache); out: return -ENOMEM; } void xfs_qm_exit(void) { kmem_cache_destroy(xfs_dqtrx_cache); kmem_cache_destroy(xfs_dquot_cache); } |
43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel Kernel API * * This file defines the kernel API for the NetLabel system. The NetLabel * system manages static and dynamic label mappings for network protocols such * as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 */ #include <linux/init.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/audit.h> #include <linux/in.h> #include <linux/in6.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <net/calipso.h> #include <asm/bug.h> #include <linux/atomic.h> #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" #include "netlabel_cipso_v4.h" #include "netlabel_calipso.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" #include "netlabel_addrlist.h" /* * Configuration Functions */ /** * netlbl_cfg_map_del - Remove a NetLabel/LSM domain mapping * @domain: the domain mapping to remove * @family: address family * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Removes a NetLabel/LSM domain mapping. A @domain value of NULL causes the * default domain mapping to be removed. Returns zero on success, negative * values on failure. * */ int netlbl_cfg_map_del(const char *domain, u16 family, const void *addr, const void *mask, struct netlbl_audit *audit_info) { if (addr == NULL && mask == NULL) { return netlbl_domhsh_remove(domain, family, audit_info); } else if (addr != NULL && mask != NULL) { switch (family) { case AF_INET: return netlbl_domhsh_remove_af4(domain, addr, mask, audit_info); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return netlbl_domhsh_remove_af6(domain, addr, mask, audit_info); #endif /* IPv6 */ default: return -EPFNOSUPPORT; } } else return -EINVAL; } /** * netlbl_cfg_unlbl_map_add - Add a new unlabeled mapping * @domain: the domain mapping to add * @family: address family * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Adds a new unlabeled NetLabel/LSM domain mapping. A @domain value of NULL * causes a new default domain mapping to be added. Returns zero on success, * negative values on failure. * */ int netlbl_cfg_unlbl_map_add(const char *domain, u16 family, const void *addr, const void *mask, struct netlbl_audit *audit_info) { int ret_val = -ENOMEM; struct netlbl_dom_map *entry; struct netlbl_domaddr_map *addrmap = NULL; struct netlbl_domaddr4_map *map4 = NULL; struct netlbl_domaddr6_map *map6 = NULL; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) return -ENOMEM; if (domain != NULL) { entry->domain = kstrdup(domain, GFP_ATOMIC); if (entry->domain == NULL) goto cfg_unlbl_map_add_failure; } entry->family = family; if (addr == NULL && mask == NULL) entry->def.type = NETLBL_NLTYPE_UNLABELED; else if (addr != NULL && mask != NULL) { addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); if (addrmap == NULL) goto cfg_unlbl_map_add_failure; INIT_LIST_HEAD(&addrmap->list4); INIT_LIST_HEAD(&addrmap->list6); switch (family) { case AF_INET: { const struct in_addr *addr4 = addr; const struct in_addr *mask4 = mask; map4 = kzalloc(sizeof(*map4), GFP_ATOMIC); if (map4 == NULL) goto cfg_unlbl_map_add_failure; map4->def.type = NETLBL_NLTYPE_UNLABELED; map4->list.addr = addr4->s_addr & mask4->s_addr; map4->list.mask = mask4->s_addr; map4->list.valid = 1; ret_val = netlbl_af4list_add(&map4->list, &addrmap->list4); if (ret_val != 0) goto cfg_unlbl_map_add_failure; break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { const struct in6_addr *addr6 = addr; const struct in6_addr *mask6 = mask; map6 = kzalloc(sizeof(*map6), GFP_ATOMIC); if (map6 == NULL) goto cfg_unlbl_map_add_failure; map6->def.type = NETLBL_NLTYPE_UNLABELED; map6->list.addr = *addr6; map6->list.addr.s6_addr32[0] &= mask6->s6_addr32[0]; map6->list.addr.s6_addr32[1] &= mask6->s6_addr32[1]; map6->list.addr.s6_addr32[2] &= mask6->s6_addr32[2]; map6->list.addr.s6_addr32[3] &= mask6->s6_addr32[3]; map6->list.mask = *mask6; map6->list.valid = 1; ret_val = netlbl_af6list_add(&map6->list, &addrmap->list6); if (ret_val != 0) goto cfg_unlbl_map_add_failure; break; } #endif /* IPv6 */ default: goto cfg_unlbl_map_add_failure; } entry->def.addrsel = addrmap; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; } else { ret_val = -EINVAL; goto cfg_unlbl_map_add_failure; } ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) goto cfg_unlbl_map_add_failure; return 0; cfg_unlbl_map_add_failure: kfree(entry->domain); kfree(entry); kfree(addrmap); kfree(map4); kfree(map6); return ret_val; } /** * netlbl_cfg_unlbl_static_add - Adds a new static label * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order (struct in[6]_addr) * @mask: address mask in network byte order (struct in[6]_addr) * @family: address family * @secid: LSM secid value for the entry * @audit_info: NetLabel audit information * * Description: * Adds a new NetLabel static label to be used when protocol provided labels * are not present on incoming traffic. If @dev_name is NULL then the default * interface will be used. Returns zero on success, negative values on failure. * */ int netlbl_cfg_unlbl_static_add(struct net *net, const char *dev_name, const void *addr, const void *mask, u16 family, u32 secid, struct netlbl_audit *audit_info) { u32 addr_len; switch (family) { case AF_INET: addr_len = sizeof(struct in_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: addr_len = sizeof(struct in6_addr); break; #endif /* IPv6 */ default: return -EPFNOSUPPORT; } return netlbl_unlhsh_add(net, dev_name, addr, mask, addr_len, secid, audit_info); } /** * netlbl_cfg_unlbl_static_del - Removes an existing static label * @net: network namespace * @dev_name: interface name * @addr: IP address in network byte order (struct in[6]_addr) * @mask: address mask in network byte order (struct in[6]_addr) * @family: address family * @audit_info: NetLabel audit information * * Description: * Removes an existing NetLabel static label used when protocol provided labels * are not present on incoming traffic. If @dev_name is NULL then the default * interface will be used. Returns zero on success, negative values on failure. * */ int netlbl_cfg_unlbl_static_del(struct net *net, const char *dev_name, const void *addr, const void *mask, u16 family, struct netlbl_audit *audit_info) { u32 addr_len; switch (family) { case AF_INET: addr_len = sizeof(struct in_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: addr_len = sizeof(struct in6_addr); break; #endif /* IPv6 */ default: return -EPFNOSUPPORT; } return netlbl_unlhsh_remove(net, dev_name, addr, mask, addr_len, audit_info); } /** * netlbl_cfg_cipsov4_add - Add a new CIPSOv4 DOI definition * @doi_def: CIPSO DOI definition * @audit_info: NetLabel audit information * * Description: * Add a new CIPSO DOI definition as defined by @doi_def. Returns zero on * success and negative values on failure. * */ int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def, struct netlbl_audit *audit_info) { return cipso_v4_doi_add(doi_def, audit_info); } /** * netlbl_cfg_cipsov4_del - Remove an existing CIPSOv4 DOI definition * @doi: CIPSO DOI * @audit_info: NetLabel audit information * * Description: * Remove an existing CIPSO DOI definition matching @doi. Returns zero on * success and negative values on failure. * */ void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info) { cipso_v4_doi_remove(doi, audit_info); } /** * netlbl_cfg_cipsov4_map_add - Add a new CIPSOv4 DOI mapping * @doi: the CIPSO DOI * @domain: the domain mapping to add * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Add a new NetLabel/LSM domain mapping for the given CIPSO DOI to the NetLabel * subsystem. A @domain value of NULL adds a new default domain mapping. * Returns zero on success, negative values on failure. * */ int netlbl_cfg_cipsov4_map_add(u32 doi, const char *domain, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info) { int ret_val = -ENOMEM; struct cipso_v4_doi *doi_def; struct netlbl_dom_map *entry; struct netlbl_domaddr_map *addrmap = NULL; struct netlbl_domaddr4_map *addrinfo = NULL; doi_def = cipso_v4_doi_getdef(doi); if (doi_def == NULL) return -ENOENT; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) goto out_entry; entry->family = AF_INET; if (domain != NULL) { entry->domain = kstrdup(domain, GFP_ATOMIC); if (entry->domain == NULL) goto out_domain; } if (addr == NULL && mask == NULL) { entry->def.cipso = doi_def; entry->def.type = NETLBL_NLTYPE_CIPSOV4; } else if (addr != NULL && mask != NULL) { addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); if (addrmap == NULL) goto out_addrmap; INIT_LIST_HEAD(&addrmap->list4); INIT_LIST_HEAD(&addrmap->list6); addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC); if (addrinfo == NULL) goto out_addrinfo; addrinfo->def.cipso = doi_def; addrinfo->def.type = NETLBL_NLTYPE_CIPSOV4; addrinfo->list.addr = addr->s_addr & mask->s_addr; addrinfo->list.mask = mask->s_addr; addrinfo->list.valid = 1; ret_val = netlbl_af4list_add(&addrinfo->list, &addrmap->list4); if (ret_val != 0) goto cfg_cipsov4_map_add_failure; entry->def.addrsel = addrmap; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; } else { ret_val = -EINVAL; goto out_addrmap; } ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) goto cfg_cipsov4_map_add_failure; return 0; cfg_cipsov4_map_add_failure: kfree(addrinfo); out_addrinfo: kfree(addrmap); out_addrmap: kfree(entry->domain); out_domain: kfree(entry); out_entry: cipso_v4_doi_putdef(doi_def); return ret_val; } /** * netlbl_cfg_calipso_add - Add a new CALIPSO DOI definition * @doi_def: CALIPSO DOI definition * @audit_info: NetLabel audit information * * Description: * Add a new CALIPSO DOI definition as defined by @doi_def. Returns zero on * success and negative values on failure. * */ int netlbl_cfg_calipso_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) { #if IS_ENABLED(CONFIG_IPV6) return calipso_doi_add(doi_def, audit_info); #else /* IPv6 */ return -ENOSYS; #endif /* IPv6 */ } /** * netlbl_cfg_calipso_del - Remove an existing CALIPSO DOI definition * @doi: CALIPSO DOI * @audit_info: NetLabel audit information * * Description: * Remove an existing CALIPSO DOI definition matching @doi. Returns zero on * success and negative values on failure. * */ void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info) { #if IS_ENABLED(CONFIG_IPV6) calipso_doi_remove(doi, audit_info); #endif /* IPv6 */ } /** * netlbl_cfg_calipso_map_add - Add a new CALIPSO DOI mapping * @doi: the CALIPSO DOI * @domain: the domain mapping to add * @addr: IP address * @mask: IP address mask * @audit_info: NetLabel audit information * * Description: * Add a new NetLabel/LSM domain mapping for the given CALIPSO DOI to the * NetLabel subsystem. A @domain value of NULL adds a new default domain * mapping. Returns zero on success, negative values on failure. * */ int netlbl_cfg_calipso_map_add(u32 doi, const char *domain, const struct in6_addr *addr, const struct in6_addr *mask, struct netlbl_audit *audit_info) { #if IS_ENABLED(CONFIG_IPV6) int ret_val = -ENOMEM; struct calipso_doi *doi_def; struct netlbl_dom_map *entry; struct netlbl_domaddr_map *addrmap = NULL; struct netlbl_domaddr6_map *addrinfo = NULL; doi_def = calipso_doi_getdef(doi); if (doi_def == NULL) return -ENOENT; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) goto out_entry; entry->family = AF_INET6; if (domain != NULL) { entry->domain = kstrdup(domain, GFP_ATOMIC); if (entry->domain == NULL) goto out_domain; } if (addr == NULL && mask == NULL) { entry->def.calipso = doi_def; entry->def.type = NETLBL_NLTYPE_CALIPSO; } else if (addr != NULL && mask != NULL) { addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); if (addrmap == NULL) goto out_addrmap; INIT_LIST_HEAD(&addrmap->list4); INIT_LIST_HEAD(&addrmap->list6); addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC); if (addrinfo == NULL) goto out_addrinfo; addrinfo->def.calipso = doi_def; addrinfo->def.type = NETLBL_NLTYPE_CALIPSO; addrinfo->list.addr = *addr; addrinfo->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; addrinfo->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; addrinfo->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; addrinfo->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; addrinfo->list.mask = *mask; addrinfo->list.valid = 1; ret_val = netlbl_af6list_add(&addrinfo->list, &addrmap->list6); if (ret_val != 0) goto cfg_calipso_map_add_failure; entry->def.addrsel = addrmap; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; } else { ret_val = -EINVAL; goto out_addrmap; } ret_val = netlbl_domhsh_add(entry, audit_info); if (ret_val != 0) goto cfg_calipso_map_add_failure; return 0; cfg_calipso_map_add_failure: kfree(addrinfo); out_addrinfo: kfree(addrmap); out_addrmap: kfree(entry->domain); out_domain: kfree(entry); out_entry: calipso_doi_putdef(doi_def); return ret_val; #else /* IPv6 */ return -ENOSYS; #endif /* IPv6 */ } /* * Security Attribute Functions */ #define _CM_F_NONE 0x00000000 #define _CM_F_ALLOC 0x00000001 #define _CM_F_WALK 0x00000002 /** * _netlbl_catmap_getnode - Get a individual node from a catmap * @catmap: pointer to the category bitmap * @offset: the requested offset * @cm_flags: catmap flags, see _CM_F_* * @gfp_flags: memory allocation flags * * Description: * Iterate through the catmap looking for the node associated with @offset. * If the _CM_F_ALLOC flag is set in @cm_flags and there is no associated node, * one will be created and inserted into the catmap. If the _CM_F_WALK flag is * set in @cm_flags and there is no associated node, the next highest node will * be returned. Returns a pointer to the node on success, NULL on failure. * */ static struct netlbl_lsm_catmap *_netlbl_catmap_getnode( struct netlbl_lsm_catmap **catmap, u32 offset, unsigned int cm_flags, gfp_t gfp_flags) { struct netlbl_lsm_catmap *iter = *catmap; struct netlbl_lsm_catmap *prev = NULL; if (iter == NULL) goto catmap_getnode_alloc; if (offset < iter->startbit) goto catmap_getnode_walk; while (iter && offset >= (iter->startbit + NETLBL_CATMAP_SIZE)) { prev = iter; iter = iter->next; } if (iter == NULL || offset < iter->startbit) goto catmap_getnode_walk; return iter; catmap_getnode_walk: if (cm_flags & _CM_F_WALK) return iter; catmap_getnode_alloc: if (!(cm_flags & _CM_F_ALLOC)) return NULL; iter = netlbl_catmap_alloc(gfp_flags); if (iter == NULL) return NULL; iter->startbit = offset & ~(NETLBL_CATMAP_SIZE - 1); if (prev == NULL) { iter->next = *catmap; *catmap = iter; } else { iter->next = prev->next; prev->next = iter; } return iter; } /** * netlbl_catmap_walk - Walk a LSM secattr catmap looking for a bit * @catmap: the category bitmap * @offset: the offset to start searching at, in bits * * Description: * This function walks a LSM secattr category bitmap starting at @offset and * returns the spot of the first set bit or -ENOENT if no bits are set. * */ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset) { struct netlbl_lsm_catmap *iter; u32 idx; u32 bit; u64 bitmap; iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0); if (iter == NULL) return -ENOENT; if (offset > iter->startbit) { offset -= iter->startbit; idx = offset / NETLBL_CATMAP_MAPSIZE; bit = offset % NETLBL_CATMAP_MAPSIZE; } else { idx = 0; bit = 0; } bitmap = iter->bitmap[idx] >> bit; for (;;) { if (bitmap != 0) { while ((bitmap & NETLBL_CATMAP_BIT) == 0) { bitmap >>= 1; bit++; } return iter->startbit + (NETLBL_CATMAP_MAPSIZE * idx) + bit; } if (++idx >= NETLBL_CATMAP_MAPCNT) { if (iter->next != NULL) { iter = iter->next; idx = 0; } else return -ENOENT; } bitmap = iter->bitmap[idx]; bit = 0; } return -ENOENT; } EXPORT_SYMBOL(netlbl_catmap_walk); /** * netlbl_catmap_walkrng - Find the end of a string of set bits * @catmap: the category bitmap * @offset: the offset to start searching at, in bits * * Description: * This function walks a LSM secattr category bitmap starting at @offset and * returns the spot of the first cleared bit or -ENOENT if the offset is past * the end of the bitmap. * */ int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset) { struct netlbl_lsm_catmap *iter; struct netlbl_lsm_catmap *prev = NULL; u32 idx; u32 bit; u64 bitmask; u64 bitmap; iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0); if (iter == NULL) return -ENOENT; if (offset > iter->startbit) { offset -= iter->startbit; idx = offset / NETLBL_CATMAP_MAPSIZE; bit = offset % NETLBL_CATMAP_MAPSIZE; } else { idx = 0; bit = 0; } bitmask = NETLBL_CATMAP_BIT << bit; for (;;) { bitmap = iter->bitmap[idx]; while (bitmask != 0 && (bitmap & bitmask) != 0) { bitmask <<= 1; bit++; } if (prev && idx == 0 && bit == 0) return prev->startbit + NETLBL_CATMAP_SIZE - 1; else if (bitmask != 0) return iter->startbit + (NETLBL_CATMAP_MAPSIZE * idx) + bit - 1; else if (++idx >= NETLBL_CATMAP_MAPCNT) { if (iter->next == NULL) return iter->startbit + NETLBL_CATMAP_SIZE - 1; prev = iter; iter = iter->next; idx = 0; } bitmask = NETLBL_CATMAP_BIT; bit = 0; } return -ENOENT; } /** * netlbl_catmap_getlong - Export an unsigned long bitmap * @catmap: pointer to the category bitmap * @offset: pointer to the requested offset * @bitmap: the exported bitmap * * Description: * Export a bitmap with an offset greater than or equal to @offset and return * it in @bitmap. The @offset must be aligned to an unsigned long and will be * updated on return if different from what was requested; if the catmap is * empty at the requested offset and beyond, the @offset is set to (u32)-1. * Returns zero on success, negative values on failure. * */ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, u32 *offset, unsigned long *bitmap) { struct netlbl_lsm_catmap *iter; u32 off = *offset; u32 idx; /* only allow aligned offsets */ if ((off & (BITS_PER_LONG - 1)) != 0) return -EINVAL; /* a null catmap is equivalent to an empty one */ if (!catmap) { *offset = (u32)-1; return 0; } if (off < catmap->startbit) { off = catmap->startbit; *offset = off; } iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_WALK, 0); if (iter == NULL) { *offset = (u32)-1; return 0; } if (off < iter->startbit) { *offset = iter->startbit; off = 0; } else off -= iter->startbit; idx = off / NETLBL_CATMAP_MAPSIZE; *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_MAPSIZE); return 0; } /** * netlbl_catmap_setbit - Set a bit in a LSM secattr catmap * @catmap: pointer to the category bitmap * @bit: the bit to set * @flags: memory allocation flags * * Description: * Set the bit specified by @bit in @catmap. Returns zero on success, * negative values on failure. * */ int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap, u32 bit, gfp_t flags) { struct netlbl_lsm_catmap *iter; u32 idx; iter = _netlbl_catmap_getnode(catmap, bit, _CM_F_ALLOC, flags); if (iter == NULL) return -ENOMEM; bit -= iter->startbit; idx = bit / NETLBL_CATMAP_MAPSIZE; iter->bitmap[idx] |= NETLBL_CATMAP_BIT << (bit % NETLBL_CATMAP_MAPSIZE); return 0; } EXPORT_SYMBOL(netlbl_catmap_setbit); /** * netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap * @catmap: pointer to the category bitmap * @start: the starting bit * @end: the last bit in the string * @flags: memory allocation flags * * Description: * Set a range of bits, starting at @start and ending with @end. Returns zero * on success, negative values on failure. * */ int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap, u32 start, u32 end, gfp_t flags) { int rc = 0; u32 spot = start; while (rc == 0 && spot <= end) { if (((spot & (BITS_PER_LONG - 1)) == 0) && ((end - spot) > BITS_PER_LONG)) { rc = netlbl_catmap_setlong(catmap, spot, (unsigned long)-1, flags); spot += BITS_PER_LONG; } else rc = netlbl_catmap_setbit(catmap, spot++, flags); } return rc; } /** * netlbl_catmap_setlong - Import an unsigned long bitmap * @catmap: pointer to the category bitmap * @offset: offset to the start of the imported bitmap * @bitmap: the bitmap to import * @flags: memory allocation flags * * Description: * Import the bitmap specified in @bitmap into @catmap, using the offset * in @offset. The offset must be aligned to an unsigned long. Returns zero * on success, negative values on failure. * */ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, u32 offset, unsigned long bitmap, gfp_t flags) { struct netlbl_lsm_catmap *iter; u32 idx; /* only allow aligned offsets */ if ((offset & (BITS_PER_LONG - 1)) != 0) return -EINVAL; iter = _netlbl_catmap_getnode(catmap, offset, _CM_F_ALLOC, flags); if (iter == NULL) return -ENOMEM; offset -= iter->startbit; idx = offset / NETLBL_CATMAP_MAPSIZE; iter->bitmap[idx] |= (u64)bitmap << (offset % NETLBL_CATMAP_MAPSIZE); return 0; } /* Bitmap functions */ /** * netlbl_bitmap_walk - Walk a bitmap looking for a bit * @bitmap: the bitmap * @bitmap_len: length in bits * @offset: starting offset * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit * * Description: * Starting at @offset, walk the bitmap from left to right until either the * desired bit is found or we reach the end. Return the bit offset, -1 if * not found. */ int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, u32 offset, u8 state) { u32 bit_spot; u32 byte_offset; unsigned char bitmask; unsigned char byte; if (offset >= bitmap_len) return -1; byte_offset = offset / 8; byte = bitmap[byte_offset]; bit_spot = offset; bitmask = 0x80 >> (offset % 8); while (bit_spot < bitmap_len) { if ((state && (byte & bitmask) == bitmask) || (state == 0 && (byte & bitmask) == 0)) return bit_spot; if (++bit_spot >= bitmap_len) return -1; bitmask >>= 1; if (bitmask == 0) { byte = bitmap[++byte_offset]; bitmask = 0x80; } } return -1; } EXPORT_SYMBOL(netlbl_bitmap_walk); /** * netlbl_bitmap_setbit - Sets a single bit in a bitmap * @bitmap: the bitmap * @bit: the bit * @state: if non-zero, set the bit (1) else clear the bit (0) * * Description: * Set a single bit in the bitmask. Returns zero on success, negative values * on error. */ void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state) { u32 byte_spot; u8 bitmask; /* gcc always rounds to zero when doing integer division */ byte_spot = bit / 8; bitmask = 0x80 >> (bit % 8); if (state) bitmap[byte_spot] |= bitmask; else bitmap[byte_spot] &= ~bitmask; } EXPORT_SYMBOL(netlbl_bitmap_setbit); /* * LSM Functions */ /** * netlbl_enabled - Determine if the NetLabel subsystem is enabled * * Description: * The LSM can use this function to determine if it should use NetLabel * security attributes in it's enforcement mechanism. Currently, NetLabel is * considered to be enabled when it's configuration contains a valid setup for * at least one labeled protocol (i.e. NetLabel can understand incoming * labeled packets of at least one type); otherwise NetLabel is considered to * be disabled. * */ int netlbl_enabled(void) { /* At some point we probably want to expose this mechanism to the user * as well so that admins can toggle NetLabel regardless of the * configuration */ return (atomic_read(&netlabel_mgmt_protocount) > 0); } /** * netlbl_sock_setattr - Label a socket using the correct protocol * @sk: the socket to label * @family: protocol family * @secattr: the security attributes * @sk_locked: true if caller holds the socket lock * * Description: * Attach the correct label to the given socket using the security attributes * specified in @secattr. This function requires exclusive access to @sk, * which means it either needs to be in the process of being created or locked. * Returns zero on success, -EDESTADDRREQ if the domain is configured to use * network address selectors (can't blindly label the socket), and negative * values on all other failures. * */ int netlbl_sock_setattr(struct sock *sk, u16 family, const struct netlbl_lsm_secattr *secattr, bool sk_locked) { int ret_val; struct netlbl_dom_map *dom_entry; rcu_read_lock(); dom_entry = netlbl_domhsh_getentry(secattr->domain, family); if (dom_entry == NULL) { ret_val = -ENOENT; goto socket_setattr_return; } switch (family) { case AF_INET: switch (dom_entry->def.type) { case NETLBL_NLTYPE_ADDRSELECT: ret_val = -EDESTADDRREQ; break; case NETLBL_NLTYPE_CIPSOV4: ret_val = cipso_v4_sock_setattr(sk, dom_entry->def.cipso, secattr, sk_locked); break; case NETLBL_NLTYPE_UNLABELED: ret_val = 0; break; default: ret_val = -ENOENT; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: switch (dom_entry->def.type) { case NETLBL_NLTYPE_ADDRSELECT: ret_val = -EDESTADDRREQ; break; case NETLBL_NLTYPE_CALIPSO: ret_val = calipso_sock_setattr(sk, dom_entry->def.calipso, secattr); break; case NETLBL_NLTYPE_UNLABELED: ret_val = 0; break; default: ret_val = -ENOENT; } break; #endif /* IPv6 */ default: ret_val = -EPROTONOSUPPORT; } socket_setattr_return: rcu_read_unlock(); return ret_val; } /** * netlbl_sock_delattr - Delete all the NetLabel labels on a socket * @sk: the socket * * Description: * Remove all the NetLabel labeling from @sk. The caller is responsible for * ensuring that @sk is locked. * */ void netlbl_sock_delattr(struct sock *sk) { switch (sk->sk_family) { case AF_INET: cipso_v4_sock_delattr(sk); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: calipso_sock_delattr(sk); break; #endif /* IPv6 */ } } /** * netlbl_sock_getattr - Determine the security attributes of a sock * @sk: the sock * @secattr: the security attributes * * Description: * Examines the given sock to see if any NetLabel style labeling has been * applied to the sock, if so it parses the socket label and returns the * security attributes in @secattr. Returns zero on success, negative values * on failure. * */ int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { int ret_val; switch (sk->sk_family) { case AF_INET: ret_val = cipso_v4_sock_getattr(sk, secattr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: ret_val = calipso_sock_getattr(sk, secattr); break; #endif /* IPv6 */ default: ret_val = -EPROTONOSUPPORT; } return ret_val; } /** * netlbl_sk_lock_check - Check if the socket lock has been acquired. * @sk: the socket to be checked * * Return: true if socket @sk is locked or if lock debugging is disabled at * runtime or compile-time; false otherwise * */ #ifdef CONFIG_LOCKDEP bool netlbl_sk_lock_check(struct sock *sk) { if (debug_locks) return lockdep_sock_is_held(sk); return true; } #else bool netlbl_sk_lock_check(struct sock *sk) { return true; } #endif /** * netlbl_conn_setattr - Label a connected socket using the correct protocol * @sk: the socket to label * @addr: the destination address * @secattr: the security attributes * * Description: * Attach the correct label to the given connected socket using the security * attributes specified in @secattr. The caller is responsible for ensuring * that @sk is locked. Returns zero on success, negative values on failure. * */ int netlbl_conn_setattr(struct sock *sk, struct sockaddr *addr, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct sockaddr_in *addr4; #if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *addr6; #endif struct netlbl_dommap_def *entry; rcu_read_lock(); switch (addr->sa_family) { case AF_INET: addr4 = (struct sockaddr_in *)addr; entry = netlbl_domhsh_getentry_af4(secattr->domain, addr4->sin_addr.s_addr); if (entry == NULL) { ret_val = -ENOENT; goto conn_setattr_return; } switch (entry->type) { case NETLBL_NLTYPE_CIPSOV4: ret_val = cipso_v4_sock_setattr(sk, entry->cipso, secattr, netlbl_sk_lock_check(sk)); break; case NETLBL_NLTYPE_UNLABELED: /* just delete the protocols we support for right now * but we could remove other protocols if needed */ netlbl_sock_delattr(sk); ret_val = 0; break; default: ret_val = -ENOENT; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: addr6 = (struct sockaddr_in6 *)addr; entry = netlbl_domhsh_getentry_af6(secattr->domain, &addr6->sin6_addr); if (entry == NULL) { ret_val = -ENOENT; goto conn_setattr_return; } switch (entry->type) { case NETLBL_NLTYPE_CALIPSO: ret_val = calipso_sock_setattr(sk, entry->calipso, secattr); break; case NETLBL_NLTYPE_UNLABELED: /* just delete the protocols we support for right now * but we could remove other protocols if needed */ netlbl_sock_delattr(sk); ret_val = 0; break; default: ret_val = -ENOENT; } break; #endif /* IPv6 */ default: ret_val = -EPROTONOSUPPORT; } conn_setattr_return: rcu_read_unlock(); return ret_val; } /** * netlbl_req_setattr - Label a request socket using the correct protocol * @req: the request socket to label * @secattr: the security attributes * * Description: * Attach the correct label to the given socket using the security attributes * specified in @secattr. Returns zero on success, negative values on failure. * */ int netlbl_req_setattr(struct request_sock *req, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct netlbl_dommap_def *entry; struct inet_request_sock *ireq = inet_rsk(req); rcu_read_lock(); switch (req->rsk_ops->family) { case AF_INET: entry = netlbl_domhsh_getentry_af4(secattr->domain, ireq->ir_rmt_addr); if (entry == NULL) { ret_val = -ENOENT; goto req_setattr_return; } switch (entry->type) { case NETLBL_NLTYPE_CIPSOV4: ret_val = cipso_v4_req_setattr(req, entry->cipso, secattr); break; case NETLBL_NLTYPE_UNLABELED: netlbl_req_delattr(req); ret_val = 0; break; default: ret_val = -ENOENT; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: entry = netlbl_domhsh_getentry_af6(secattr->domain, &ireq->ir_v6_rmt_addr); if (entry == NULL) { ret_val = -ENOENT; goto req_setattr_return; } switch (entry->type) { case NETLBL_NLTYPE_CALIPSO: ret_val = calipso_req_setattr(req, entry->calipso, secattr); break; case NETLBL_NLTYPE_UNLABELED: netlbl_req_delattr(req); ret_val = 0; break; default: ret_val = -ENOENT; } break; #endif /* IPv6 */ default: ret_val = -EPROTONOSUPPORT; } req_setattr_return: rcu_read_unlock(); return ret_val; } /** * netlbl_req_delattr - Delete all the NetLabel labels on a socket * @req: the socket * * Description: * Remove all the NetLabel labeling from @req. * */ void netlbl_req_delattr(struct request_sock *req) { switch (req->rsk_ops->family) { case AF_INET: cipso_v4_req_delattr(req); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: calipso_req_delattr(req); break; #endif /* IPv6 */ } } /** * netlbl_skbuff_setattr - Label a packet using the correct protocol * @skb: the packet * @family: protocol family * @secattr: the security attributes * * Description: * Attach the correct label to the given packet using the security attributes * specified in @secattr. Returns zero on success, negative values on failure. * */ int netlbl_skbuff_setattr(struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct iphdr *hdr4; #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *hdr6; #endif struct netlbl_dommap_def *entry; rcu_read_lock(); switch (family) { case AF_INET: hdr4 = ip_hdr(skb); entry = netlbl_domhsh_getentry_af4(secattr->domain, hdr4->daddr); if (entry == NULL) { ret_val = -ENOENT; goto skbuff_setattr_return; } switch (entry->type) { case NETLBL_NLTYPE_CIPSOV4: ret_val = cipso_v4_skbuff_setattr(skb, entry->cipso, secattr); break; case NETLBL_NLTYPE_UNLABELED: /* just delete the protocols we support for right now * but we could remove other protocols if needed */ ret_val = cipso_v4_skbuff_delattr(skb); break; default: ret_val = -ENOENT; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: hdr6 = ipv6_hdr(skb); entry = netlbl_domhsh_getentry_af6(secattr->domain, &hdr6->daddr); if (entry == NULL) { ret_val = -ENOENT; goto skbuff_setattr_return; } switch (entry->type) { case NETLBL_NLTYPE_CALIPSO: ret_val = calipso_skbuff_setattr(skb, entry->calipso, secattr); break; case NETLBL_NLTYPE_UNLABELED: /* just delete the protocols we support for right now * but we could remove other protocols if needed */ ret_val = calipso_skbuff_delattr(skb); break; default: ret_val = -ENOENT; } break; #endif /* IPv6 */ default: ret_val = -EPROTONOSUPPORT; } skbuff_setattr_return: rcu_read_unlock(); return ret_val; } /** * netlbl_skbuff_getattr - Determine the security attributes of a packet * @skb: the packet * @family: protocol family * @secattr: the security attributes * * Description: * Examines the given packet to see if a recognized form of packet labeling * is present, if so it parses the packet label and returns the security * attributes in @secattr. Returns zero on success, negative values on * failure. * */ int netlbl_skbuff_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr) { unsigned char *ptr; switch (family) { case AF_INET: ptr = cipso_v4_optptr(skb); if (ptr && cipso_v4_getattr(ptr, secattr) == 0) return 0; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: ptr = calipso_optptr(skb); if (ptr && calipso_getattr(ptr, secattr) == 0) return 0; break; #endif /* IPv6 */ } return netlbl_unlabel_getattr(skb, family, secattr); } /** * netlbl_skbuff_err - Handle a LSM error on a sk_buff * @skb: the packet * @family: the family * @error: the error code * @gateway: true if host is acting as a gateway, false otherwise * * Description: * Deal with a LSM problem when handling the packet in @skb, typically this is * a permission denied problem (-EACCES). The correct action is determined * according to the packet's labeling protocol. * */ void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway) { switch (family) { case AF_INET: if (cipso_v4_optptr(skb)) cipso_v4_error(skb, error, gateway); break; } } /** * netlbl_cache_invalidate - Invalidate all of the NetLabel protocol caches * * Description: * For all of the NetLabel protocols that support some form of label mapping * cache, invalidate the cache. Returns zero on success, negative values on * error. * */ void netlbl_cache_invalidate(void) { cipso_v4_cache_invalidate(); #if IS_ENABLED(CONFIG_IPV6) calipso_cache_invalidate(); #endif /* IPv6 */ } /** * netlbl_cache_add - Add an entry to a NetLabel protocol cache * @skb: the packet * @family: the family * @secattr: the packet's security attributes * * Description: * Add the LSM security attributes for the given packet to the underlying * NetLabel protocol's label mapping cache. Returns zero on success, negative * values on error. * */ int netlbl_cache_add(const struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr) { unsigned char *ptr; if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0) return -ENOMSG; switch (family) { case AF_INET: ptr = cipso_v4_optptr(skb); if (ptr) return cipso_v4_cache_add(ptr, secattr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: ptr = calipso_optptr(skb); if (ptr) return calipso_cache_add(ptr, secattr); break; #endif /* IPv6 */ } return -ENOMSG; } /* * Protocol Engine Functions */ /** * netlbl_audit_start - Start an audit message * @type: audit message type * @audit_info: NetLabel audit information * * Description: * Start an audit message using the type specified in @type and fill the audit * message with some fields common to all NetLabel audit messages. This * function should only be used by protocol engines, not LSMs. Returns a * pointer to the audit buffer on success, NULL on failure. * */ struct audit_buffer *netlbl_audit_start(int type, struct netlbl_audit *audit_info) { return netlbl_audit_start_common(type, audit_info); } EXPORT_SYMBOL(netlbl_audit_start); /* * Setup Functions */ /** * netlbl_init - Initialize NetLabel * * Description: * Perform the required NetLabel initialization before first use. * */ static int __init netlbl_init(void) { int ret_val; printk(KERN_INFO "NetLabel: Initializing\n"); printk(KERN_INFO "NetLabel: domain hash size = %u\n", (1 << NETLBL_DOMHSH_BITSIZE)); printk(KERN_INFO "NetLabel: protocols = UNLABELED CIPSOv4 CALIPSO\n"); ret_val = netlbl_domhsh_init(NETLBL_DOMHSH_BITSIZE); if (ret_val != 0) goto init_failure; ret_val = netlbl_unlabel_init(NETLBL_UNLHSH_BITSIZE); if (ret_val != 0) goto init_failure; ret_val = netlbl_netlink_init(); if (ret_val != 0) goto init_failure; ret_val = netlbl_unlabel_defconf(); if (ret_val != 0) goto init_failure; printk(KERN_INFO "NetLabel: unlabeled traffic allowed by default\n"); return 0; init_failure: panic("NetLabel: failed to initialize properly (%d)\n", ret_val); } subsys_initcall(netlbl_init); |
16 16 16 6 6 3 6 6 6 6 6 6 6 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> * * Based on net/ipv4/xfrm4_tunnel.c */ #include <linux/module.h> #include <linux/xfrm.h> #include <linux/slab.h> #include <linux/rculist.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <net/netns/generic.h> #define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 #define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 #define XFRM6_TUNNEL_SPI_MIN 1 #define XFRM6_TUNNEL_SPI_MAX 0xffffffff struct xfrm6_tunnel_net { struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; u32 spi; }; static unsigned int xfrm6_tunnel_net_id __read_mostly; static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) { return net_generic(net, xfrm6_tunnel_net_id); } /* * xfrm_tunnel_spi things are for allocating unique id ("spi") * per xfrm_address_t. */ struct xfrm6_tunnel_spi { struct hlist_node list_byaddr; struct hlist_node list_byspi; xfrm_address_t addr; u32 spi; refcount_t refcnt; struct rcu_head rcu_head; }; static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock); static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly; static inline unsigned int xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr) { unsigned int h; h = ipv6_addr_hash((const struct in6_addr *)addr); h ^= h >> 16; h ^= h >> 8; h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; return h; } static inline unsigned int xfrm6_tunnel_spi_hash_byspi(u32 spi) { return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; } static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; hlist_for_each_entry_rcu(x6spi, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) return x6spi; } return NULL; } __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; rcu_read_lock_bh(); x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); spi = x6spi ? x6spi->spi : 0; rcu_read_unlock_bh(); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; int index = xfrm6_tunnel_spi_hash_byspi(spi); hlist_for_each_entry(x6spi, &xfrm6_tn->spi_byspi[index], list_byspi) { if (x6spi->spi == spi) return -1; } return index; } static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); u32 spi; struct xfrm6_tunnel_spi *x6spi; int index; if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN || xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX) xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN; else xfrm6_tn->spi++; for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; if (spi == XFRM6_TUNNEL_SPI_MAX) break; } for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) { index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; } spi = 0; goto out; alloc_spi: xfrm6_tn->spi = spi; x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC); if (!x6spi) goto out; memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); x6spi->spi = spi; refcount_set(&x6spi->refcnt, 1); hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]); index = xfrm6_tunnel_spi_hash_byaddr(saddr); hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]); out: return spi; } __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; spin_lock_bh(&xfrm6_tunnel_spi_lock); x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); if (x6spi) { refcount_inc(&x6spi->refcnt); spi = x6spi->spi; } else spi = __xfrm6_tunnel_alloc_spi(net, saddr); spin_unlock_bh(&xfrm6_tunnel_spi_lock); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); static void x6spi_destroy_rcu(struct rcu_head *head) { kmem_cache_free(xfrm6_tunnel_spi_kmem, container_of(head, struct xfrm6_tunnel_spi, rcu_head)); } static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; struct hlist_node *n; spin_lock_bh(&xfrm6_tunnel_spi_lock); hlist_for_each_entry_safe(x6spi, n, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) { if (refcount_dec_and_test(&x6spi->refcnt)) { hlist_del_rcu(&x6spi->list_byaddr); hlist_del_rcu(&x6spi->list_byspi); call_rcu(&x6spi->rcu_head, x6spi_destroy_rcu); break; } } } spin_unlock_bh(&xfrm6_tunnel_spi_lock); } static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); return 0; } static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { return skb_network_header(skb)[IP6CB(skb)->nhoff]; } static int xfrm6_tunnel_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *iph = ipv6_hdr(skb); __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* xfrm6_tunnel native err handling */ switch (type) { case ICMPV6_DEST_UNREACH: switch (code) { case ICMPV6_NOROUTE: case ICMPV6_ADM_PROHIBITED: case ICMPV6_NOT_NEIGHBOUR: case ICMPV6_ADDR_UNREACH: case ICMPV6_PORT_UNREACH: default: break; } break; case ICMPV6_PKT_TOOBIG: break; case ICMPV6_TIME_EXCEED: switch (code) { case ICMPV6_EXC_HOPLIMIT: break; case ICMPV6_EXC_FRAGTIME: default: break; } break; case ICMPV6_PARAMPROB: switch (code) { case ICMPV6_HDR_FIELD: break; case ICMPV6_UNK_NEXTHDR: break; case ICMPV6_UNK_OPTION: break; } break; default: break; } return 0; } static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->props.mode != XFRM_MODE_TUNNEL) { NL_SET_ERR_MSG(extack, "IPv6 tunnel can only be used with tunnel mode"); return -EINVAL; } if (x->encap) { NL_SET_ERR_MSG(extack, "IPv6 tunnel is not compatible with encapsulation"); return -EINVAL; } x->props.header_len = sizeof(struct ipv6hdr); return 0; } static void xfrm6_tunnel_destroy(struct xfrm_state *x) { struct net *net = xs_net(x); xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr); } static const struct xfrm_type xfrm6_tunnel_type = { .owner = THIS_MODULE, .proto = IPPROTO_IPV6, .init_state = xfrm6_tunnel_init_state, .destructor = xfrm6_tunnel_destroy, .input = xfrm6_tunnel_input, .output = xfrm6_tunnel_output, }; static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 3, }; static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 3, }; static int __net_init xfrm6_tunnel_net_init(struct net *net) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tn->spi_byaddr[i]); for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tn->spi_byspi[i]); xfrm6_tn->spi = 0; return 0; } static void __net_exit xfrm6_tunnel_net_exit(struct net *net) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; xfrm_flush_gc(); xfrm_state_flush(net, 0, false, true); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byspi[i])); } static struct pernet_operations xfrm6_tunnel_net_ops = { .init = xfrm6_tunnel_net_init, .exit = xfrm6_tunnel_net_exit, .id = &xfrm6_tunnel_net_id, .size = sizeof(struct xfrm6_tunnel_net), }; static int __init xfrm6_tunnel_init(void) { int rv; xfrm6_tunnel_spi_kmem = KMEM_CACHE(xfrm6_tunnel_spi, SLAB_HWCACHE_ALIGN); if (!xfrm6_tunnel_spi_kmem) return -ENOMEM; rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); if (rv < 0) goto out_pernet; rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6); if (rv < 0) goto out_type; rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6); if (rv < 0) goto out_xfrm6; rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET); if (rv < 0) goto out_xfrm46; return 0; out_xfrm46: xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); out_xfrm6: xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); out_type: unregister_pernet_subsys(&xfrm6_tunnel_net_ops); out_pernet: kmem_cache_destroy(xfrm6_tunnel_spi_kmem); return rv; } static void __exit xfrm6_tunnel_fini(void) { xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); unregister_pernet_subsys(&xfrm6_tunnel_net_ops); /* Someone maybe has gotten the xfrm6_tunnel_spi. * So need to wait it. */ rcu_barrier(); kmem_cache_destroy(xfrm6_tunnel_spi_kmem); } module_init(xfrm6_tunnel_init); module_exit(xfrm6_tunnel_fini); MODULE_DESCRIPTION("IPv6 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6); |
9 3 8 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 | /* * linux/fs/nls/mac-roman.c * * Charset macroman translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ /* * COPYRIGHT AND PERMISSION NOTICE * * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under * the Terms of Use in http://www.unicode.org/copyright.html. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of the Unicode data files and any associated documentation (the "Data * Files") or Unicode software and any associated documentation (the * "Software") to deal in the Data Files or Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, and/or sell copies of the Data Files or Software, and * to permit persons to whom the Data Files or Software are furnished to do * so, provided that (a) the above copyright notice(s) and this permission * notice appear with all copies of the Data Files or Software, (b) both the * above copyright notice(s) and this permission notice appear in associated * documentation, and (c) there is clear notice in each modified Data File or * in the Software as well as in the documentation associated with the Data * File(s) or Software that the data or software has been modified. * * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in these Data Files or Software without prior written * authorization of the copyright holder. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80 */ 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* 0x90 */ 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* 0xa0 */ 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, /* 0xb0 */ 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8, /* 0xc0 */ 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* 0xd0 */ 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, 0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02, /* 0xe0 */ 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* 0xf0 */ 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc, 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ 0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ 0xa0, 0xe0, 0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page25[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char pagef8[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */ }; static const unsigned char pagefb[256] = { 0x00, 0xde, 0xdf, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, page22, NULL, NULL, page25, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, pagef8, NULL, NULL, pagefb, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "macroman", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_macroman(void) { return register_nls(&table); } static void __exit exit_nls_macroman(void) { unregister_nls(&table); } module_init(init_nls_macroman) module_exit(exit_nls_macroman) MODULE_LICENSE("Dual BSD/GPL"); |
1262 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KCOV_H #define _LINUX_KCOV_H #include <linux/sched.h> #include <uapi/linux/kcov.h> struct task_struct; #ifdef CONFIG_KCOV enum kcov_mode { /* Coverage collection is not enabled yet. */ KCOV_MODE_DISABLED = 0, /* KCOV was initialized, but tracing mode hasn't been chosen yet. */ KCOV_MODE_INIT = 1, /* * Tracing coverage collection mode. * Covered PCs are collected in a per-task buffer. */ KCOV_MODE_TRACE_PC = 2, /* Collecting comparison operands mode. */ KCOV_MODE_TRACE_CMP = 3, }; #define KCOV_IN_CTXSW (1 << 30) void kcov_task_init(struct task_struct *t); void kcov_task_exit(struct task_struct *t); #define kcov_prepare_switch(t) \ do { \ (t)->kcov_mode |= KCOV_IN_CTXSW; \ } while (0) #define kcov_finish_switch(t) \ do { \ (t)->kcov_mode &= ~KCOV_IN_CTXSW; \ } while (0) /* See Documentation/dev-tools/kcov.rst for usage details. */ void kcov_remote_start(u64 handle); void kcov_remote_stop(void); u64 kcov_common_handle(void); static inline void kcov_remote_start_common(u64 id) { kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_COMMON, id)); } static inline void kcov_remote_start_usb(u64 id) { kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_USB, id)); } /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary * work around for kcov's lack of nested remote coverage sections support in * task context. Adding support for nested sections is tracked in: * https://bugzilla.kernel.org/show_bug.cgi?id=210337 */ static inline void kcov_remote_start_usb_softirq(u64 id) { if (in_serving_softirq()) kcov_remote_start_usb(id); } static inline void kcov_remote_stop_softirq(void) { if (in_serving_softirq()) kcov_remote_stop(); } #ifdef CONFIG_64BIT typedef unsigned long kcov_u64; #else typedef unsigned long long kcov_u64; #endif void __sanitizer_cov_trace_pc(void); void __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2); void __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2); void __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2); void __sanitizer_cov_trace_cmp8(kcov_u64 arg1, kcov_u64 arg2); void __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2); void __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2); void __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2); void __sanitizer_cov_trace_const_cmp8(kcov_u64 arg1, kcov_u64 arg2); void __sanitizer_cov_trace_switch(kcov_u64 val, void *cases); #else static inline void kcov_task_init(struct task_struct *t) {} static inline void kcov_task_exit(struct task_struct *t) {} static inline void kcov_prepare_switch(struct task_struct *t) {} static inline void kcov_finish_switch(struct task_struct *t) {} static inline void kcov_remote_start(u64 handle) {} static inline void kcov_remote_stop(void) {} static inline u64 kcov_common_handle(void) { return 0; } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} static inline void kcov_remote_start_usb_softirq(u64 id) {} static inline void kcov_remote_stop_softirq(void) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ |
1721 1312 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM signal #if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SIGNAL_H #include <linux/signal.h> #include <linux/sched.h> #include <linux/tracepoint.h> #define TP_STORE_SIGINFO(__entry, info) \ do { \ if (info == SEND_SIG_NOINFO) { \ __entry->errno = 0; \ __entry->code = SI_USER; \ } else if (info == SEND_SIG_PRIV) { \ __entry->errno = 0; \ __entry->code = SI_KERNEL; \ } else { \ __entry->errno = info->si_errno; \ __entry->code = info->si_code; \ } \ } while (0) #ifndef TRACE_HEADER_MULTI_READ enum { TRACE_SIGNAL_DELIVERED, TRACE_SIGNAL_IGNORED, TRACE_SIGNAL_ALREADY_PENDING, TRACE_SIGNAL_OVERFLOW_FAIL, TRACE_SIGNAL_LOSE_INFO, }; #endif /** * signal_generate - called when a signal is generated * @sig: signal number * @info: pointer to struct siginfo * @task: pointer to struct task_struct * @group: shared or private * @result: TRACE_SIGNAL_* * * Current process sends a 'sig' signal to 'task' process with * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV, * 'info' is not a pointer and you can't access its field. Instead, * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV * means that si_code is SI_KERNEL. */ TRACE_EVENT(signal_generate, TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task, int group, int result), TP_ARGS(sig, info, task, group, result), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, group ) __field( int, result ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->pid = task->pid; __entry->group = group; __entry->result = result; ), TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d", __entry->sig, __entry->errno, __entry->code, __entry->comm, __entry->pid, __entry->group, __entry->result) ); /** * signal_deliver - called when a signal is delivered * @sig: signal number * @info: pointer to struct siginfo * @ka: pointer to struct k_sigaction * * A 'sig' signal is delivered to current process with 'info' siginfo, * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or * SIG_DFL. * Note that some signals reported by signal_generate tracepoint can be * lost, ignored or modified (by debugger) before hitting this tracepoint. * This means, this can show which signals are actually delivered, but * matching generated signals and delivered signals may not be correct. */ TRACE_EVENT(signal_deliver, TP_PROTO(int sig, struct kernel_siginfo *info, struct k_sigaction *ka), TP_ARGS(sig, info, ka), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __field( unsigned long, sa_handler ) __field( unsigned long, sa_flags ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); __entry->sa_handler = (unsigned long)ka->sa.sa_handler; __entry->sa_flags = ka->sa.sa_flags; ), TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx", __entry->sig, __entry->errno, __entry->code, __entry->sa_handler, __entry->sa_flags) ); #endif /* _TRACE_SIGNAL_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
4 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "statx.h" struct io_statx { struct file *file; int dfd; unsigned int mask; unsigned int flags; struct filename *filename; struct statx __user *buffer; }; int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); const char __user *path; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; sx->dfd = READ_ONCE(sqe->fd); sx->mask = READ_ONCE(sqe->len); path = u64_to_user_ptr(READ_ONCE(sqe->addr)); sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); sx->flags = READ_ONCE(sqe->statx_flags); sx->filename = getname_flags(path, getname_statx_lookup_flags(sx->flags), NULL); if (IS_ERR(sx->filename)) { int ret = PTR_ERR(sx->filename); sx->filename = NULL; return ret; } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_statx(struct io_kiocb *req, unsigned int issue_flags) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); return IOU_OK; } void io_statx_cleanup(struct io_kiocb *req) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); if (sx->filename) putname(sx->filename); } |
1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | // SPDX-License-Identifier: GPL-2.0 /* usb-urb.c is part of the DVB USB library. * * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@posteo.de) * see dvb-usb-init.c for copyright information. * * This file keeps functions for initializing and handling the * BULK and ISOC USB data transfers in a generic way. * Can be used for DVB-only and also, that's the plan, for * Hybrid USB devices (analog and DVB). */ #include "dvb-usb-common.h" /* URB stuff for streaming */ static void usb_urb_complete(struct urb *urb) { struct usb_data_stream *stream = urb->context; int ptype = usb_pipetype(urb->pipe); int i; u8 *b; deb_uxfer("'%s' urb completed. status: %d, length: %d/%d, pack_num: %d, errors: %d\n", ptype == PIPE_ISOCHRONOUS ? "isoc" : "bulk", urb->status,urb->actual_length,urb->transfer_buffer_length, urb->number_of_packets,urb->error_count); switch (urb->status) { case 0: /* success */ case -ETIMEDOUT: /* NAK */ break; case -ECONNRESET: /* kill */ case -ENOENT: case -ESHUTDOWN: return; default: /* error */ deb_ts("urb completion error %d.\n", urb->status); break; } b = (u8 *) urb->transfer_buffer; switch (ptype) { case PIPE_ISOCHRONOUS: for (i = 0; i < urb->number_of_packets; i++) { if (urb->iso_frame_desc[i].status != 0) deb_ts("iso frame descriptor has an error: %d\n",urb->iso_frame_desc[i].status); else if (urb->iso_frame_desc[i].actual_length > 0) stream->complete(stream, b + urb->iso_frame_desc[i].offset, urb->iso_frame_desc[i].actual_length); urb->iso_frame_desc[i].status = 0; urb->iso_frame_desc[i].actual_length = 0; } debug_dump(b,20,deb_uxfer); break; case PIPE_BULK: if (urb->actual_length > 0) stream->complete(stream, b, urb->actual_length); break; default: err("unknown endpoint type in completion handler."); return; } usb_submit_urb(urb,GFP_ATOMIC); } int usb_urb_kill(struct usb_data_stream *stream) { int i; for (i = 0; i < stream->urbs_submitted; i++) { deb_ts("killing URB no. %d.\n",i); /* stop the URB */ usb_kill_urb(stream->urb_list[i]); } stream->urbs_submitted = 0; return 0; } int usb_urb_submit(struct usb_data_stream *stream) { int i,ret; for (i = 0; i < stream->urbs_initialized; i++) { deb_ts("submitting URB no. %d\n",i); if ((ret = usb_submit_urb(stream->urb_list[i],GFP_ATOMIC))) { err("could not submit URB no. %d - get them all back",i); usb_urb_kill(stream); return ret; } stream->urbs_submitted++; } return 0; } static int usb_free_stream_buffers(struct usb_data_stream *stream) { if (stream->state & USB_STATE_URB_BUF) { while (stream->buf_num) { stream->buf_num--; deb_mem("freeing buffer %d\n",stream->buf_num); usb_free_coherent(stream->udev, stream->buf_size, stream->buf_list[stream->buf_num], stream->dma_addr[stream->buf_num]); } } stream->state &= ~USB_STATE_URB_BUF; return 0; } static int usb_allocate_stream_buffers(struct usb_data_stream *stream, int num, unsigned long size) { stream->buf_num = 0; stream->buf_size = size; deb_mem("all in all I will use %lu bytes for streaming\n",num*size); for (stream->buf_num = 0; stream->buf_num < num; stream->buf_num++) { deb_mem("allocating buffer %d\n",stream->buf_num); if (( stream->buf_list[stream->buf_num] = usb_alloc_coherent(stream->udev, size, GFP_KERNEL, &stream->dma_addr[stream->buf_num]) ) == NULL) { deb_mem("not enough memory for urb-buffer allocation.\n"); usb_free_stream_buffers(stream); return -ENOMEM; } deb_mem("buffer %d: %p (dma: %Lu)\n", stream->buf_num, stream->buf_list[stream->buf_num], (long long)stream->dma_addr[stream->buf_num]); memset(stream->buf_list[stream->buf_num],0,size); stream->state |= USB_STATE_URB_BUF; } deb_mem("allocation successful\n"); return 0; } static int usb_bulk_urb_init(struct usb_data_stream *stream) { int i, j; if ((i = usb_allocate_stream_buffers(stream,stream->props.count, stream->props.u.bulk.buffersize)) < 0) return i; /* allocate the URBs */ for (i = 0; i < stream->props.count; i++) { stream->urb_list[i] = usb_alloc_urb(0, GFP_KERNEL); if (!stream->urb_list[i]) { deb_mem("not enough memory for urb_alloc_urb!.\n"); for (j = 0; j < i; j++) usb_free_urb(stream->urb_list[j]); return -ENOMEM; } usb_fill_bulk_urb( stream->urb_list[i], stream->udev, usb_rcvbulkpipe(stream->udev,stream->props.endpoint), stream->buf_list[i], stream->props.u.bulk.buffersize, usb_urb_complete, stream); stream->urb_list[i]->transfer_flags = URB_NO_TRANSFER_DMA_MAP; stream->urb_list[i]->transfer_dma = stream->dma_addr[i]; stream->urbs_initialized++; } return 0; } static int usb_isoc_urb_init(struct usb_data_stream *stream) { int i,j; if ((i = usb_allocate_stream_buffers(stream,stream->props.count, stream->props.u.isoc.framesize*stream->props.u.isoc.framesperurb)) < 0) return i; /* allocate the URBs */ for (i = 0; i < stream->props.count; i++) { struct urb *urb; int frame_offset = 0; stream->urb_list[i] = usb_alloc_urb(stream->props.u.isoc.framesperurb, GFP_KERNEL); if (!stream->urb_list[i]) { deb_mem("not enough memory for urb_alloc_urb!\n"); for (j = 0; j < i; j++) usb_free_urb(stream->urb_list[j]); return -ENOMEM; } urb = stream->urb_list[i]; urb->dev = stream->udev; urb->context = stream; urb->complete = usb_urb_complete; urb->pipe = usb_rcvisocpipe(stream->udev,stream->props.endpoint); urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; urb->interval = stream->props.u.isoc.interval; urb->number_of_packets = stream->props.u.isoc.framesperurb; urb->transfer_buffer_length = stream->buf_size; urb->transfer_buffer = stream->buf_list[i]; urb->transfer_dma = stream->dma_addr[i]; for (j = 0; j < stream->props.u.isoc.framesperurb; j++) { urb->iso_frame_desc[j].offset = frame_offset; urb->iso_frame_desc[j].length = stream->props.u.isoc.framesize; frame_offset += stream->props.u.isoc.framesize; } stream->urbs_initialized++; } return 0; } int usb_urb_init(struct usb_data_stream *stream, struct usb_data_stream_properties *props) { if (stream == NULL || props == NULL) return -EINVAL; memcpy(&stream->props, props, sizeof(*props)); usb_clear_halt(stream->udev,usb_rcvbulkpipe(stream->udev,stream->props.endpoint)); if (stream->complete == NULL) { err("there is no data callback - this doesn't make sense."); return -EINVAL; } switch (stream->props.type) { case USB_BULK: return usb_bulk_urb_init(stream); case USB_ISOC: return usb_isoc_urb_init(stream); default: err("unknown URB-type for data transfer."); return -EINVAL; } } int usb_urb_exit(struct usb_data_stream *stream) { int i; usb_urb_kill(stream); for (i = 0; i < stream->urbs_initialized; i++) { if (stream->urb_list[i] != NULL) { deb_mem("freeing URB no. %d.\n",i); /* free the URBs */ usb_free_urb(stream->urb_list[i]); } } stream->urbs_initialized = 0; usb_free_stream_buffers(stream); return 0; } |
5 5 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 | // SPDX-License-Identifier: GPL-2.0-or-later /* * PTP virtual clock driver * * Copyright 2021 NXP */ #include <linux/slab.h> #include <linux/hashtable.h> #include "ptp_private.h" #define PTP_VCLOCK_CC_SHIFT 31 #define PTP_VCLOCK_CC_MULT (1 << PTP_VCLOCK_CC_SHIFT) #define PTP_VCLOCK_FADJ_SHIFT 9 #define PTP_VCLOCK_FADJ_DENOMINATOR 15625ULL #define PTP_VCLOCK_REFRESH_INTERVAL (HZ * 2) /* protects vclock_hash addition/deletion */ static DEFINE_SPINLOCK(vclock_hash_lock); static DEFINE_READ_MOSTLY_HASHTABLE(vclock_hash, 8); static void ptp_vclock_hash_add(struct ptp_vclock *vclock) { spin_lock(&vclock_hash_lock); hlist_add_head_rcu(&vclock->vclock_hash_node, &vclock_hash[vclock->clock->index % HASH_SIZE(vclock_hash)]); spin_unlock(&vclock_hash_lock); } static void ptp_vclock_hash_del(struct ptp_vclock *vclock) { spin_lock(&vclock_hash_lock); hlist_del_init_rcu(&vclock->vclock_hash_node); spin_unlock(&vclock_hash_lock); synchronize_rcu(); } static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) { struct ptp_vclock *vclock = info_to_vclock(ptp); s64 adj; adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT; adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_read(&vclock->tc); vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj; mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta) { struct ptp_vclock *vclock = info_to_vclock(ptp); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_adjtime(&vclock->tc, delta); mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_read(&vclock->tc); mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); return 0; } static int ptp_vclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, struct ptp_system_timestamp *sts) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; struct timespec64 pts; int err; u64 ns; err = pptp->info->getcyclesx64(pptp->info, &pts, sts); if (err) return err; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_cyc2time(&vclock->tc, timespec64_to_ns(&pts)); mutex_unlock(&vclock->lock); *ts = ns_to_timespec64(ns); return 0; } static int ptp_vclock_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns = timespec64_to_ns(ts); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_init(&vclock->tc, &vclock->cc, ns); mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_getcrosststamp(struct ptp_clock_info *ptp, struct system_device_crosststamp *xtstamp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; int err; u64 ns; err = pptp->info->getcrosscycles(pptp->info, xtstamp); if (err) return err; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_cyc2time(&vclock->tc, ktime_to_ns(xtstamp->device)); mutex_unlock(&vclock->lock); xtstamp->device = ns_to_ktime(ns); return 0; } static long ptp_vclock_refresh(struct ptp_clock_info *ptp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct timespec64 ts; ptp_vclock_gettime(&vclock->info, &ts); return PTP_VCLOCK_REFRESH_INTERVAL; } static const struct ptp_clock_info ptp_vclock_info = { .owner = THIS_MODULE, .name = "ptp virtual clock", .max_adj = 500000000, .adjfine = ptp_vclock_adjfine, .adjtime = ptp_vclock_adjtime, .settime64 = ptp_vclock_settime, .do_aux_work = ptp_vclock_refresh, }; static u64 ptp_vclock_read(const struct cyclecounter *cc) { struct ptp_vclock *vclock = cc_to_vclock(cc); struct ptp_clock *ptp = vclock->pclock; struct timespec64 ts = {}; ptp->info->getcycles64(ptp->info, &ts); return timespec64_to_ns(&ts); } static const struct cyclecounter ptp_vclock_cc = { .read = ptp_vclock_read, .mask = CYCLECOUNTER_MASK(32), .mult = PTP_VCLOCK_CC_MULT, .shift = PTP_VCLOCK_CC_SHIFT, }; struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) { struct ptp_vclock *vclock; vclock = kzalloc(sizeof(*vclock), GFP_KERNEL); if (!vclock) return NULL; vclock->pclock = pclock; vclock->info = ptp_vclock_info; if (pclock->info->getcyclesx64) vclock->info.gettimex64 = ptp_vclock_gettimex; else vclock->info.gettime64 = ptp_vclock_gettime; if (pclock->info->getcrosscycles) vclock->info.getcrosststamp = ptp_vclock_getcrosststamp; vclock->cc = ptp_vclock_cc; snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt", pclock->index); INIT_HLIST_NODE(&vclock->vclock_hash_node); mutex_init(&vclock->lock); vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev); if (IS_ERR_OR_NULL(vclock->clock)) { kfree(vclock); return NULL; } timecounter_init(&vclock->tc, &vclock->cc, 0); ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); ptp_vclock_hash_add(vclock); return vclock; } void ptp_vclock_unregister(struct ptp_vclock *vclock) { ptp_vclock_hash_del(vclock); ptp_clock_unregister(vclock->clock); kfree(vclock); } #if IS_BUILTIN(CONFIG_PTP_1588_CLOCK) int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { char name[PTP_CLOCK_NAME_LEN] = ""; struct ptp_clock *ptp; struct device *dev; int num = 0; if (pclock_index < 0) return num; snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", pclock_index); dev = class_find_device_by_name(&ptp_class, name); if (!dev) return num; ptp = dev_get_drvdata(dev); if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) { put_device(dev); return num; } *vclock_index = kzalloc(sizeof(int) * ptp->n_vclocks, GFP_KERNEL); if (!(*vclock_index)) goto out; memcpy(*vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks); num = ptp->n_vclocks; out: mutex_unlock(&ptp->n_vclocks_mux); put_device(dev); return num; } EXPORT_SYMBOL(ptp_get_vclocks_index); ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index) { unsigned int hash = vclock_index % HASH_SIZE(vclock_hash); struct ptp_vclock *vclock; u64 ns; u64 vclock_ns = 0; ns = ktime_to_ns(*hwtstamp); rcu_read_lock(); hlist_for_each_entry_rcu(vclock, &vclock_hash[hash], vclock_hash_node) { if (vclock->clock->index != vclock_index) continue; if (mutex_lock_interruptible(&vclock->lock)) break; vclock_ns = timecounter_cyc2time(&vclock->tc, ns); mutex_unlock(&vclock->lock); break; } rcu_read_unlock(); return ns_to_ktime(vclock_ns); } EXPORT_SYMBOL(ptp_convert_timestamp); #endif |
1 2 77 1 4 1 4 1 2 58 4 1 4 4 4 2 2 2 58 58 7 58 58 58 404 406 405 6 354 58 355 355 355 6 342 1 341 41 72 2 68 15 8 8 2 63 58 77 77 76 76 75 77 77 75 2 77 74 1 9 76 1 1 74 2 34 50 51 48 1 4 48 4 4 33 1 75 4 75 72 73 2 4 70 20 58 9 58 58 7 1 1 58 68 1 5 65 3 65 67 1 68 17 19 19 34 63 64 63 64 64 2 180 7 181 75 64 32 32 76 26 74 49 2 4 10 2 2 2 4 4 4 4 4 97 93 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 10 10 10 10 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 10 10 10 10 10 10 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 6 6 6 1 6 6 5 5 6 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. * kswapd added: 7.1.96 sct * Removed kswapd_ctl limits, and swap out as many pages as needed * to bring the system back to freepages.high: 2.4.97, Rik van Riel. * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). * Multiqueue VM started 5.8.00, Rik van Riel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/module.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/vmpressure.h> #include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> /* for buffer_heads_over_limit */ #include <linux/mm_inline.h> #include <linux/backing-dev.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/compaction.h> #include <linux/notifier.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> #include <linux/migrate.h> #include <linux/delayacct.h> #include <linux/sysctl.h> #include <linux/memory-tiers.h> #include <linux/oom.h> #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/printk.h> #include <linux/dax.h> #include <linux/psi.h> #include <linux/pagewalk.h> #include <linux/shmem_fs.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/khugepaged.h> #include <linux/rculist_nulls.h> #include <linux/random.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include <linux/swapops.h> #include <linux/balloon_compaction.h> #include <linux/sched/sysctl.h> #include "internal.h" #include "swap.h" #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes * are scanned. */ nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. */ struct mem_cgroup *target_mem_cgroup; /* * Scan pressure balancing between anon and file LRUs */ unsigned long anon_cost; unsigned long file_cost; /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 unsigned int may_deactivate:2; unsigned int force_deactivate:1; unsigned int skipped_deactivate:1; /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; /* Can mapped folios be reclaimed? */ unsigned int may_unmap:1; /* Can folios be swapped as part of reclaim? */ unsigned int may_swap:1; /* Not allow cache_trim_mode to be turned on as part of reclaim? */ unsigned int no_cache_trim_mode:1; /* Has cache_trim_mode failed at least once? */ unsigned int cache_trim_mode_failed:1; /* Proactive reclaim invoked by userspace through memory.reclaim */ unsigned int proactive:1; /* * Cgroup memory below memory.low is protected as long as we * don't threaten to OOM. If any cgroup is reclaimed at * reduced force or passed over entirely due to its memory.low * setting (memcg_low_skipped), and nothing is reclaimed as a * result, then go back for one more cycle that reclaims the protected * memory (memcg_low_reclaim) to avert OOM. */ unsigned int memcg_low_reclaim:1; unsigned int memcg_low_skipped:1; unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; /* There is easily reclaimable cold cache in the current node */ unsigned int cache_trim_mode:1; /* The file folios on the current node are dangerously low */ unsigned int file_is_tiny:1; /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; /* Allocation order */ s8 order; /* Scan (total_size >> priority) pages at once */ s8 priority; /* The highest zone to isolate folios for reclaim from */ s8 reclaim_idx; /* This context's GFP mask */ gfp_t gfp_mask; /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; struct { unsigned int dirty; unsigned int unqueued_dirty; unsigned int congested; unsigned int writeback; unsigned int immediate; unsigned int file_taken; unsigned int taken; } nr; /* for recording the reclaimed slab by now */ struct reclaim_state reclaim_state; }; #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_folio(_folio, _base, _field) \ do { \ if ((_folio)->lru.prev != _base) { \ struct folio *prev; \ \ prev = lru_to_folio(&(_folio->lru)); \ prefetchw(&prev->_field); \ } \ } while (0) #else #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif /* * From 0 .. 200. Higher means more swappy. */ int vm_swappiness = 60; #ifdef CONFIG_MEMCG /* Returns true for reclaim through cgroup limits or cgroup interfaces. */ static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } /* * Returns true for reclaim on the root cgroup. This is true for direct * allocator reclaim and reclaim through cgroup interfaces on the root cgroup. */ static bool root_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup); } /** * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in * shrink_folio_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. */ static bool writeback_throttling_sane(struct scan_control *sc) { if (!cgroup_reclaim(sc)) return true; #ifdef CONFIG_CGROUP_WRITEBACK if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return true; #endif return false; } #else static bool cgroup_reclaim(struct scan_control *sc) { return false; } static bool root_reclaim(struct scan_control *sc) { return true; } static bool writeback_throttling_sane(struct scan_control *sc) { return true; } #endif static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { /* Check for an overwrite */ WARN_ON_ONCE(rs && task->reclaim_state); /* Check for the nulling of an already-nulled member */ WARN_ON_ONCE(!rs && !task->reclaim_state); task->reclaim_state = rs; } /* * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to * scan_control->nr_reclaimed. */ static void flush_reclaim_state(struct scan_control *sc) { /* * Currently, reclaim_state->reclaimed includes three types of pages * freed outside of vmscan: * (1) Slab pages. * (2) Clean file pages from pruned inodes (on highmem systems). * (3) XFS freed buffer pages. * * For all of these cases, we cannot universally link the pages to a * single memcg. For example, a memcg-aware shrinker can free one object * charged to the target memcg, causing an entire page to be freed. * If we count the entire page as reclaimed from the memcg, we end up * overestimating the reclaimed amount (potentially under-reclaiming). * * Only count such pages for global reclaim to prevent under-reclaiming * from the target memcg; preventing unnecessary retries during memcg * charging and false positives from proactive reclaim. * * For uncommon cases where the freed pages were actually mostly * charged to the target memcg, we end up underestimating the reclaimed * amount. This should be fine. The freed pages will be uncharged * anyway, even if they are not counted here properly, and we will be * able to make forward progress in charging (which is usually in a * retry loop). * * We can go one step further, and report the uncharged objcg pages in * memcg reclaim, to make reporting more accurate and reduce * underestimation, but it's probably not worth the complexity for now. */ if (current->reclaim_state && root_reclaim(sc)) { sc->nr_reclaimed += current->reclaim_state->reclaimed; current->reclaim_state->reclaimed = 0; } } static bool can_demote(int nid, struct scan_control *sc) { if (!numa_demotion_enabled) return false; if (sc && sc->no_demotion) return false; if (next_demotion_node(nid) == NUMA_NO_NODE) return false; return true; } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, int nid, struct scan_control *sc) { if (memcg == NULL) { /* * For non-memcg reclaim, is there * space in any swap device? */ if (get_nr_swap_pages() > 0) return true; } else { /* Is the memcg below its swap limit? */ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) return true; } /* * The page can not be swapped. * * Can it be reclaimed from this node via demotion? */ return can_demote(nid, sc); } /* * This misses isolated folios which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is * not expected that isolated folios will be a dominating factor. */ unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); return nr; } /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; for (zid = 0; zid <= zone_idx; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; if (!managed_zone(zone)) continue; if (!mem_cgroup_disabled()) size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); } return size; } static unsigned long drop_slab_node(int nid) { unsigned long freed = 0; struct mem_cgroup *memcg = NULL; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); return freed; } void drop_slab(void) { int nid; int shift = 0; unsigned long freed; do { freed = 0; for_each_online_node(nid) { if (fatal_signal_pending(current)) return; freed += drop_slab_node(nid); } } while ((freed >> shift++) > 1); } static int reclaimer_offset(void) { BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != PGSCAN_DIRECT - PGSCAN_KSWAPD); BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); if (current_is_kswapd()) return 0; if (current_is_khugepaged()) return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } static inline int is_page_cache_freeable(struct folio *folio) { /* * A freeable page cache folio is referenced only by the caller * that isolated the folio, the page cache and optional filesystem * private data at folio->private. */ return folio_ref_count(folio) - folio_test_private(folio) == 1 + folio_nr_pages(folio); } /* * We detected a synchronous write error writing a folio out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent * fsync(), msync() or close(). * * The tricky part is that after writepage we cannot touch the mapping: nothing * prevents it from being freed up. But we have a ref on the folio and once * that folio is locked, the mapping is pinned. * * We're allowed to run sleeping folio_lock() here because we know the caller has * __GFP_FS. */ static void handle_write_error(struct address_space *mapping, struct folio *folio, int error) { folio_lock(folio); if (folio_mapping(folio) == mapping) mapping_set_error(mapping, error); folio_unlock(folio); } static bool skip_throttle_noprogress(pg_data_t *pgdat) { int reclaimable = 0, write_pending = 0; int i; /* * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; /* * If there are a lot of dirty/writeback folios then do not * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. */ for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; reclaimable += zone_reclaimable_pages(zone); write_pending += zone_page_state_snapshot(zone, NR_ZONE_WRITE_PENDING); } if (2 * write_pending <= reclaimable) return true; return false; } void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) { wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; long timeout, ret; DEFINE_WAIT(wait); /* * Do not throttle user workers, kthreads other than kswapd or * workqueues. They may be required for reclaim to make * forward progress (e.g. journalling workqueues or kthreads). */ if (!current_is_kswapd() && current->flags & (PF_USER_WORKER|PF_KTHREAD)) { cond_resched(); return; } /* * These figures are pulled out of thin air. * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many * parallel reclaimers which is a short-lived event so the timeout is * short. Failing to make progress or waiting on writeback are * potentially long-lived events so use a longer timeout. This is shaky * logic as a failure to make progress could be due to anything from * writeback to a slow device to excessive referenced folios at the tail * of the inactive LRU. */ switch(reason) { case VMSCAN_THROTTLE_WRITEBACK: timeout = HZ/10; if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { WRITE_ONCE(pgdat->nr_reclaim_start, node_page_state(pgdat, NR_THROTTLED_WRITTEN)); } break; case VMSCAN_THROTTLE_CONGESTED: fallthrough; case VMSCAN_THROTTLE_NOPROGRESS: if (skip_throttle_noprogress(pgdat)) { cond_resched(); return; } timeout = 1; break; case VMSCAN_THROTTLE_ISOLATED: timeout = HZ/50; break; default: WARN_ON_ONCE(1); timeout = HZ; break; } prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = schedule_timeout(timeout); finish_wait(wqh, &wait); if (reason == VMSCAN_THROTTLE_WRITEBACK) atomic_dec(&pgdat->nr_writeback_throttled); trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), jiffies_to_usecs(timeout - ret), reason); } /* * Account for folios written if tasks are throttled waiting on dirty * folios to clean. If enough folios have been cleaned since throttling * started then wakeup the throttled tasks. */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled) { unsigned long nr_written; node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); /* * This is an inaccurate read as the per-cpu deltas may not * be synchronised. However, given that the system is * writeback throttled, it is not worth taking the penalty * of getting an accurate count. At worst, the throttle * timeout guarantees forward progress. */ nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - READ_ONCE(pgdat->nr_reclaim_start); if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); } /* possible outcome of pageout() */ typedef enum { /* failed to write folio out, folio is locked */ PAGE_KEEP, /* move folio to the active list, folio is locked */ PAGE_ACTIVATE, /* folio has been sent to the disk successfully, folio is unlocked */ PAGE_SUCCESS, /* folio is clean and locked */ PAGE_CLEAN, } pageout_t; /* * pageout is called by shrink_folio_list() for each dirty folio. * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug) { /* * If the folio is dirty, only perform writeback if that write * will be non-blocking. To prevent this allocation from being * stalled by pagecache activity. But note that there may be * stalls if we need to run get_block(). We could test * PagePrivate for that. * * If this process is currently in __generic_file_write_iter() against * this folio's queue, we can perform writeback even if that * will block. * * If the folio is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. */ if (!is_page_cache_freeable(folio)) return PAGE_KEEP; if (!mapping) { /* * Some data journaling orphaned folios can have * folio->mapping == NULL while being dirty with clean buffers. */ if (folio_test_private(folio)) { if (try_to_free_buffers(folio)) { folio_clear_dirty(folio); pr_info("%s: orphaned folio\n", __func__); return PAGE_CLEAN; } } return PAGE_KEEP; } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; if (folio_clear_dirty_for_io(folio)) { int res; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, .for_reclaim = 1, .swap_plug = plug, }; folio_set_reclaim(folio); res = mapping->a_ops->writepage(&folio->page, &wbc); if (res < 0) handle_write_error(mapping, folio, res); if (res == AOP_WRITEPAGE_ACTIVATE) { folio_clear_reclaim(folio); return PAGE_ACTIVATE; } if (!folio_test_writeback(folio)) { /* synchronous write or broken a_ops? */ folio_clear_reclaim(folio); } trace_mm_vmscan_write_folio(folio); node_stat_add_folio(folio, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } return PAGE_CLEAN; } /* * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct folio *folio, bool reclaimed, struct mem_cgroup *target_memcg) { int refcount; void *shadow = NULL; BUG_ON(!folio_test_locked(folio)); BUG_ON(mapping != folio_mapping(folio)); if (!folio_test_swapcache(folio)) spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); /* * The non racy check for a busy folio. * * Must be careful with the order of the tests. When someone has * a ref to the folio, it may be possible that they dirty it then * drop the reference. So if the dirty flag is tested before the * refcount here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); * !folio_test_dirty(folio) [good] * folio_set_dirty(folio); * folio_put(folio); * !refcount(folio) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the folio->flags * load is not satisfied before that of folio->_refcount. * * Note that if the dirty flag is always set via folio_mark_dirty, * and thus under the i_pages lock, then this ordering is not required. */ refcount = 1 + folio_nr_pages(folio); if (!folio_ref_freeze(folio, refcount)) goto cannot_free; /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ if (unlikely(folio_test_dirty(folio))) { folio_ref_unfreeze(folio, refcount); goto cannot_free; } if (folio_test_swapcache(folio)) { swp_entry_t swap = folio->swap; if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); __delete_from_swap_cache(folio, swap, shadow); mem_cgroup_swapout(folio, swap); xa_unlock_irq(&mapping->i_pages); put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); free_folio = mapping->a_ops->free_folio; /* * Remember a shadow entry for reclaimed file cache in * order to detect refaults, thus thrashing, later on. * * But don't store shadows in an address space that is * already exiting. This is not just an optimization, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. * * We also don't store shadows for DAX mappings because the * only page cache folios found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same address_space. */ if (reclaimed && folio_is_file_lru(folio) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(folio, target_memcg); __filemap_remove_folio(folio, shadow); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); if (free_folio) free_folio(folio); } return 1; cannot_free: xa_unlock_irq(&mapping->i_pages); if (!folio_test_swapcache(folio)) spin_unlock(&mapping->host->i_lock); return 0; } /** * remove_mapping() - Attempt to remove a folio from its mapping. * @mapping: The address space. * @folio: The folio to remove. * * If the folio is dirty, under writeback or if someone else has a ref * on it, removal will fail. * Return: The number of pages removed from the mapping. 0 if the folio * could not be removed. * Context: The caller should have a single refcount on the folio and * hold its lock. */ long remove_mapping(struct address_space *mapping, struct folio *folio) { if (__remove_mapping(mapping, folio, false, NULL)) { /* * Unfreezing the refcount with 1 effectively * drops the pagecache ref for us without requiring another * atomic operation. */ folio_ref_unfreeze(folio, 1); return folio_nr_pages(folio); } return 0; } /** * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. * @folio: Folio to be returned to an LRU list. * * Add previously isolated @folio to appropriate LRU list. * The folio may still be unevictable for other reasons. * * Context: lru_lock must not be held, interrupts must be enabled. */ void folio_putback_lru(struct folio *folio) { folio_add_lru(folio); folio_put(folio); /* drop ref from isolate */ } enum folio_references { FOLIOREF_RECLAIM, FOLIOREF_RECLAIM_CLEAN, FOLIOREF_KEEP, FOLIOREF_ACTIVATE, }; static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; unsigned long vm_flags; referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); referenced_folio = folio_test_clear_referenced(folio); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. * Let the folio, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) return FOLIOREF_ACTIVATE; /* rmap lock contention: rotate */ if (referenced_ptes == -1) return FOLIOREF_KEEP; if (referenced_ptes) { /* * All mapped folios start out with page table * references from the instantiating fault, so we need * to look twice if a mapped file/anon folio is used more * than once. * * Mark it and spare it for another trip around the * inactive list. Another page table reference will * lead to its activation. * * Note: the mark is set for activated folios as well * so that recently deactivated but used folios are * quickly recovered. */ folio_set_referenced(folio); if (referenced_folio || referenced_ptes > 1) return FOLIOREF_ACTIVATE; /* * Activate file-backed executable folios after first usage. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) return FOLIOREF_ACTIVATE; return FOLIOREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ if (referenced_folio && folio_is_file_lru(folio)) return FOLIOREF_RECLAIM_CLEAN; return FOLIOREF_RECLAIM; } /* Check if a folio is dirty or under writeback */ static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct address_space *mapping; /* * Anonymous folios are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them. * MADV_FREE anonymous folios are put into inactive file list too. * They could be mistakenly treated as file lru. So further anon * test is needed. */ if (!folio_is_file_lru(folio) || (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { *dirty = false; *writeback = false; return; } /* By default assume that the folio flags are accurate */ *dirty = folio_test_dirty(folio); *writeback = folio_test_writeback(folio); /* Verify dirty/writeback state if the filesystem supports it */ if (!folio_test_private(folio)) return; mapping = folio_mapping(folio); if (mapping && mapping->a_ops->is_dirty_writeback) mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } static struct folio *alloc_demote_folio(struct folio *src, unsigned long private) { struct folio *dst; nodemask_t *allowed_mask; struct migration_target_control *mtc; mtc = (struct migration_target_control *)private; allowed_mask = mtc->nmask; /* * make sure we allocate from the target node first also trying to * demote or reclaim pages from the target node via kswapd if we are * low on free memory on target node. If we don't do this and if * we have free memory on the slower(lower) memtier, we would start * allocating pages from slower(lower) memory tiers without even forcing * a demotion of cold pages from the target memtier. This can result * in the kernel placing hot pages in slower(lower) memory tiers. */ mtc->nmask = NULL; mtc->gfp_mask |= __GFP_THISNODE; dst = alloc_migration_target(src, (unsigned long)mtc); if (dst) return dst; mtc->gfp_mask &= ~__GFP_THISNODE; mtc->nmask = allowed_mask; return alloc_migration_target(src, (unsigned long)mtc); } /* * Take folios on @demote_folios and attempt to demote them to another node. * Folios which are not demoted are left on @demote_folios. */ static unsigned int demote_folio_list(struct list_head *demote_folios, struct pglist_data *pgdat) { int target_nid = next_demotion_node(pgdat->node_id); unsigned int nr_succeeded; nodemask_t allowed_mask; struct migration_target_control mtc = { /* * Allocate from 'node', or fail quickly and quietly. * When this happens, 'page' will likely just be discarded * instead of migrated. */ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, .nmask = &allowed_mask }; if (list_empty(demote_folios)) return 0; if (target_nid == NUMA_NO_NODE) return 0; node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); return nr_succeeded; } static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) { if (gfp_mask & __GFP_FS) return true; if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) return false; /* * We can "enter_fs" for swap-cache with only __GFP_IO * providing this isn't SWP_FS_OPS. * ->flags can be updated non-atomicially (scan_swap_map_slots), * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); } /* * shrink_folio_list() returns the number of reclaimed pages */ static unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { struct folio_batch free_folios; LIST_HEAD(ret_folios); LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; struct swap_iocb *plug = NULL; folio_batch_init(&free_folios); memset(stat, 0, sizeof(*stat)); cond_resched(); do_demote_pass = can_demote(pgdat->node_id, sc); retry: while (!list_empty(folio_list)) { struct address_space *mapping; struct folio *folio; enum folio_references references = FOLIOREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; cond_resched(); folio = lru_to_folio(folio_list); list_del(&folio->lru); if (!folio_trylock(folio)) goto keep; VM_BUG_ON_FOLIO(folio_test_active(folio), folio); nr_pages = folio_nr_pages(folio); /* Account the number of base pages */ sc->nr_scanned += nr_pages; if (unlikely(!folio_evictable(folio))) goto activate_locked; if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; /* folio_update_gen() tried to promote this page? */ if (lru_gen_enabled() && !ignore_references && folio_mapped(folio) && folio_test_referenced(folio)) goto keep_locked; /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing * folios if the tail of the LRU is all dirty unqueued folios. */ folio_check_dirty_writeback(folio, &dirty, &writeback); if (dirty || writeback) stat->nr_dirty += nr_pages; if (dirty && !writeback) stat->nr_unqueued_dirty += nr_pages; /* * Treat this folio as congested if folios are cycling * through the LRU so quickly that the folios marked * for immediate reclaim are making it to the end of * the LRU a second time. */ if (writeback && folio_test_reclaim(folio)) stat->nr_congested += nr_pages; /* * If a folio at the tail of the LRU is under writeback, there * are three cases to consider. * * 1) If reclaim is encountering an excessive number * of folios under writeback and this folio has both * the writeback and reclaim flags set, then it * indicates that folios are being queued for I/O but * are being recycled through the LRU before the I/O * can complete. Waiting on the folio itself risks an * indefinite stall if it is impossible to writeback * the folio due to I/O error or disconnected storage * so instead note that the LRU is being scanned too * quickly and the caller can stall after the folio * list has been processed. * * 2) Global or new memcg reclaim encounters a folio that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, * not to fs). In this case mark the folio for immediate * reclaim and continue scanning. * * Require may_enter_fs() because we would wait on fs, which * may not have submitted I/O yet. And the loop driver might * enter reclaim, and deadlock if it waits on a folio for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * * 3) Legacy memcg encounters a folio that already has the * reclaim flag set. memcg does not have any dirty folio * throttling so we could easily OOM just because too many * folios are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * * In cases 1) and 2) we activate the folios to get them out of * the way while we continue scanning for clean folios on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. * Since they're marked for immediate reclaim, they won't put * memory pressure on the cache working set any longer than it * takes to write them to disk. */ if (folio_test_writeback(folio)) { /* Case 1 above */ if (current_is_kswapd() && folio_test_reclaim(folio) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { stat->nr_immediate += nr_pages; goto activate_locked; /* Case 2 above */ } else if (writeback_throttling_sane(sc) || !folio_test_reclaim(folio) || !may_enter_fs(folio, sc->gfp_mask)) { /* * This is slightly racy - * folio_end_writeback() might have * just cleared the reclaim flag, then * setting the reclaim flag here ends up * interpreted as the readahead flag - but * that does not matter enough to care. * What we do want is for this folio to * have the reclaim flag set next time * memcg reclaim reaches the tests above, * so it will then wait for writeback to * avoid OOM; and it's also appropriate * in global reclaim. */ folio_set_reclaim(folio); stat->nr_writeback += nr_pages; goto activate_locked; /* Case 3 above */ } else { folio_unlock(folio); folio_wait_writeback(folio); /* then go back and try same folio again */ list_add_tail(&folio->lru, folio_list); continue; } } if (!ignore_references) references = folio_check_references(folio, sc); switch (references) { case FOLIOREF_ACTIVATE: goto activate_locked; case FOLIOREF_KEEP: stat->nr_ref_keep += nr_pages; goto keep_locked; case FOLIOREF_RECLAIM: case FOLIOREF_RECLAIM_CLEAN: ; /* try to reclaim the folio below */ } /* * Before reclaiming the folio, try to relocate * its contents to another node. */ if (do_demote_pass && (thp_migration_supported() || !folio_test_large(folio))) { list_add(&folio->lru, &demote_folios); folio_unlock(folio); continue; } /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree folio could be freed directly */ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { if (!folio_test_swapcache(folio)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (folio_maybe_dma_pinned(folio)) goto keep_locked; if (folio_test_large(folio)) { /* cannot split folio, skip it */ if (!can_split_folio(folio, NULL)) goto activate_locked; /* * Split folios without a PMD map right * away. Chances are some or all of the * tail pages can be freed without IO. */ if (!folio_entire_mapcount(folio) && split_folio_to_list(folio, folio_list)) goto activate_locked; } if (!add_to_swap(folio)) { if (!folio_test_large(folio)) goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) goto activate_locked_split; } } } else if (folio_test_swapbacked(folio) && folio_test_large(folio)) { /* Split shmem folio */ if (split_folio_to_list(folio, folio_list)) goto keep_locked; } /* * If the folio was split above, the tail pages will make * their own pass through this function and be accounted * then. */ if ((nr_pages > 1) && !folio_test_large(folio)) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } /* * The folio is mapped into the page tables of one or more * processes. Try to unmap it here. */ if (folio_mapped(folio)) { enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = folio_test_swapbacked(folio); if (folio_test_pmd_mappable(folio)) flags |= TTU_SPLIT_HUGE_PMD; try_to_unmap(folio, flags); if (folio_mapped(folio)) { stat->nr_unmap_fail += nr_pages; if (!was_swapbacked && folio_test_swapbacked(folio)) stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } /* * Folio is unmapped now so it cannot be newly pinned anymore. * No point in trying to reclaim folio if it is pinned. * Furthermore we don't want to reclaim underlying fs metadata * if the folio is pinned and thus potentially modified by the * pinning process as that may upset the filesystem. */ if (folio_maybe_dma_pinned(folio)) goto activate_locked; mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { /* * Only kswapd can writeback filesystem folios * to avoid risk of stack overflow. But avoid * injecting inefficient single-folio I/O into * flusher writeback as much as possible: only * write folios when we've encountered many * dirty folios, and when we've already scanned * the rest of the LRU for clean folios and see * the same dirty folios again (with the reclaim * flag set). */ if (folio_is_file_lru(folio) && (!current_is_kswapd() || !folio_test_reclaim(folio) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principle to folio_deactivate() * except we already have the folio isolated * and know it's dirty */ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, nr_pages); folio_set_reclaim(folio); goto activate_locked; } if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; if (!sc->may_writepage) goto keep_locked; /* * Folio is dirty. Flush the TLB if a writable entry * potentially exists to avoid CPU writes after I/O * starts and then write it out here. */ try_to_unmap_flush_dirty(); switch (pageout(folio, mapping, &plug)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: stat->nr_pageout += nr_pages; if (folio_test_writeback(folio)) goto keep; if (folio_test_dirty(folio)) goto keep; /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the folio. */ if (!folio_trylock(folio)) goto keep; if (folio_test_dirty(folio) || folio_test_writeback(folio)) goto keep_locked; mapping = folio_mapping(folio); fallthrough; case PAGE_CLEAN: ; /* try to free the folio below */ } } /* * If the folio has buffers, try to free the buffer * mappings associated with this folio. If we succeed * we try to free the folio as well. * * We do this even if the folio is dirty. * filemap_release_folio() does not perform I/O, but it * is possible for a folio to have the dirty flag set, * but it is actually clean (all its buffers are clean). * This happens if the buffers were written out directly, * with submit_bh(). ext3 will do this, as well as * the blockdev mapping. filemap_release_folio() will * discover that cleanness and will drop the buffers * and mark the folio clean - it can be freed. * * Rarely, folios can have buffers and no ->mapping. * These are the folios which were not successfully * invalidated in truncate_cleanup_folio(). We try to * drop those buffers here and if that worked, and the * folio is no longer mapped into process address space * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable. */ if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) { folio_unlock(folio); if (folio_put_testzero(folio)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free * this folio shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ nr_reclaimed += nr_pages; continue; } } } if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { /* follow __remove_mapping for reference */ if (!folio_ref_freeze(folio, 1)) goto keep_locked; /* * The folio has only one reference left, which is * from the isolation. After the caller puts the * folio back on the lru and drops the reference, the * folio will be freed anyway. It doesn't matter * which lru it goes on. So we don't bother checking * the dirty flag here. */ count_vm_events(PGLAZYFREED, nr_pages); count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); } else if (!mapping || !__remove_mapping(mapping, folio, true, sc->target_mem_cgroup)) goto keep_locked; folio_unlock(folio); free_it: /* * Folio may get swapped out as a whole, need to account * all pages in it. */ nr_reclaimed += nr_pages; if (folio_test_large(folio) && folio_test_large_rmappable(folio)) folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); free_unref_folios(&free_folios); } continue; activate_locked_split: /* * The tail pages that are failed to add into swap cache * reach here. Fixup nr_scanned and nr_pages. */ if (nr_pages > 1) { sc->nr_scanned -= (nr_pages - 1); nr_pages = 1; } activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (folio_test_swapcache(folio) && (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio); folio_set_active(folio); stat->nr_activate[type] += nr_pages; count_memcg_folio_events(folio, PGACTIVATE, nr_pages); } keep_locked: folio_unlock(folio); keep: list_add(&folio->lru, &ret_folios); VM_BUG_ON_FOLIO(folio_test_lru(folio) || folio_test_unevictable(folio), folio); } /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ nr_reclaimed += demote_folio_list(&demote_folios, pgdat); /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ list_splice_init(&demote_folios, folio_list); /* * goto retry to reclaim the undemoted folios in folio_list if * desired. * * Reclaiming directly from top tier nodes is not often desired * due to it breaking the LRU ordering: in general memory * should be reclaimed from lower tier nodes and demoted from * top tier nodes. * * However, disabling reclaim from top tier nodes entirely * would cause ooms in edge scenarios where lower tier memory * is unreclaimable for whatever reason, eg memory being * mlocked or too hot to reclaim. We can disable reclaim * from top tier nodes in proactive reclaim though as that is * not real memory pressure. */ if (!sc->proactive) { do_demote_pass = false; goto retry; } } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; mem_cgroup_uncharge_folios(&free_folios); try_to_unmap_flush(); free_unref_folios(&free_folios); list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); if (plug) swap_write_unplug(plug); return nr_reclaimed; } unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, }; struct reclaim_stat stat; unsigned int nr_reclaimed; struct folio *folio, *next; LIST_HEAD(clean_folios); unsigned int noreclaim_flag; list_for_each_entry_safe(folio, next, folio_list, lru) { if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && !folio_test_dirty(folio) && !__folio_test_movable(folio) && !folio_test_unevictable(folio)) { folio_clear_active(folio); list_move(&folio->lru, &clean_folios); } } /* * We should be safe here since we are only dealing with file pages and * we are not kswapd and therefore cannot write dirty file pages. But * call memalloc_noreclaim_save() anyway, just in case these conditions * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true); memalloc_noreclaim_restore(noreclaim_flag); list_splice(&clean_folios, folio_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); /* * Since lazyfree pages are isolated from file LRU from the beginning, * they will rotate back to anonymous LRU in the end if it failed to * discard so isolated count will be mismatched. * Compensate the isolated count for both LRU lists. */ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, stat.nr_lazyfree_fail); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)stat.nr_lazyfree_fail); return nr_reclaimed; } /* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken) { int zid; for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_zone_taken[zid]) continue; update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } } #ifdef CONFIG_CMA /* * It is waste of effort to scan and reclaim CMA pages if it is not available * for current allocation context. Kswapd can not be enrolled as it can not * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL */ static bool skip_cma(struct folio *folio, struct scan_control *sc) { return !current_is_kswapd() && gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && folio_migratetype(folio) == MIGRATE_CMA; } #else static bool skip_cma(struct folio *folio, struct scan_control *sc) { return false; } #endif /* * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. * * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * * Lru_lock must be held before calling this function. * * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; unsigned long skipped = 0; unsigned long scan, total_scan, nr_pages; LIST_HEAD(folios_skipped); total_scan = 0; scan = 0; while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; struct folio *folio; folio = lru_to_folio(src); prefetchw_prev_lru_folio(folio, src, flags); nr_pages = folio_nr_pages(folio); total_scan += nr_pages; if (folio_zonenum(folio) > sc->reclaim_idx || skip_cma(folio, sc)) { nr_skipped[folio_zonenum(folio)] += nr_pages; move_to = &folios_skipped; goto move; } /* * Do not count skipped folios because that makes the function * return with no isolated folios if the LRU mostly contains * ineligible folios. This causes the VM to not reclaim any * folios, triggering a premature OOM. * Account all pages in a folio. */ scan += nr_pages; if (!folio_test_lru(folio)) goto move; if (!sc->may_unmap && folio_mapped(folio)) goto move; /* * Be careful not to clear the lru flag until after we're * sure the folio is not being freed elsewhere -- the * folio release code relies on it. */ if (unlikely(!folio_try_get(folio))) goto move; if (!folio_test_clear_lru(folio)) { /* Another thread is already isolating this folio */ folio_put(folio); goto move; } nr_taken += nr_pages; nr_zone_taken[folio_zonenum(folio)] += nr_pages; move_to = dst; move: list_move(&folio->lru, move_to); } /* * Splice any skipped folios to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX * scanning would soon rescan the same folios to skip and waste lots * of cpu cycles. */ if (!list_empty(&folios_skipped)) { int zid; list_splice(&folios_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } } *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, total_scan, skipped, nr_taken, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } /** * folio_isolate_lru() - Try to isolate a folio from its LRU list. * @folio: Folio to isolate from its LRU list. * * Isolate a @folio from an LRU list and adjust the vmstat statistic * corresponding to whatever LRU list the folio was on. * * The folio will have its LRU flag cleared. If it was found on the * active list, it will have the Active flag set. If it was found on the * unevictable list, it will have the Unevictable flag set. These flags * may need to be cleared by the caller before letting the page go. * * Context: * * (1) Must be called with an elevated refcount on the folio. This is a * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. * * Return: true if the folio was removed from an LRU list. * false if the folio was not on an LRU list. */ bool folio_isolate_lru(struct folio *folio) { bool ret = false; VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); if (folio_test_clear_lru(folio)) { struct lruvec *lruvec; folio_get(folio); lruvec = folio_lruvec_lock_irq(folio); lruvec_del_folio(lruvec, folio); unlock_page_lruvec_irq(lruvec); ret = true; } return ret; } /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get rescheduled. When there are massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ static bool too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; bool too_many; if (current_is_kswapd()) return false; if (!writeback_throttling_sane(sc)) return false; if (file) { inactive = node_page_state(pgdat, NR_INACTIVE_FILE); isolated = node_page_state(pgdat, NR_ISOLATED_FILE); } else { inactive = node_page_state(pgdat, NR_INACTIVE_ANON); isolated = node_page_state(pgdat, NR_ISOLATED_ANON); } /* * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ if (gfp_has_io_fs(sc->gfp_mask)) inactive >>= 3; too_many = isolated > inactive; /* Wake up tasks throttled due to too_many_isolated. */ if (!too_many) wake_throttle_isolated(pgdat); return too_many; } /* * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * * Returns the number of pages moved to the given lruvec. */ static unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; struct folio_batch free_folios; folio_batch_init(&free_folios); while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); list_del(&folio->lru); if (unlikely(!folio_evictable(folio))) { spin_unlock_irq(&lruvec->lru_lock); folio_putback_lru(folio); spin_lock_irq(&lruvec->lru_lock); continue; } /* * The folio_set_lru needs to be kept here for list integrity. * Otherwise: * #0 move_folios_to_lru #1 release_pages * if (!folio_put_testzero()) * if (folio_put_testzero()) * !lru //skip lru_lock * folio_set_lru() * list_add(&folio->lru,) * list_add(&folio->lru,) */ folio_set_lru(folio); if (unlikely(folio_put_testzero(folio))) { __folio_clear_lru_flags(folio); if (folio_test_large(folio) && folio_test_large_rmappable(folio)) folio_undo_large_rmappable(folio); if (folio_batch_add(&free_folios, folio) == 0) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); } continue; } /* * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration). */ VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; if (folio_test_active(folio)) workingset_age_nonresident(lruvec, nr_pages); } if (free_folios.nr) { spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_folios(&free_folios); free_unref_folios(&free_folios); spin_lock_irq(&lruvec->lru_lock); } return nr_moved; } /* * If a kernel thread (such as nfsd for loop-back mounts) services a backing * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case * we should not throttle. Otherwise it is safe to do so. */ static int current_may_throttle(void) { return !(current->flags & PF_LOCAL_THROTTLE); } /* * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { LIST_HEAD(folio_list); unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat; bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { if (stalled) return 0; /* wait a bit for the reclaimer. */ stalled = true; reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) return SWAP_CLUSTER_MAX; } lru_add_drain(); spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); item = PGSCAN_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); if (nr_taken == 0) return 0; nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); /* * If dirty folios are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty folios to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of * dirty folios grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. */ if (stat.nr_unqueued_dirty == nr_taken) { wakeup_flusher_threads(WB_REASON_VMSCAN); /* * For cgroupv1 dirty throttling is achieved by waking up * the kernel flusher here and later waiting on folios * which are in writeback to finish (see shrink_folio_list()). * * Flusher may not be able to issue writeback quickly * enough for cgroupv1 writeback throttling to work * on a large system. */ if (!writeback_throttling_sane(sc)) reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } sc->nr.dirty += stat.nr_dirty; sc->nr.congested += stat.nr_congested; sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr.writeback += stat.nr_writeback; sc->nr.immediate += stat.nr_immediate; sc->nr.taken += nr_taken; if (file) sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } /* * shrink_active_list() moves folios from the active LRU to the inactive LRU. * * We move them the other way if the folio is referenced by one or more * processes. * * If the folios are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if * the folios are mapped, the processing is slow (folio_referenced()), so * we should drop lru_lock around each folio. It's impossible to balance * this, so instead we remove the folios from the LRU while processing them. * It is safe to rely on the active flag against the non-LRU folios in here * because nobody will play with that bit on a non-LRU folio. * * The downside is that we have to touch folio->_refcount against each folio. * But we had to alter folio->flags anyway. */ static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { unsigned long nr_taken; unsigned long nr_scanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; bool file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); if (!cgroup_reclaim(sc)) __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { struct folio *folio; cond_resched(); folio = lru_to_folio(&l_hold); list_del(&folio->lru); if (unlikely(!folio_evictable(folio))) { folio_putback_lru(folio); continue; } if (unlikely(buffer_heads_over_limit)) { if (folio_needs_release(folio) && folio_trylock(folio)) { filemap_release_folio(folio, 0); folio_unlock(folio); } } /* Referenced or rmap lock contention: rotate */ if (folio_referenced(folio, 0, sc->target_mem_cgroup, &vm_flags) != 0) { /* * Identify referenced, file-backed active folios and * give them one more trip around the active list. So * that executable code get better chances to stay in * memory under moderate memory pressure. Anon folios * are not likely to be evicted by use-once streaming * IO, plus JVM can create lots of anon VM_EXEC folios, * so we ignore them here. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { nr_rotated += folio_nr_pages(folio); list_add(&folio->lru, &l_active); continue; } } folio_clear_active(folio); /* we are de-activating */ folio_set_workingset(folio); list_add(&folio->lru, &l_inactive); } /* * Move folios back to the lru list. */ spin_lock_irq(&lruvec->lru_lock); nr_activate = move_folios_to_lru(lruvec, &l_active); nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); __count_vm_events(PGDEACTIVATE, nr_deactivate); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&lruvec->lru_lock); if (nr_rotated) lru_note_cost(lruvec, file, 0, nr_rotated); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, bool ignore_references) { struct reclaim_stat dummy_stat; unsigned int nr_reclaimed; struct folio *folio; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, .no_demotion = 1, }; nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } return nr_reclaimed; } unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references) { int nid; unsigned int nr_reclaimed = 0; LIST_HEAD(node_folio_list); unsigned int noreclaim_flag; if (list_empty(folio_list)) return nr_reclaimed; noreclaim_flag = memalloc_noreclaim_save(); nid = folio_nid(lru_to_folio(folio_list)); do { struct folio *folio = lru_to_folio(folio_list); if (nid == folio_nid(folio)) { folio_clear_active(folio); list_move(&folio->lru, &node_folio_list); continue; } nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references); memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { if (sc->may_deactivate & (1 << is_file_lru(lru))) shrink_active_list(nr_to_scan, lruvec, sc, lru); else sc->skipped_deactivate = 1; return 0; } return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } /* * The inactive anon list should be small enough that the VM never has * to do too much work. * * The inactive file list should be small enough to leave most memory * to the established workingset on the scan-resistant active list, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive * folio has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the folios are kept on the inactive list. * * total target max * memory ratio inactive * ------------------------------------- * 10MB 1 5MB * 100MB 1 50MB * 1GB 3 250MB * 10GB 10 0.9GB * 100GB 31 3GB * 1TB 101 10GB * 10TB 320 32GB */ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; unsigned long inactive_ratio; unsigned long gb; inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) inactive_ratio = int_sqrt(10 * gb); else inactive_ratio = 1; return inactive * inactive_ratio < active; } enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE, }; static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) { unsigned long file; struct lruvec *target_lruvec; if (lru_gen_enabled()) return; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); /* * Flush the memory cgroup stats, so that we read accurate per-memcg * lruvec stats for heuristics. */ mem_cgroup_flush_stats(sc->target_mem_cgroup); /* * Determine the scan balance between anon and file LRUs. */ spin_lock_irq(&target_lruvec->lru_lock); sc->anon_cost = target_lruvec->anon_cost; sc->file_cost = target_lruvec->file_cost; spin_unlock_irq(&target_lruvec->lru_lock); /* * Target desirable inactive:active list ratios for the anon * and file LRU lists. */ if (!sc->force_deactivate) { unsigned long refaults; /* * When refaults are being observed, it means a new * workingset is being established. Deactivate to get * rid of any stale active pages quickly. */ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) sc->may_deactivate |= DEACTIVATE_ANON; else sc->may_deactivate &= ~DEACTIVATE_ANON; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) sc->may_deactivate |= DEACTIVATE_FILE; else sc->may_deactivate &= ~DEACTIVATE_FILE; } else sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; /* * If we have plenty of inactive file pages that aren't * thrashing, try to reclaim those first before touching * anonymous pages. */ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) && !sc->no_cache_trim_mode) sc->cache_trim_mode = 1; else sc->cache_trim_mode = 0; /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip * the scan balance towards the file LRU. And as the file LRU * shrinks, so does the window for rotation from references. * This means we have a runaway feedback loop where a tiny * thrashing file LRU becomes infinitely more attractive than * anon pages. Try to detect this based on file LRU size. */ if (!cgroup_reclaim(sc)) { unsigned long total_high_wmark = 0; unsigned long free, anon; int z; free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); file = node_page_state(pgdat, NR_ACTIVE_FILE) + node_page_state(pgdat, NR_INACTIVE_FILE); for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = &pgdat->node_zones[z]; if (!managed_zone(zone)) continue; total_high_wmark += high_wmark_pages(zone); } /* * Consider anon: if that's low too, this isn't a * runaway file reclaim problem, but rather just * extreme pressure. Reclaim as per usual then. */ anon = node_page_state(pgdat, NR_INACTIVE_ANON); sc->file_is_tiny = file + free <= total_high_wmark && !(sc->may_deactivate & DEACTIVATE_ANON) && anon >> sc->priority; } } /* * Determine how aggressively the anon and file LRU lists should be * scanned. * * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ enum scan_balance scan_balance; unsigned long ap, fp; enum lru_list lru; /* If we have no swap space, do not bother scanning anon folios. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; } /* * Global reclaim will swap to prevent OOM even with no * swappiness, but memcg users want to use this knob to * disable swapping for individual groups completely when * using the memory controller's swap limit feature would be * too expensive. */ if (cgroup_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } /* * Do not apply any pressure balancing cleverness when the * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } /* * If the system is almost out of file pages, force-scan anon. */ if (sc->file_is_tiny) { scan_balance = SCAN_ANON; goto out; } /* * If there is enough inactive page cache, we do not reclaim * anything from the anonymous working right now. */ if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; goto out; } scan_balance = SCAN_FRACT; /* * Calculate the pressure balance between anon and file pages. * * The amount of pressure we put on each LRU is inversely * proportional to the cost of reclaiming each list, as * determined by the share of pages that are refaulting, times * the relative IO cost of bringing back a swapped out * anonymous page vs reloading a filesystem page (swappiness). * * Although we limit that influence to ensure no list gets * left behind completely: at least a third of the pressure is * applied, before swappiness. * * With swappiness at 100, anon and file have equal IO cost. */ total_cost = sc->anon_cost + sc->file_cost; anon_cost = total_cost + sc->anon_cost; file_cost = total_cost + sc->file_cost; total_cost = anon_cost + file_cost; ap = swappiness * (total_cost + 1); ap /= anon_cost + 1; fp = (200 - swappiness) * (total_cost + 1); fp /= file_cost + 1; fraction[0] = ap; fraction[1] = fp; denominator = ap + fp; out: for_each_evictable_lru(lru) { bool file = is_file_lru(lru); unsigned long lruvec_size; unsigned long low, min; unsigned long scan; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); mem_cgroup_protection(sc->target_mem_cgroup, memcg, &min, &low); if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min * setting. * * This is important, as otherwise scanning aggression * becomes extremely binary -- from nothing as we * approach the memory protection threshold, to totally * nominal as we exceed it. This results in requiring * setting extremely liberal protection thresholds. It * also means we simply get no protection at all if we * set it too low, which is not ideal. * * If there is any protection in place, we reduce scan * pressure by how much of the total memory used is * within protection thresholds. * * There is one special case: in the first reclaim pass, * we skip over all groups that are within their low * protection. If that fails to reclaim enough pages to * satisfy the reclaim goal, we come back and override * the best-effort low protection. However, we still * ideally want to honor how well-behaved groups are in * that case instead of simply punishing them all * equally. As such, we reclaim them based on how much * memory they are using, reducing the scan pressure * again by how much of the total memory used is under * hard protection. */ unsigned long cgroup_size = mem_cgroup_size(memcg); unsigned long protection; /* memory.low scaling, make sure we retry before OOM */ if (!sc->memcg_low_reclaim && low > min) { protection = low; sc->memcg_low_skipped = 1; } else { protection = min; } /* Avoid TOCTOU with earlier protection check */ cgroup_size = max(cgroup_size, protection); scan = lruvec_size - lruvec_size * protection / (cgroup_size + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep * reclaim moving forwards, avoiding decrementing * sc->priority further than desirable. */ scan = max(scan, SWAP_CLUSTER_MAX); } else { scan = lruvec_size; } scan >>= sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: /* Scan lists relative to size */ break; case SCAN_FRACT: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. * Make sure we don't miss the last page on * the offlined memory cgroups because of a * round-off error. */ scan = mem_cgroup_online(memcg) ? div64_u64(scan * fraction[file], denominator) : DIV64_U64_ROUND_UP(scan * fraction[file], denominator); break; case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) scan = 0; break; default: /* Look ma, no brain */ BUG(); } nr[lru] = scan; } } /* * Anonymous LRU management is a waste if there is * ultimately no way to reclaim the memory. */ static bool can_age_anon_pages(struct pglist_data *pgdat, struct scan_control *sc) { /* Aging the anon LRU is valuable if swap is present: */ if (total_swap_pages > 0) return true; /* Also valuable if anon pages can be demoted: */ return can_demote(pgdat->node_id, sc); } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) #else DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) #endif static bool should_walk_mmu(void) { return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK); } static bool should_clear_pmd_young(void) { return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG); } /****************************************************************************** * shorthand helpers ******************************************************************************/ #define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) #define DEFINE_MAX_SEQ(lruvec) \ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) #define DEFINE_MIN_SEQ(lruvec) \ unsigned long min_seq[ANON_AND_FILE] = { \ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) { struct pglist_data *pgdat = NODE_DATA(nid); #ifdef CONFIG_MEMCG if (memcg) { struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; /* see the comment in mem_cgroup_lruvec() */ if (!lruvec->pgdat) lruvec->pgdat = pgdat; return lruvec; } #endif VM_WARN_ON_ONCE(!mem_cgroup_disabled()); return &pgdat->__lruvec; } static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) { struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); if (!sc->may_swap) return 0; if (!can_demote(pgdat->node_id, sc) && mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) return 0; return mem_cgroup_swappiness(memcg); } static int get_nr_gens(struct lruvec *lruvec, int type) { return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; } static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { /* see the comment on lru_gen_folio */ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; } /****************************************************************************** * Bloom filters ******************************************************************************/ /* * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of * bits in a bitmap, k is the number of hash functions and n is the number of * inserted items. * * Page table walkers use one of the two filters to reduce their search space. * To get rid of non-leaf entries that no longer have enough leaf entries, the * aging uses the double-buffering technique to flip to the other filter each * time it produces a new generation. For non-leaf entries that have enough * leaf entries, the aging carries them over to the next generation in * walk_pmd_range(); the eviction also report them when walking the rmap * in lru_gen_look_around(). * * For future optimizations: * 1. It's not necessary to keep both filters all the time. The spare one can be * freed after the RCU grace period and reallocated if needed again. * 2. And when reallocating, it's worth scaling its size according to the number * of inserted entries in the other filter, to reduce the memory overhead on * small systems and false positives on large systems. * 3. Jenkins' hash function is an alternative to Knuth's. */ #define BLOOM_FILTER_SHIFT 15 static inline int filter_gen_from_seq(unsigned long seq) { return seq % NR_BLOOM_FILTERS; } static void get_item_key(void *item, int *key) { u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); key[1] = hash >> BLOOM_FILTER_SHIFT; } static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return true; get_item_key(item, key); return test_bit(key[0], filter) && test_bit(key[1], filter); } static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return; get_item_key(item, key); if (!test_bit(key[0], filter)) set_bit(key[0], filter); if (!test_bit(key[1], filter)) set_bit(key[1], filter); } static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq) { unsigned long *filter; int gen = filter_gen_from_seq(seq); filter = mm_state->filters[gen]; if (filter) { bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); return; } filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); WRITE_ONCE(mm_state->filters[gen], filter); } /****************************************************************************** * mm_struct list ******************************************************************************/ #ifdef CONFIG_LRU_GEN_WALKS_MMU static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { static struct lru_gen_mm_list mm_list = { .fifo = LIST_HEAD_INIT(mm_list.fifo), .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), }; #ifdef CONFIG_MEMCG if (memcg) return &memcg->mm_list; #endif VM_WARN_ON_ONCE(!mem_cgroup_disabled()); return &mm_list; } static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) { return &lruvec->mm_state; } static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) { int key; struct mm_struct *mm; struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) return NULL; clear_bit(key, &mm->lru_gen.bitmap); return mmget_not_zero(mm) ? mm : NULL; } void lru_gen_add_mm(struct mm_struct *mm) { int nid; struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); #ifdef CONFIG_MEMCG VM_WARN_ON_ONCE(mm->lru_gen.memcg); mm->lru_gen.memcg = memcg; #endif spin_lock(&mm_list->lock); for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* the first addition since the last iteration */ if (mm_state->tail == &mm_list->fifo) mm_state->tail = &mm->lru_gen.list; } list_add_tail(&mm->lru_gen.list, &mm_list->fifo); spin_unlock(&mm_list->lock); } void lru_gen_del_mm(struct mm_struct *mm) { int nid; struct lru_gen_mm_list *mm_list; struct mem_cgroup *memcg = NULL; if (list_empty(&mm->lru_gen.list)) return; #ifdef CONFIG_MEMCG memcg = mm->lru_gen.memcg; #endif mm_list = get_mm_list(memcg); spin_lock(&mm_list->lock); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* where the current iteration continues after */ if (mm_state->head == &mm->lru_gen.list) mm_state->head = mm_state->head->prev; /* where the last iteration ended before */ if (mm_state->tail == &mm->lru_gen.list) mm_state->tail = mm_state->tail->next; } list_del_init(&mm->lru_gen.list); spin_unlock(&mm_list->lock); #ifdef CONFIG_MEMCG mem_cgroup_put(mm->lru_gen.memcg); mm->lru_gen.memcg = NULL; #endif } #ifdef CONFIG_MEMCG void lru_gen_migrate_mm(struct mm_struct *mm) { struct mem_cgroup *memcg; struct task_struct *task = rcu_dereference_protected(mm->owner, true); VM_WARN_ON_ONCE(task->mm != mm); lockdep_assert_held(&task->alloc_lock); /* for mm_update_next_owner() */ if (mem_cgroup_disabled()) return; /* migration can happen before addition */ if (!mm->lru_gen.memcg) return; rcu_read_lock(); memcg = mem_cgroup_from_task(task); rcu_read_unlock(); if (memcg == mm->lru_gen.memcg) return; VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); lru_gen_del_mm(mm); lru_gen_add_mm(mm); } #endif #else /* !CONFIG_LRU_GEN_WALKS_MMU */ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { return NULL; } static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) { return NULL; } static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) { return NULL; } #endif static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last) { int i; int hist; struct lruvec *lruvec = walk->lruvec; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); hist = lru_hist_from_seq(walk->seq); for (i = 0; i < NR_MM_STATS; i++) { WRITE_ONCE(mm_state->stats[hist][i], mm_state->stats[hist][i] + walk->mm_stats[i]); walk->mm_stats[i] = 0; } if (NR_HIST_GENS > 1 && last) { hist = lru_hist_from_seq(walk->seq + 1); for (i = 0; i < NR_MM_STATS; i++) WRITE_ONCE(mm_state->stats[hist][i], 0); } } static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter) { bool first = false; bool last = false; struct mm_struct *mm = NULL; struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* * mm_state->seq is incremented after each iteration of mm_list. There * are three interesting cases for this page table walker: * 1. It tries to start a new iteration with a stale max_seq: there is * nothing left to do. * 2. It started the next iteration: it needs to reset the Bloom filter * so that a fresh set of PTE tables can be recorded. * 3. It ended the current iteration: it needs to reset the mm stats * counters and tell its caller to increment max_seq. */ spin_lock(&mm_list->lock); VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq); if (walk->seq <= mm_state->seq) goto done; if (!mm_state->head) mm_state->head = &mm_list->fifo; if (mm_state->head == &mm_list->fifo) first = true; do { mm_state->head = mm_state->head->next; if (mm_state->head == &mm_list->fifo) { WRITE_ONCE(mm_state->seq, mm_state->seq + 1); last = true; break; } /* force scan for those added after the last iteration */ if (!mm_state->tail || mm_state->tail == mm_state->head) { mm_state->tail = mm_state->head->next; walk->force_scan = true; } } while (!(mm = get_next_mm(walk))); done: if (*iter || last) reset_mm_stats(walk, last); spin_unlock(&mm_list->lock); if (mm && first) reset_bloom_filter(mm_state, walk->seq + 1); if (*iter) mmput_async(*iter); *iter = mm; return last; } static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq) { bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); spin_lock(&mm_list->lock); VM_WARN_ON_ONCE(mm_state->seq + 1 < seq); if (seq > mm_state->seq) { mm_state->head = NULL; mm_state->tail = NULL; WRITE_ONCE(mm_state->seq, mm_state->seq + 1); success = true; } spin_unlock(&mm_list->lock); return success; } /****************************************************************************** * PID controller ******************************************************************************/ /* * A feedback loop based on Proportional-Integral-Derivative (PID) controller. * * The P term is refaulted/(evicted+protected) from a tier in the generation * currently being evicted; the I term is the exponential moving average of the * P term over the generations previously evicted, using the smoothing factor * 1/2; the D term isn't supported. * * The setpoint (SP) is always the first tier of one type; the process variable * (PV) is either any tier of the other type or any other tier of the same * type. * * The error is the difference between the SP and the PV; the correction is to * turn off protection when SP>PV or turn on protection when SP<PV. * * For future optimizations: * 1. The D term may discount the other two terms over time so that long-lived * generations can resist stale information. */ struct ctrl_pos { unsigned long refaulted; unsigned long total; int gain; }; static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); pos->refaulted = lrugen->avg_refaulted[type][tier] + atomic_long_read(&lrugen->refaulted[hist][type][tier]); pos->total = lrugen->avg_total[type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); if (tier) pos->total += lrugen->protected[hist][type][tier - 1]; pos->gain = gain; } static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) { int hist, tier; struct lru_gen_folio *lrugen = &lruvec->lrugen; bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; lockdep_assert_held(&lruvec->lru_lock); if (!carryover && !clear) return; hist = lru_hist_from_seq(seq); for (tier = 0; tier < MAX_NR_TIERS; tier++) { if (carryover) { unsigned long sum; sum = lrugen->avg_refaulted[type][tier] + atomic_long_read(&lrugen->refaulted[hist][type][tier]); WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); sum = lrugen->avg_total[type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); if (tier) sum += lrugen->protected[hist][type][tier - 1]; WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); } if (clear) { atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); atomic_long_set(&lrugen->evicted[hist][type][tier], 0); if (tier) WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); } } } static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) { /* * Return true if the PV has a limited number of refaults or a lower * refaulted/total than the SP. */ return pv->refaulted < MIN_LRU_BATCH || pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= (sp->refaulted + 1) * pv->total * pv->gain; } /****************************************************************************** * the aging ******************************************************************************/ /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(!rcu_read_lock_held()); do { /* lru_gen_del_folio() has isolated this page? */ if (!(old_flags & LRU_GEN_MASK)) { /* for shrink_folio_list() */ new_flags = old_flags | BIT(PG_referenced); continue; } new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } /* protect pages accessed multiple times through file descriptors */ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); unsigned long new_flags, old_flags = READ_ONCE(folio->flags); VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); do { new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; /* folio_update_gen() has promoted this page? */ if (new_gen >= 0 && new_gen != old_gen) return new_gen; new_gen = (old_gen + 1) % MAX_NR_GENS; new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); return new_gen; } static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); walk->batched++; walk->nr_pages[old_gen][type][zone] -= delta; walk->nr_pages[new_gen][type][zone] += delta; } static void reset_batch_size(struct lru_gen_mm_walk *walk) { int gen, type, zone; struct lruvec *lruvec = walk->lruvec; struct lru_gen_folio *lrugen = &lruvec->lrugen; walk->batched = 0; for_each_gen_type_zone(gen, type, zone) { enum lru_list lru = type * LRU_INACTIVE_FILE; int delta = walk->nr_pages[gen][type][zone]; if (!delta) continue; walk->nr_pages[gen][type][zone] = 0; WRITE_ONCE(lrugen->nr_pages[gen][type][zone], lrugen->nr_pages[gen][type][zone] + delta); if (lru_gen_is_active(lruvec, gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); } } static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) { struct address_space *mapping; struct vm_area_struct *vma = args->vma; struct lru_gen_mm_walk *walk = args->private; if (!vma_is_accessible(vma)) return true; if (is_vm_hugetlb_page(vma)) return true; if (!vma_has_recency(vma)) return true; if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) return true; if (vma == get_gate_vma(vma->vm_mm)) return true; if (vma_is_anonymous(vma)) return !walk->can_swap; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; mapping = vma->vm_file->f_mapping; if (mapping_unevictable(mapping)) return true; if (shmem_mapping(mapping)) return !walk->can_swap; /* to exclude special mappings like dax, etc. */ return !mapping->a_ops->read_folio; } /* * Some userspace memory allocators map many single-page VMAs. Instead of * returning back to the PGD table for each of such VMAs, finish an entire PMD * table to reduce zigzags and improve cache performance. */ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, unsigned long *vm_start, unsigned long *vm_end) { unsigned long start = round_up(*vm_end, size); unsigned long end = (start | ~mask) + 1; VMA_ITERATOR(vmi, args->mm, start); VM_WARN_ON_ONCE(mask & size); VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); for_each_vma(vmi, args->vma) { if (end && end <= args->vma->vm_start) return false; if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) continue; *vm_start = max(start, args->vma->vm_start); *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; return true; } return false; } static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) { unsigned long pfn = pte_pfn(pte); VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); if (!pte_present(pte) || is_zero_pfn(pfn)) return -1; if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) return -1; if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; return pfn; } static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) { unsigned long pfn = pmd_pfn(pmd); VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) return -1; if (WARN_ON_ONCE(pmd_devmap(pmd))) return -1; if (WARN_ON_ONCE(!pfn_valid(pfn))) return -1; return pfn; } static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, struct pglist_data *pgdat, bool can_swap) { struct folio *folio; /* try to avoid unnecessary memory loads */ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) return NULL; folio = pfn_folio(pfn); if (folio_nid(folio) != pgdat->node_id) return NULL; if (folio_memcg_rcu(folio) != memcg) return NULL; /* file VMAs can contain anon pages from COW */ if (!folio_is_file_lru(folio) && !can_swap) return NULL; return folio; } static bool suitable_to_scan(int total, int young) { int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); /* suitable if the average number of young PTEs per cacheline is >=1 */ return young * n >= total; } static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *args) { int i; pte_t *pte; spinlock_t *ptl; unsigned long addr; int total = 0; int young = 0; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int old_gen, new_gen = lru_gen_from_seq(max_seq); pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); if (!pte) return false; if (!spin_trylock(ptl)) { pte_unmap(pte); return false; } arch_enter_lazy_mmu_mode(); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; struct folio *folio; pte_t ptent = ptep_get(pte + i); total++; walk->mm_stats[MM_LEAF_TOTAL]++; pfn = get_pte_pfn(ptent, args->vma, addr); if (pfn == -1) continue; if (!pte_young(ptent)) { walk->mm_stats[MM_LEAF_OLD]++; continue; } folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); if (!folio) continue; if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) VM_WARN_ON_ONCE(true); young++; walk->mm_stats[MM_LEAF_YOUNG]++; if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); old_gen = folio_update_gen(folio, new_gen); if (old_gen >= 0 && old_gen != new_gen) update_batch_size(walk, folio, old_gen, new_gen); } if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte, ptl); return suitable_to_scan(total, young); } static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; pmd_t *pmd; spinlock_t *ptl; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int old_gen, new_gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ if (*first == -1) { *first = addr; bitmap_zero(bitmap, MIN_LRU_BATCH); return; } i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); if (i && i <= MIN_LRU_BATCH) { __set_bit(i - 1, bitmap); return; } pmd = pmd_offset(pud, *first); ptl = pmd_lockptr(args->mm, pmd); if (!spin_trylock(ptl)) goto done; arch_enter_lazy_mmu_mode(); do { unsigned long pfn; struct folio *folio; /* don't round down the first address */ addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first; pfn = get_pmd_pfn(pmd[i], vma, addr); if (pfn == -1) goto next; if (!pmd_trans_huge(pmd[i])) { if (should_clear_pmd_young()) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); if (!folio) goto next; if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) goto next; walk->mm_stats[MM_LEAF_YOUNG]++; if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); old_gen = folio_update_gen(folio, new_gen); if (old_gen >= 0 && old_gen != new_gen) update_batch_size(walk, folio, old_gen, new_gen); next: i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; } while (i <= MIN_LRU_BATCH); arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: *first = -1; } static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, struct mm_walk *args) { int i; pmd_t *pmd; unsigned long next; unsigned long addr; struct vm_area_struct *vma; DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); VM_WARN_ON_ONCE(pud_leaf(*pud)); /* * Finish an entire PMD in two passes: the first only reaches to PTE * tables to avoid taking the PMD lock; the second, if necessary, takes * the PMD lock to clear the accessed bit in PMD entries. */ pmd = pmd_offset(pud, start & PUD_MASK); restart: /* walk_pte_range() may call get_next_vma() */ vma = args->vma; for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { pmd_t val = pmdp_get_lockless(pmd + i); next = pmd_addr_end(addr, end); if (!pmd_present(val) || is_huge_zero_pmd(val)) { walk->mm_stats[MM_LEAF_TOTAL]++; continue; } if (pmd_trans_huge(val)) { unsigned long pfn = pmd_pfn(val); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); walk->mm_stats[MM_LEAF_TOTAL]++; if (!pmd_young(val)) { walk->mm_stats[MM_LEAF_OLD]++; continue; } /* try to avoid unnecessary memory loads */ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) continue; walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } walk->mm_stats[MM_NONLEAF_TOTAL]++; if (should_clear_pmd_young()) { if (!pmd_young(val)) continue; walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; if (!walk_pte_range(&val, addr, next, args)) continue; walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ update_bloom_filter(mm_state, walk->seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) goto restart; } static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, struct mm_walk *args) { int i; pud_t *pud; unsigned long addr; unsigned long next; struct lru_gen_mm_walk *walk = args->private; VM_WARN_ON_ONCE(p4d_leaf(*p4d)); pud = pud_offset(p4d, start & P4D_MASK); restart: for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { pud_t val = READ_ONCE(pud[i]); next = pud_addr_end(addr, end); if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) continue; walk_pmd_range(&val, addr, next, args); if (need_resched() || walk->batched >= MAX_LRU_BATCH) { end = (addr | ~PUD_MASK) + 1; goto done; } } if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) goto restart; end = round_up(end, P4D_SIZE); done: if (!end || !args->vma) return 1; walk->next_addr = max(end, args->vma->vm_start); return -EAGAIN; } static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) { static const struct mm_walk_ops mm_walk_ops = { .test_walk = should_skip_vma, .p4d_entry = walk_pud_range, .walk_lock = PGWALK_RDLOCK, }; int err; struct lruvec *lruvec = walk->lruvec; struct mem_cgroup *memcg = lruvec_memcg(lruvec); walk->next_addr = FIRST_USER_ADDRESS; do { DEFINE_MAX_SEQ(lruvec); err = -EBUSY; /* another thread might have called inc_max_seq() */ if (walk->seq != max_seq) break; /* folio_update_gen() requires stable folio_memcg() */ if (!mem_cgroup_trylock_pages(memcg)) break; /* the caller might be holding the lock for write */ if (mmap_read_trylock(mm)) { err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); mmap_read_unlock(mm); } mem_cgroup_unlock_pages(); if (walk->batched) { spin_lock_irq(&lruvec->lru_lock); reset_batch_size(walk); spin_unlock_irq(&lruvec->lru_lock); } cond_resched(); } while (err == -EAGAIN); } static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc) { struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; if (pgdat && current_is_kswapd()) { VM_WARN_ON_ONCE(walk); walk = &pgdat->mm_walk; } else if (!walk && force_alloc) { VM_WARN_ON_ONCE(current_is_kswapd()); walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); } current->reclaim_state->mm_walk = walk; return walk; } static void clear_mm_walk(void) { struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); current->reclaim_state->mm_walk = NULL; if (!current_is_kswapd()) kfree(walk); } static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) { int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); if (type == LRU_GEN_ANON && !can_swap) goto done; /* prevent cold/hot inversion if force_scan is true */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); if (!--remaining) return false; } } done: reset_ctrl_pos(lruvec, type, true); WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); return true; } static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) { int gen, type, zone; bool success = false; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); /* find the oldest populated generation */ for (type = !can_swap; type < ANON_AND_FILE; type++) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { gen = lru_gen_from_seq(min_seq[type]); for (zone = 0; zone < MAX_NR_ZONES; zone++) { if (!list_empty(&lrugen->folios[gen][type][zone])) goto next; } min_seq[type]++; } next: ; } /* see the comment on lru_gen_folio */ if (can_swap) { min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); } for (type = !can_swap; type < ANON_AND_FILE; type++) { if (min_seq[type] == lrugen->min_seq[type]) continue; reset_ctrl_pos(lruvec, type, true); WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); success = true; } return success; } static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { bool success; int prev, next; int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: if (seq < READ_ONCE(lrugen->max_seq)) return false; spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); success = seq == lrugen->max_seq; if (!success) goto unlock; for (type = ANON_AND_FILE - 1; type >= 0; type--) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); if (inc_min_seq(lruvec, type, can_swap)) continue; spin_unlock_irq(&lruvec->lru_lock); cond_resched(); goto restart; } /* * Update the active/inactive LRU sizes for compatibility. Both sides of * the current max_seq need to be covered, since max_seq+1 can overlap * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do * overlap, cold/hot inversion happens. */ prev = lru_gen_from_seq(lrugen->max_seq - 1); next = lru_gen_from_seq(lrugen->max_seq + 1); for (type = 0; type < ANON_AND_FILE; type++) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { enum lru_list lru = type * LRU_INACTIVE_FILE; long delta = lrugen->nr_pages[prev][type][zone] - lrugen->nr_pages[next][type][zone]; if (!delta) continue; __update_lru_size(lruvec, lru, zone, delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); } } for (type = 0; type < ANON_AND_FILE; type++) reset_ctrl_pos(lruvec, type, false); WRITE_ONCE(lrugen->timestamps[next], jiffies); /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); unlock: spin_unlock_irq(&lruvec->lru_lock); return success; } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) return inc_max_seq(lruvec, seq, can_swap, force_scan); /* see the comment in iterate_mm_list() */ if (seq <= READ_ONCE(mm_state->seq)) return false; /* * If the hardware doesn't automatically set the accessed bit, fallback * to lru_gen_look_around(), which only clears the accessed bit in a * handful of PTEs. Spreading the work out over a period of time usually * is less efficient, but it avoids bursty page faults. */ if (!should_walk_mmu()) { success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk = set_mm_walk(NULL, true); if (!walk) { success = iterate_mm_list_nowalk(lruvec, seq); goto done; } walk->lruvec = lruvec; walk->seq = seq; walk->can_swap = can_swap; walk->force_scan = force_scan; do { success = iterate_mm_list(walk, &mm); if (mm) walk_mm(mm, walk); } while (mm); done: if (success) { success = inc_max_seq(lruvec, seq, can_swap, force_scan); WARN_ON_ONCE(!success); } return success; } /****************************************************************************** * working set protection ******************************************************************************/ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; unsigned long total = 0; bool can_swap = get_swappiness(lruvec, sc); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); for (type = !can_swap; type < ANON_AND_FILE; type++) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); } } /* whether the size is big enough to be helpful */ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total; } static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) { int gen; unsigned long birth; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); /* see the comment on lru_gen_folio */ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); if (time_is_after_jiffies(birth + min_ttl)) return false; if (!lruvec_is_sizable(lruvec, sc)) return false; mem_cgroup_calculate_protection(NULL, memcg); return !mem_cgroup_below_min(NULL, memcg); } /* to protect the working set of the last N jiffies */ static unsigned long lru_gen_min_ttl __read_mostly; static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); VM_WARN_ON_ONCE(!current_is_kswapd()); /* check the order to exclude compaction-induced reclaim */ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY) return; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) { mem_cgroup_iter_break(NULL, memcg); return; } cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); /* * The main goal is to OOM kill if every generation from all memcgs is * younger than min_ttl. However, another possibility is all memcgs are * either too small or below min. */ if (mutex_trylock(&oom_lock)) { struct oom_control oc = { .gfp_mask = sc->gfp_mask, }; out_of_memory(&oc); mutex_unlock(&oom_lock); } } /****************************************************************************** * rmap/PT walk feedback ******************************************************************************/ /* * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If * the scan was done cacheline efficiently, it adds the PMD entry pointing to * the PTE table to the Bloom filter. This forms a feedback loop between the * eviction and the aging. */ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; int young = 0; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); bool can_swap = !folio_is_file_lru(folio); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); int old_gen, new_gen = lru_gen_from_seq(max_seq); lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); if (spin_is_contended(pvmw->ptl)) return; /* exclude special VMAs containing anon pages from COW */ if (vma->vm_flags & VM_SPECIAL) return; /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; start = max(addr & PMD_MASK, vma->vm_start); end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1; if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) start = end - MIN_LRU_BATCH * PAGE_SIZE; else { start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; } } /* folio_update_gen() requires stable folio_memcg() */ if (!mem_cgroup_trylock_pages(memcg)) return; arch_enter_lazy_mmu_mode(); pte -= (addr - start) / PAGE_SIZE; for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; pte_t ptent = ptep_get(pte + i); pfn = get_pte_pfn(ptent, vma, addr); if (pfn == -1) continue; if (!pte_young(ptent)) continue; folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); if (!folio) continue; if (!ptep_test_and_clear_young(vma, addr, pte + i)) VM_WARN_ON_ONCE(true); young++; if (pte_dirty(ptent) && !folio_test_dirty(folio) && !(folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio))) folio_mark_dirty(folio); if (walk) { old_gen = folio_update_gen(folio, new_gen); if (old_gen >= 0 && old_gen != new_gen) update_batch_size(walk, folio, old_gen, new_gen); continue; } old_gen = folio_lru_gen(folio); if (old_gen < 0) folio_set_referenced(folio); else if (old_gen != new_gen) folio_activate(folio); } arch_leave_lazy_mmu_mode(); mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) update_bloom_filter(mm_state, max_seq, pvmw->pmd); } /****************************************************************************** * memcg LRU ******************************************************************************/ /* see the comment on MEMCG_NR_GENS */ enum { MEMCG_LRU_NOP, MEMCG_LRU_HEAD, MEMCG_LRU_TAIL, MEMCG_LRU_OLD, MEMCG_LRU_YOUNG, }; static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) { int seg; int old, new; unsigned long flags; int bin = get_random_u32_below(MEMCG_NR_BINS); struct pglist_data *pgdat = lruvec_pgdat(lruvec); spin_lock_irqsave(&pgdat->memcg_lru.lock, flags); VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); seg = 0; new = old = lruvec->lrugen.gen; /* see the comment on MEMCG_NR_GENS */ if (op == MEMCG_LRU_HEAD) seg = MEMCG_LRU_HEAD; else if (op == MEMCG_LRU_TAIL) seg = MEMCG_LRU_TAIL; else if (op == MEMCG_LRU_OLD) new = get_memcg_gen(pgdat->memcg_lru.seq); else if (op == MEMCG_LRU_YOUNG) new = get_memcg_gen(pgdat->memcg_lru.seq + 1); else VM_WARN_ON_ONCE(true); WRITE_ONCE(lruvec->lrugen.seg, seg); WRITE_ONCE(lruvec->lrugen.gen, new); hlist_nulls_del_rcu(&lruvec->lrugen.list); if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); else hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); pgdat->memcg_lru.nr_memcgs[old]--; pgdat->memcg_lru.nr_memcgs[new]++; if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags); } #ifdef CONFIG_MEMCG void lru_gen_online_memcg(struct mem_cgroup *memcg) { int gen; int nid; int bin = get_random_u32_below(MEMCG_NR_BINS); for_each_node(nid) { struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); spin_lock_irq(&pgdat->memcg_lru.lock); VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); gen = get_memcg_gen(pgdat->memcg_lru.seq); lruvec->lrugen.gen = gen; hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); pgdat->memcg_lru.nr_memcgs[gen]++; spin_unlock_irq(&pgdat->memcg_lru.lock); } } void lru_gen_offline_memcg(struct mem_cgroup *memcg) { int nid; for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); } } void lru_gen_release_memcg(struct mem_cgroup *memcg) { int gen; int nid; for_each_node(nid) { struct pglist_data *pgdat = NODE_DATA(nid); struct lruvec *lruvec = get_lruvec(memcg, nid); spin_lock_irq(&pgdat->memcg_lru.lock); if (hlist_nulls_unhashed(&lruvec->lrugen.list)) goto unlock; gen = lruvec->lrugen.gen; hlist_nulls_del_init_rcu(&lruvec->lrugen.list); pgdat->memcg_lru.nr_memcgs[gen]--; if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); unlock: spin_unlock_irq(&pgdat->memcg_lru.lock); } } void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); /* see the comment on MEMCG_NR_GENS */ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); } #endif /* CONFIG_MEMCG */ /****************************************************************************** * the eviction ******************************************************************************/ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, int tier_idx) { bool success; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); int tier = lru_tier_from_refs(refs); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); /* unevictable */ if (!folio_evictable(folio)) { success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); folio_set_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGCULLED, delta); return true; } /* dirty lazyfree */ if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); folio_set_swapbacked(folio); lruvec_add_folio_tail(lruvec, folio); return true; } /* promoted */ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } /* protected */ if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) { int hist = lru_hist_from_seq(lrugen->min_seq[type]); gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); WRITE_ONCE(lrugen->protected[hist][type][tier - 1], lrugen->protected[hist][type][tier - 1] + delta); return true; } /* ineligible */ if (zone > sc->reclaim_idx || skip_cma(folio, sc)) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } /* waiting for writeback */ if (folio_test_locked(folio) || folio_test_writeback(folio) || (type == LRU_GEN_FILE && folio_test_dirty(folio))) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } return false; } static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) { bool success; /* swap constrained */ if (!(sc->gfp_mask & __GFP_IO) && (folio_test_dirty(folio) || (folio_test_anon(folio) && !folio_test_swapcache(folio)))) return false; /* raced with release_pages() */ if (!folio_try_get(folio)) return false; /* raced with another isolation */ if (!folio_test_clear_lru(folio)) { folio_put(folio); return false; } /* see the comment on MAX_NR_TIERS */ if (!folio_test_referenced(folio)) set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); folio_clear_referenced(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); return true; } static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int type, int tier, struct list_head *list) { int i; int gen; enum vm_event_item item; int sorted = 0; int scanned = 0; int isolated = 0; int skipped = 0; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); VM_WARN_ON_ONCE(!list_empty(list)); if (get_nr_gens(lruvec, type) == MIN_NR_GENS) return 0; gen = lru_gen_from_seq(lrugen->min_seq[type]); for (i = MAX_NR_ZONES; i > 0; i--) { LIST_HEAD(moved); int skipped_zone = 0; int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; struct list_head *head = &lrugen->folios[gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); int delta = folio_nr_pages(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); scanned += delta; if (sort_folio(lruvec, folio, sc, tier)) sorted += delta; else if (isolate_folio(lruvec, folio, sc)) { list_add(&folio->lru, list); isolated += delta; } else { list_move(&folio->lru, &moved); skipped_zone += delta; } if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH) break; } if (skipped_zone) { list_splice(&moved, head); __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone); skipped += skipped_zone; } if (!remaining || isolated >= MIN_LRU_BATCH) break; } item = PGSCAN_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) { __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); } __count_memcg_events(memcg, item, isolated); __count_memcg_events(memcg, PGREFILL, sorted); __count_vm_events(PGSCAN_ANON + type, isolated); trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, scanned, skipped, isolated, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); /* * There might not be eligible folios due to reclaim_idx. Check the * remaining to prevent livelock if it's not making progress. */ return isolated || !remaining ? scanned : 0; } static int get_tier_idx(struct lruvec *lruvec, int type) { int tier; struct ctrl_pos sp, pv; /* * To leave a margin for fluctuations, use a larger gain factor (1:2). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier. */ read_ctrl_pos(lruvec, type, 0, 1, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { read_ctrl_pos(lruvec, type, tier, 2, &pv); if (!positive_ctrl_err(&sp, &pv)) break; } return tier - 1; } static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) { int type, tier; struct ctrl_pos sp, pv; int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; /* * Compare the first tier of anon with that of file to determine which * type to scan. Also need to compare other tiers of the selected type * with the first tier of the other type to determine the last tier (of * the selected type) to evict. */ read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); type = positive_ctrl_err(&sp, &pv); read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { read_ctrl_pos(lruvec, type, tier, gain[type], &pv); if (!positive_ctrl_err(&sp, &pv)) break; } *tier_idx = tier - 1; return type; } static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; int type; int scanned; int tier = -1; DEFINE_MIN_SEQ(lruvec); /* * Try to make the obvious choice first, and if anon and file are both * available from the same generation, * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon * first. * 2. If !__GFP_IO, file first since clean pagecache is more likely to * exist than clean swapcache. */ if (!swappiness) type = LRU_GEN_FILE; else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) type = LRU_GEN_ANON; else if (swappiness == 1) type = LRU_GEN_FILE; else if (swappiness == 200) type = LRU_GEN_ANON; else if (!(sc->gfp_mask & __GFP_IO)) type = LRU_GEN_FILE; else type = get_type_to_scan(lruvec, swappiness, &tier); for (i = !swappiness; i < ANON_AND_FILE; i++) { if (tier < 0) tier = get_tier_idx(lruvec, type); scanned = scan_folios(lruvec, sc, type, tier, list); if (scanned) break; type = !type; tier = -1; } *type_scanned = type; return scanned; } static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { int type; int scanned; int reclaimed; LIST_HEAD(list); LIST_HEAD(clean); struct folio *folio; struct folio *next; enum vm_event_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); spin_lock_irq(&lruvec->lru_lock); scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); scanned += try_to_inc_min_seq(lruvec, swappiness); if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) scanned = 0; spin_unlock_irq(&lruvec->lru_lock); if (list_empty(&list)) return scanned; retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); sc->nr_reclaimed += reclaimed; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, scanned, reclaimed, &stat, sc->priority, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } if (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio))) { /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ if (folio_test_workingset(folio)) folio_set_referenced(folio); continue; } if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || folio_mapped(folio) || folio_test_locked(folio) || folio_test_dirty(folio) || folio_test_writeback(folio)) { /* don't add rejected folios to the oldest generation */ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, BIT(PG_active)); continue; } /* retry folios that may have missed folio_rotate_reclaimable() */ list_move(&folio->lru, &clean); sc->nr_scanned -= folio_nr_pages(folio); } spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &list); walk = current->reclaim_state->mm_walk; if (walk && walk->batched) { walk->lruvec = lruvec; reset_batch_size(walk); } item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); __count_memcg_events(memcg, item, reclaimed); __count_vm_events(PGSTEAL_ANON + type, reclaimed); spin_unlock_irq(&lruvec->lru_lock); list_splice_init(&clean, &list); if (!list_empty(&list)) { skip_retry = true; goto retry; } return scanned; } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, bool can_swap, unsigned long *nr_to_scan) { int gen, type, zone; unsigned long old = 0; unsigned long young = 0; unsigned long total = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); /* whether this lruvec is completely out of cold folios */ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { *nr_to_scan = 0; return true; } for (type = !can_swap; type < ANON_AND_FILE; type++) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { unsigned long size = 0; gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); total += size; if (seq == max_seq) young += size; else if (seq + MIN_NR_GENS == max_seq) old += size; } } *nr_to_scan = total; /* * The aging tries to be lazy to reduce the overhead, while the eviction * stalls when the number of generations reaches MIN_NR_GENS. Hence, the * ideal number of generations is MIN_NR_GENS+1. */ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) return false; /* * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) * of the total number of pages for each generation. A reasonable range * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The * aging cares about the upper bound of hot pages, while the eviction * cares about the lower bound of cold pages. */ if (young * MIN_NR_GENS > total) return true; if (old * (MIN_NR_GENS + 2) < total) return true; return false; } /* * For future optimizations: * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) { bool success; unsigned long nr_to_scan; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan); /* try to scrape all its memory if this memcg was deleted */ if (nr_to_scan && !mem_cgroup_online(memcg)) return nr_to_scan; /* try to get away with not aging at the default priority */ if (!success || sc->priority == DEF_PRIORITY) return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) { int i; enum zone_watermarks mark; /* don't abort memcg reclaim to ensure fairness */ if (!root_reclaim(sc)) return false; if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) return true; /* check the order to exclude compaction-induced reclaim */ if (!current_is_kswapd() || sc->order) return false; mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ? WMARK_PROMO : WMARK_HIGH; for (i = 0; i <= sc->reclaim_idx; i++) { struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH; if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0)) return false; } /* kswapd should abort if all eligible zones are safe */ return true; } static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { long nr_to_scan; unsigned long scanned = 0; int swappiness = get_swappiness(lruvec, sc); while (true) { int delta; nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); if (nr_to_scan <= 0) break; delta = evict_folios(lruvec, sc, swappiness); if (!delta) break; scanned += delta; if (scanned >= nr_to_scan) break; if (should_abort_scan(lruvec, sc)) break; cond_resched(); } /* whether this lruvec should be rotated */ return nr_to_scan < 0; } static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) { bool success; unsigned long scanned = sc->nr_scanned; unsigned long reclaimed = sc->nr_reclaimed; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); mem_cgroup_calculate_protection(NULL, memcg); if (mem_cgroup_below_min(NULL, memcg)) return MEMCG_LRU_YOUNG; if (mem_cgroup_below_low(NULL, memcg)) { /* see the comment on MEMCG_NR_GENS */ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL) return MEMCG_LRU_TAIL; memcg_memory_event(memcg, MEMCG_LOW); } success = try_to_shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); if (!sc->proactive) vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); flush_reclaim_state(sc); if (success && mem_cgroup_online(memcg)) return MEMCG_LRU_YOUNG; if (!success && lruvec_is_sizable(lruvec, sc)) return 0; /* one retry if offlined or too small */ return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; } static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) { int op; int gen; int bin; int first_bin; struct lruvec *lruvec; struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; struct hlist_nulls_node *pos; gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); restart: op = 0; memcg = NULL; rcu_read_lock(); hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) { if (op) { lru_gen_rotate_memcg(lruvec, op); op = 0; } mem_cgroup_put(memcg); memcg = NULL; if (gen != READ_ONCE(lrugen->gen)) continue; lruvec = container_of(lrugen, struct lruvec, lrugen); memcg = lruvec_memcg(lruvec); if (!mem_cgroup_tryget(memcg)) { lru_gen_release_memcg(memcg); memcg = NULL; continue; } rcu_read_unlock(); op = shrink_one(lruvec, sc); rcu_read_lock(); if (should_abort_scan(lruvec, sc)) break; } rcu_read_unlock(); if (op) lru_gen_rotate_memcg(lruvec, op); mem_cgroup_put(memcg); if (!is_a_nulls(pos)) return; /* restart if raced with lru_gen_rotate_memcg() */ if (gen != get_nulls_value(pos)) goto restart; /* try the rest of the bins of the current generation */ bin = get_memcg_bin(bin + 1); if (bin != first_bin) goto restart; } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; VM_WARN_ON_ONCE(root_reclaim(sc)); VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap); lru_add_drain(); blk_start_plug(&plug); set_mm_walk(NULL, sc->proactive); if (try_to_shrink_lruvec(lruvec, sc)) lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG); clear_mm_walk(); blk_finish_plug(&plug); } static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc) { int priority; unsigned long reclaimable; if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) return; /* * Determine the initial priority based on * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, * where reclaimed_to_scanned_ratio = inactive / total. */ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); /* round down reclaimable and round up sc->nr_to_reclaim */ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); sc->priority = clamp(priority, 0, DEF_PRIORITY); } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { struct blk_plug plug; unsigned long reclaimed = sc->nr_reclaimed; VM_WARN_ON_ONCE(!root_reclaim(sc)); /* * Unmapped clean folios are already prioritized. Scanning for more of * them is likely futile and can cause high reclaim latency when there * is a large number of memcgs. */ if (!sc->may_writepage || !sc->may_unmap) goto done; lru_add_drain(); blk_start_plug(&plug); set_mm_walk(pgdat, sc->proactive); set_initial_priority(pgdat, sc); if (current_is_kswapd()) sc->nr_reclaimed = 0; if (mem_cgroup_disabled()) shrink_one(&pgdat->__lruvec, sc); else shrink_many(pgdat, sc); if (current_is_kswapd()) sc->nr_reclaimed += reclaimed; clear_mm_walk(); blk_finish_plug(&plug); done: /* kswapd should never fail */ pgdat->kswapd_failures = 0; } /****************************************************************************** * state change ******************************************************************************/ static bool __maybe_unused state_is_valid(struct lruvec *lruvec) { struct lru_gen_folio *lrugen = &lruvec->lrugen; if (lrugen->enabled) { enum lru_list lru; for_each_evictable_lru(lru) { if (!list_empty(&lruvec->lists[lru])) return false; } } else { int gen, type, zone; for_each_gen_type_zone(gen, type, zone) { if (!list_empty(&lrugen->folios[gen][type][zone])) return false; } } return true; } static bool fill_evictable(struct lruvec *lruvec) { enum lru_list lru; int remaining = MAX_LRU_BATCH; for_each_evictable_lru(lru) { int type = is_file_lru(lru); bool active = is_active_lru(lru); struct list_head *head = &lruvec->lists[lru]; while (!list_empty(head)) { bool success; struct folio *folio = lru_to_folio(head); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); lruvec_del_folio(lruvec, folio); success = lru_gen_add_folio(lruvec, folio, false); VM_WARN_ON_ONCE(!success); if (!--remaining) return false; } } return true; } static bool drain_evictable(struct lruvec *lruvec) { int gen, type, zone; int remaining = MAX_LRU_BATCH; for_each_gen_type_zone(gen, type, zone) { struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; while (!list_empty(head)) { bool success; struct folio *folio = lru_to_folio(head); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); success = lru_gen_del_folio(lruvec, folio, false); VM_WARN_ON_ONCE(!success); lruvec_add_folio(lruvec, folio); if (!--remaining) return false; } } return true; } static void lru_gen_change_state(bool enabled) { static DEFINE_MUTEX(state_mutex); struct mem_cgroup *memcg; cgroup_lock(); cpus_read_lock(); get_online_mems(); mutex_lock(&state_mutex); if (enabled == lru_gen_enabled()) goto unlock; if (enabled) static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); else static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { int nid; for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); VM_WARN_ON_ONCE(!state_is_valid(lruvec)); lruvec->lrugen.enabled = enabled; while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { spin_unlock_irq(&lruvec->lru_lock); cond_resched(); spin_lock_irq(&lruvec->lru_lock); } spin_unlock_irq(&lruvec->lru_lock); } cond_resched(); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); unlock: mutex_unlock(&state_mutex); put_online_mems(); cpus_read_unlock(); cgroup_unlock(); } /****************************************************************************** * sysfs interface ******************************************************************************/ static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { unsigned int msecs; if (kstrtouint(buf, 0, &msecs)) return -EINVAL; WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); return len; } static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms); static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int caps = 0; if (get_cap(LRU_GEN_CORE)) caps |= BIT(LRU_GEN_CORE); if (should_walk_mmu()) caps |= BIT(LRU_GEN_MM_WALK); if (should_clear_pmd_young()) caps |= BIT(LRU_GEN_NONLEAF_YOUNG); return sysfs_emit(buf, "0x%04x\n", caps); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { int i; unsigned int caps; if (tolower(*buf) == 'n') caps = 0; else if (tolower(*buf) == 'y') caps = -1; else if (kstrtouint(buf, 0, &caps)) return -EINVAL; for (i = 0; i < NR_LRU_GEN_CAPS; i++) { bool enabled = caps & BIT(i); if (i == LRU_GEN_CORE) lru_gen_change_state(enabled); else if (enabled) static_branch_enable(&lru_gen_caps[i]); else static_branch_disable(&lru_gen_caps[i]); } return len; } static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled); static struct attribute *lru_gen_attrs[] = { &lru_gen_min_ttl_attr.attr, &lru_gen_enabled_attr.attr, NULL }; static const struct attribute_group lru_gen_attr_group = { .name = "lru_gen", .attrs = lru_gen_attrs, }; /****************************************************************************** * debugfs interface ******************************************************************************/ static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) { struct mem_cgroup *memcg; loff_t nr_to_skip = *pos; m->private = kvmalloc(PATH_MAX, GFP_KERNEL); if (!m->private) return ERR_PTR(-ENOMEM); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { int nid; for_each_node_state(nid, N_MEMORY) { if (!nr_to_skip--) return get_lruvec(memcg, nid); } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); return NULL; } static void lru_gen_seq_stop(struct seq_file *m, void *v) { if (!IS_ERR_OR_NULL(v)) mem_cgroup_iter_break(NULL, lruvec_memcg(v)); kvfree(m->private); m->private = NULL; } static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) { int nid = lruvec_pgdat(v)->node_id; struct mem_cgroup *memcg = lruvec_memcg(v); ++*pos; nid = next_memory_node(nid); if (nid == MAX_NUMNODES) { memcg = mem_cgroup_iter(NULL, memcg, NULL); if (!memcg) return NULL; nid = first_memory_node; } return get_lruvec(memcg, nid); } static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, unsigned long seq) { int i; int type, tier; int hist = lru_hist_from_seq(seq); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); for (type = 0; type < ANON_AND_FILE; type++) { const char *s = " "; unsigned long n[3] = {}; if (seq == max_seq) { s = "RT "; n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); n[1] = READ_ONCE(lrugen->avg_total[type][tier]); } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { s = "rep"; n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); if (tier) n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); } for (i = 0; i < 3; i++) seq_printf(m, " %10lu%c", n[i], s[i]); } seq_putc(m, '\n'); } if (!mm_state) return; seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { const char *s = " "; unsigned long n = 0; if (seq == max_seq && NR_HIST_GENS == 1) { s = "LOYNFA"; n = READ_ONCE(mm_state->stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { s = "loynfa"; n = READ_ONCE(mm_state->stats[hist][i]); } seq_printf(m, " %10lu%c", n, s[i]); } seq_putc(m, '\n'); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static int lru_gen_seq_show(struct seq_file *m, void *v) { unsigned long seq; bool full = !debugfs_real_fops(m->file)->write; struct lruvec *lruvec = v; struct lru_gen_folio *lrugen = &lruvec->lrugen; int nid = lruvec_pgdat(lruvec)->node_id; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); if (nid == first_memory_node) { const char *path = memcg ? m->private : ""; #ifdef CONFIG_MEMCG if (memcg) cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); #endif seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); } seq_printf(m, " node %5d\n", nid); if (!full) seq = min_seq[LRU_GEN_ANON]; else if (max_seq >= MAX_NR_GENS) seq = max_seq - MAX_NR_GENS + 1; else seq = 0; for (; seq <= max_seq; seq++) { int type, zone; int gen = lru_gen_from_seq(seq); unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); for (type = 0; type < ANON_AND_FILE; type++) { unsigned long size = 0; char mark = full && seq < min_seq[type] ? 'x' : ' '; for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); seq_printf(m, " %10lu%c", size, mark); } seq_putc(m, '\n'); if (full) lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); } return 0; } static const struct seq_operations lru_gen_seq_ops = { .start = lru_gen_seq_start, .stop = lru_gen_seq_stop, .next = lru_gen_seq_next, .show = lru_gen_seq_show, }; static int run_aging(struct lruvec *lruvec, unsigned long seq, bool can_swap, bool force_scan) { DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); if (seq < max_seq) return 0; if (seq > max_seq) return -EINVAL; if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) return -ERANGE; try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan); return 0; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long nr_to_reclaim) { DEFINE_MAX_SEQ(lruvec); if (seq + MIN_NR_GENS > max_seq) return -EINVAL; sc->nr_reclaimed = 0; while (!signal_pending(current)) { DEFINE_MIN_SEQ(lruvec); if (seq < min_seq[!swappiness]) return 0; if (sc->nr_reclaimed >= nr_to_reclaim) return 0; if (!evict_folios(lruvec, sc, swappiness)) return 0; cond_resched(); } return -EINTR; } static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, struct scan_control *sc, int swappiness, unsigned long opt) { struct lruvec *lruvec; int err = -EINVAL; struct mem_cgroup *memcg = NULL; if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) return -EINVAL; if (!mem_cgroup_disabled()) { rcu_read_lock(); memcg = mem_cgroup_from_id(memcg_id); if (!mem_cgroup_tryget(memcg)) memcg = NULL; rcu_read_unlock(); if (!memcg) return -EINVAL; } if (memcg_id != mem_cgroup_id(memcg)) goto done; lruvec = get_lruvec(memcg, nid); if (swappiness < 0) swappiness = get_swappiness(lruvec, sc); else if (swappiness > 200) goto done; switch (cmd) { case '+': err = run_aging(lruvec, seq, swappiness, opt); break; case '-': err = run_eviction(lruvec, seq, sc, swappiness, opt); break; } done: mem_cgroup_put(memcg); return err; } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, size_t len, loff_t *pos) { void *buf; char *cur, *next; unsigned int flags; struct blk_plug plug; int err = -EINVAL; struct scan_control sc = { .may_writepage = true, .may_unmap = true, .may_swap = true, .reclaim_idx = MAX_NR_ZONES - 1, .gfp_mask = GFP_KERNEL, }; buf = kvmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_user(buf, src, len)) { kvfree(buf); return -EFAULT; } set_task_reclaim_state(current, &sc.reclaim_state); flags = memalloc_noreclaim_save(); blk_start_plug(&plug); if (!set_mm_walk(NULL, true)) { err = -ENOMEM; goto done; } next = buf; next[len] = '\0'; while ((cur = strsep(&next, ",;\n"))) { int n; int end; char cmd; unsigned int memcg_id; unsigned int nid; unsigned long seq; unsigned int swappiness = -1; unsigned long opt = -1; cur = skip_spaces(cur); if (!*cur) continue; n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, &seq, &end, &swappiness, &end, &opt, &end); if (n < 4 || cur[end]) { err = -EINVAL; break; } err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); if (err) break; } done: clear_mm_walk(); blk_finish_plug(&plug); memalloc_noreclaim_restore(flags); set_task_reclaim_state(current, NULL); kvfree(buf); return err ? : len; } static int lru_gen_seq_open(struct inode *inode, struct file *file) { return seq_open(file, &lru_gen_seq_ops); } static const struct file_operations lru_gen_rw_fops = { .open = lru_gen_seq_open, .read = seq_read, .write = lru_gen_seq_write, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations lru_gen_ro_fops = { .open = lru_gen_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; /****************************************************************************** * initialization ******************************************************************************/ void lru_gen_init_pgdat(struct pglist_data *pgdat) { int i, j; spin_lock_init(&pgdat->memcg_lru.lock); for (i = 0; i < MEMCG_NR_GENS; i++) { for (j = 0; j < MEMCG_NR_BINS; j++) INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i); } } void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; int gen, type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); for (i = 0; i <= MIN_NR_GENS + 1; i++) lrugen->timestamps[i] = jiffies; for_each_gen_type_zone(gen, type, zone) INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); if (mm_state) mm_state->seq = MIN_NR_GENS; } #ifdef CONFIG_MEMCG void lru_gen_init_memcg(struct mem_cgroup *memcg) { struct lru_gen_mm_list *mm_list = get_mm_list(memcg); if (!mm_list) return; INIT_LIST_HEAD(&mm_list->fifo); spin_lock_init(&mm_list->lock); } void lru_gen_exit_memcg(struct mem_cgroup *memcg) { int i; int nid; struct lru_gen_mm_list *mm_list = get_mm_list(memcg); VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo)); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); lruvec->lrugen.list.next = LIST_POISON1; if (!mm_state) continue; for (i = 0; i < NR_BLOOM_FILTERS; i++) { bitmap_free(mm_state->filters[i]); mm_state->filters[i] = NULL; } } } #endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) { BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) pr_err("lru_gen: failed to create sysfs group\n"); debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); return 0; }; late_initcall(init_lru_gen); #else /* !CONFIG_LRU_GEN */ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { BUILD_BUG(); } static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { BUILD_BUG(); } static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc) { BUILD_BUG(); } #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; bool proportional_reclaim; struct blk_plug plug; if (lru_gen_enabled() && !root_reclaim(sc)) { lru_gen_shrink_lruvec(lruvec, sc); return; } get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); /* * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal * event that can occur when there is little memory pressure e.g. * multiple streaming readers/writers. Hence, we do not abort scanning * when the requested number of pages are reclaimed when scanning at * DEF_PRIORITY on the assumption that the fact we are direct * reclaiming implies that kswapd is not keeping up and it is best to * do a batch of work at once. For memcg reclaim one check is made to * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { unsigned long nr_anon, nr_file, percentage; unsigned long nr_scanned; for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, sc); } } cond_resched(); if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) continue; /* * For kswapd and memcg, reclaim at least the number of pages * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. */ nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; /* * It's just vindictive to attack the larger once the smaller * has gone to zero. And given the way we stop scanning the * smaller below, this makes sure that we only make one nudge * towards proportionality once we've got nr_to_reclaim. */ if (!nr_file || !nr_anon) break; if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; lru = LRU_BASE; percentage = nr_anon * 100 / scan_target; } else { unsigned long scan_target = targets[LRU_INACTIVE_FILE] + targets[LRU_ACTIVE_FILE] + 1; lru = LRU_FILE; percentage = nr_file * 100 / scan_target; } /* Stop scanning the smaller of the LRU */ nr[lru] = 0; nr[lru + LRU_ACTIVE] = 0; /* * Recalculate the other LRU scan count based on its original * scan target and the percentage scanning already complete */ lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); lru += LRU_ACTIVE; nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; return false; } /* * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; int z; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) return false; /* * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX * number of pages that were scanned. This will return to the caller * with the risk reclaim/compaction and the resulting allocation attempt * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL * allocations through requiring that the full LRU list has been scanned * first, by assuming that zero delta of sc->nr_scanned means full LRU * scan, but that approximation was wrong, and there were corner cases * where always a non-zero amount of pages were scanned. */ if (!nr_reclaimed) return false; /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { struct zone *zone = &pgdat->node_zones[z]; if (!managed_zone(zone)) continue; /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) return false; if (compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; } /* * If we have not reclaimed enough pages for compaction and the * inactive lists are large enough, continue reclaiming */ pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); return inactive_lru_pages > pages_for_compaction; } static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; struct mem_cgroup *memcg; memcg = mem_cgroup_iter(target_memcg, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long reclaimed; unsigned long scanned; /* * This loop can become CPU-bound when target memcgs * aren't eligible for reclaim - either because they * don't have any reclaimable pages, or because their * memory is explicitly protected. Avoid soft lockups. */ cond_resched(); mem_cgroup_calculate_protection(target_memcg, memcg); if (mem_cgroup_below_min(target_memcg, memcg)) { /* * Hard protection. * If there is no reclaimable memory, OOM. */ continue; } else if (mem_cgroup_below_low(target_memcg, memcg)) { /* * Soft protection. * Respect the protection only as long as * there is an unprotected supply * of reclaimable memory from other cgroups. */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); /* Record the group's reclaim efficiency */ if (!sc->proactive) vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); } static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed; struct lruvec *target_lruvec; bool reclaimable = false; if (lru_gen_enabled() && root_reclaim(sc)) { lru_gen_shrink_node(pgdat, sc); return; } target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; prepare_scan_control(pgdat, sc); shrink_node_memcgs(pgdat, sc); flush_reclaim_state(sc); nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed; /* Record the subtree's reclaim efficiency */ if (!sc->proactive) vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, nr_node_reclaimed); if (nr_node_reclaimed) reclaimable = true; if (current_is_kswapd()) { /* * If reclaim is isolating dirty pages under writeback, * it implies that the long-lived page allocation rate * is exceeding the page laundering rate. Either the * global limits are not being effective at throttling * processes due to the page distribution throughout * zones or there is heavy usage of a slow backing * device. The only option is to throttle from reclaim * context which is not ideal as there is no guarantee * the dirtying process is throttled in the same way * balance_dirty_pages() manages. * * Once a node is flagged PGDAT_WRITEBACK, kswapd will * count the number of pages under pages flagged for * immediate reclaim and stall if any are encountered * in the nr_immediate check below. */ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* Allow kswapd to start writing pages during reclaim.*/ if (sc->nr.unqueued_dirty == sc->nr.file_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); /* * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it * implies that pages are cycling through the LRU * faster than they are written so forcibly stall * until some pages complete writeback. */ if (sc->nr.immediate) reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); } /* * Tag a node/memcg as congested if all the dirty pages were marked * for writeback and immediate reclaim (counted in nr.congested). * * Legacy memcg will stall in page writeback so avoid forcibly * stalling in reclaim_throttle(). */ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) { if (cgroup_reclaim(sc) && writeback_throttling_sane(sc)) set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags); if (current_is_kswapd()) set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); } /* * Stall direct reclaim for IO completions if the lruvec is * node is congested. Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. */ if (!current_is_kswapd() && current_may_throttle() && !sc->hibernation_mode && (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) || test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags))) reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc)) goto again; /* * Kswapd gives up on balancing particular nodes after too * many failures to reclaim anything from them and goes to * sleep. On reclaim progress, reset the failure counter. A * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) pgdat->kswapd_failures = 0; else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } /* * Returns true if compaction should go ahead for a costly-order request, or * the allocation would already succeed without compaction. Return false if we * should reclaim first. */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; if (!gfp_compaction_allowed(sc->gfp_mask)) return false; /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) return true; /* Compaction cannot yet proceed. Do reclaim. */ if (!compaction_suitable(zone, sc->order, sc->reclaim_idx)) return false; /* * Compaction is already possible, but it takes time to run and there * are potentially other callers using the pages just freed. So proceed * with reclaim to make a buffer of free pages available to give * compaction a reasonable chance of completing and allocating the page. * Note that we won't actually reclaim the whole buffer in one attempt * as the target watermark in should_continue_reclaim() is lower. But if * we are already above the high+gap watermark, don't reclaim at all. */ watermark = high_wmark_pages(zone) + compact_gap(sc->order); return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); } static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) { /* * If reclaim is making progress greater than 12% efficiency then * wake all the NOPROGRESS throttled tasks. */ if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; if (waitqueue_active(wqh)) wake_up(wqh); return; } /* * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages * under writeback and marked for immediate reclaim at the tail of the * LRU. */ if (current_is_kswapd() || cgroup_reclaim(sc)) return; /* Throttle if making no progress at high prioities. */ if (sc->priority == 1 && !sc->nr_reclaimed) reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); } /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; gfp_t orig_mask; pg_data_t *last_pgdat = NULL; pg_data_t *first_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; sc->reclaim_idx = gfp_zone(sc->gfp_mask); } for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { /* * Take care memory controller reclaiming has small influence * to global LRU. */ if (!cgroup_reclaim(sc)) { if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) continue; /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. * Even though compaction is invoked for any * non-zero order, only frequent costly order * reclamation is disruptive enough to become a * noticeable problem, like transparent huge * page allocations. */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && compaction_ready(zone, sc)) { sc->compaction_ready = true; continue; } /* * Shrink each node in the zonelist once. If the * zonelist is ordered by zone (not the default) then a * node may be shrunk multiple times but in that case * the user prefers lower zones being preserved. */ if (zone->zone_pgdat == last_pgdat) continue; /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and * scanned pages. This works for global memory pressure * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, sc->order, sc->gfp_mask, &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } if (!first_pgdat) first_pgdat = zone->zone_pgdat; /* See comment about same check for global reclaim above */ if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; shrink_node(zone->zone_pgdat, sc); } if (first_pgdat) consider_reclaim_throttle(first_pgdat, sc); /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; } static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { struct lruvec *target_lruvec; unsigned long refaults; if (lru_gen_enabled()) return; target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[WORKINGSET_ANON] = refaults; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); target_lruvec->refaults[WORKINGSET_FILE] = refaults; } /* * This is the main entry point to direct page reclaim. * * If a full scan of the inactive list fails to free enough memory then we * are "out of memory" and something needs to be killed. * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this * caller can't do much about. We kick the writeback threads and take explicit * naps in the hope that some of these pages can be written. But if the * allocating task holds filesystem locks which prevent writeout this might not * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; pg_data_t *last_pgdat; struct zoneref *z; struct zone *zone; retry: delayacct_freepages_start(); if (!cgroup_reclaim(sc)) __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { if (!sc->proactive) vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; shrink_zones(zonelist, sc); if (sc->nr_reclaimed >= sc->nr_to_reclaim) break; if (sc->compaction_ready) break; /* * If we're getting trouble reclaiming, start doing * writepage even in laptop mode. */ if (sc->priority < DEF_PRIORITY - 2) sc->may_writepage = 1; } while (--sc->priority >= 0); last_pgdat = NULL; for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); if (cgroup_reclaim(sc)) { struct lruvec *lruvec; lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); } } delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; /* Aborted reclaim to try compaction? don't OOM, then */ if (sc->compaction_ready) return 1; /* * We make inactive:active ratio decisions based on the node's * composition of memory, but a restrictive reclaim_idx or a * memory.low cgroup setting can exempt large amounts of * memory from reclaim. Neither of which are very common, so * instead of doing costly eligibility calculations of the * entire cgroup subtree up front, we assume the estimates are * good, and retry with forcible deactivation if that fails. */ if (sc->skipped_deactivate) { sc->priority = initial_priority; sc->force_deactivate = 1; sc->skipped_deactivate = 0; goto retry; } /* Untapped cgroup reserves? Don't OOM, retry. */ if (sc->memcg_low_skipped) { sc->priority = initial_priority; sc->force_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; } return 0; } static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; unsigned long free_pages = 0; int i; bool wmark_ok; if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!managed_zone(zone)) continue; if (!zone_reclaimable_pages(zone)) continue; pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES); } /* If there are no reserves (unexpected config) then do not throttle */ if (!pfmemalloc_reserve) return true; wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } return wmark_ok; } /* * Throttle direct reclaimers if backing storage is backed by the network * and the PFMEMALLOC reserve for the preferred node is getting dangerously * depleted. kswapd will continue to make progress and wake the processes * when the low watermark is reached. * * Returns true if a fatal signal was delivered during throttling. If this * happens, the page allocator should not consider triggering the OOM killer. */ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { struct zoneref *z; struct zone *zone; pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly * responsible for cleaning pages necessary for reclaim to make forward * progress. kjournald for example may enter direct reclaim while * committing a transaction where throttling it could forcing other * processes to block on log_wait_commit(). */ if (current->flags & PF_KTHREAD) goto out; /* * If a fatal signal is pending, this process should not throttle. * It should return quickly so it can exit and free its memory */ if (fatal_signal_pending(current)) goto out; /* * Check if the pfmemalloc reserves are ok by finding the first node * with a usable ZONE_NORMAL or lower zone. The expectation is that * GFP_KERNEL will be required for allocating network buffers when * swapping over the network so ZONE_HIGHMEM is unusable. * * Throttling is based on the first usable node and throttled processes * wait on a queue until kswapd makes progress and wakes them. There * is an affinity then between processes waking up and where reclaim * progress has been made assuming the process wakes on the same node. * More importantly, processes running on remote nodes will not compete * for remote pfmemalloc reserves and processes on different nodes * should make reasonable progress. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { if (zone_idx(zone) > ZONE_NORMAL) continue; /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; if (allow_direct_reclaim(pgdat)) goto out; break; } /* If no zone was usable by the allocation flags then do not throttle */ if (!pgdat) goto out; /* Account for the throttling */ count_vm_event(PGSCAN_DIRECT_THROTTLE); /* * If the caller cannot enter the filesystem, it's possible that it * is due to the caller holding an FS lock or performing a journal * transaction in the case of a filesystem like ext[3|4]. In this case, * it is not safe to block on pfmemalloc_wait as kswapd could be * blocked waiting on the same lock. Instead, throttle for up to a * second before continuing. */ if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat), HZ); else /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat)); if (fatal_signal_pending(current)) return true; out: return false; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, }; /* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. */ BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); /* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this * point. */ if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); set_task_reclaim_state(current, NULL); return nr_reclaimed; } #ifdef CONFIG_MEMCG /* Only used by soft limit reclaim. Do not reuse for anything else. */ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, unsigned long *nr_scanned) { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; WARN_ON_ONCE(!current->reclaim_state); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, sc.gfp_mask); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. * if we don't reclaim here, the shrink_node from balance_pgdat * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ shrink_lruvec(lruvec, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put * equal pressure on all the nodes. This is based on the assumption that * the reclaim does not bail out early. */ struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); set_task_reclaim_state(current, NULL); return nr_reclaimed; } #endif static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; struct lruvec *lruvec; if (lru_gen_enabled()) { lru_gen_age_node(pgdat, sc); return; } if (!can_age_anon_pages(pgdat, sc)) return; lruvec = mem_cgroup_lruvec(NULL, pgdat); if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) return; memcg = mem_cgroup_iter(NULL, NULL, NULL); do { lruvec = mem_cgroup_lruvec(memcg, pgdat); shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); memcg = mem_cgroup_iter(NULL, memcg, NULL); } while (memcg); } static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; /* * Check for watermark boosts top-down as the higher zones * are more likely to be boosted. Both watermarks and boosts * should not be checked at the same time as reclaim would * start prematurely when there is no boosting and a lower * zone is balanced. */ for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; if (zone->watermark_boost) return true; } return false; } /* * Returns true if there is an eligible zone balanced for the request order * and highest_zoneidx */ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; struct zone *zone; /* * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) mark = wmark_pages(zone, WMARK_PROMO); else mark = high_wmark_pages(zone); if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } /* * If a node has no managed zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ if (mark == -1) return true; return false; } /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags); clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } /* * Prepare kswapd for sleeping. This verifies that there are no processes * waiting in throttle_direct_reclaim() and that watermarks have been met. * * Returns true if kswapd is ready to sleep */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the * zones, which causes kswapd to exit balance_pgdat() before reaching * the wake up checks. If kswapd is going to sleep, no process should * be sleeping on pfmemalloc_wait, so wake them now if necessary. If * the wake up is premature, processes will wake kswapd and get * throttled again. The difference from wake ups in balance_pgdat() is * that here we are under prepare_to_wait(). */ if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } return false; } /* * kswapd shrinks a node of pages that are at or below the highest usable * zone that is currently unbalanced. * * Returns true if kswapd scanned at least the requested number of pages to * reclaim or if the lack of progress was due to pages under writeback. * This is used to determine if the scanning priority needs to be raised. */ static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct zone *zone; int z; /* Reclaim a number of pages proportional to the number of zones */ sc->nr_to_reclaim = 0; for (z = 0; z <= sc->reclaim_idx; z++) { zone = pgdat->node_zones + z; if (!managed_zone(zone)) continue; sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); } /* * Historically care was taken to put equal pressure on all zones but * now pressure is applied based on node LRU order. */ shrink_node(pgdat, sc); /* * Fragmentation may mean that the system cannot be rebalanced for * high-order allocations. If twice the allocation size has been * reclaimed then recheck watermarks only at order-0 to prevent * excessive reclaim. Assume that a process requested a high-order * can direct reclaim/compact. */ if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) sc->order = 0; return sc->nr_scanned >= sc->nr_to_reclaim; } /* Page allocator PCP high watermark is lowered if reclaim is active. */ static inline void update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) { int i; struct zone *zone; for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; if (active) set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); else clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); } } static inline void set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { update_reclaim_active(pgdat, highest_zoneidx, true); } static inline void clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) { update_reclaim_active(pgdat, highest_zoneidx, false); } /* * For kswapd, balance_pgdat() will reclaim pages across a node from zones * that are eligible for use by the caller until at least one zone is * balanced. * * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long pflags; unsigned long nr_boost_reclaim; unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; bool boosted; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, .may_unmap = 1, }; set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); __fs_reclaim_acquire(_THIS_IP_); count_vm_event(PAGEOUTRUN); /* * Account for the reclaim boost. Note that the zone boost is left in * place so that parallel allocations that are near the watermark will * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; nr_boost_reclaim += zone->watermark_boost; zone_boosts[i] = zone->watermark_boost; } boosted = nr_boost_reclaim; restart: set_reclaim_active(pgdat, highest_zoneidx); sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; bool balanced; bool ret; bool was_frozen; sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed * then consider reclaiming from all zones. This has a dual * purpose -- on 64-bit systems it is expected that * buffer_heads are stripped during active rotation. On 32-bit * systems, highmem pages can pin lowmem memory and shrinking * buffers can relieve lowmem pressure. Reclaim may still not * go ahead if all eligible zones for the original allocation * request are balanced to avoid excessive reclaim from kswapd. */ if (buffer_heads_over_limit) { for (i = MAX_NR_ZONES - 1; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; sc.reclaim_idx = i; break; } } /* * If the pgdat is imbalanced then ignore boosting and preserve * the watermarks for a later time and restart. Note that the * zone watermarks will be still reset at the end of balancing * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. */ balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; } /* * If boosting is not active then only reclaim if there are no * eligible zones. Note that sc.reclaim_idx is not used as * buffer_heads_over_limit may have adjusted it. */ if (!nr_boost_reclaim && balanced) goto out; /* Limit the priority of boosting to avoid reclaim writeback */ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) raise_priority = false; /* * Do not writeback or swap pages for boosted reclaim. The * intent is to relieve pressure not issue sub-optimal IO * from reclaim context. If no pages are reclaimed, the * reclaim will be aborted. */ sc.may_writepage = !laptop_mode && !nr_boost_reclaim; sc.may_swap = !nr_boost_reclaim; /* * Do some background aging, to give pages a chance to be * referenced before reclaiming. All pages are rotated * regardless of classzone as this is about consistent aging. */ kswapd_age_node(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ sc.nr_scanned = 0; nr_soft_scanned = 0; nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, sc.gfp_mask, &nr_soft_scanned); sc.nr_reclaimed += nr_soft_reclaimed; /* * There should be no need to raise the scanning priority if * enough pages are already being scanned that that high * watermark would be met at 100% efficiency. */ if (kswapd_shrink_node(pgdat, &sc)) raise_priority = false; /* * If the low watermark is met there is no need for processes * to be throttled on pfmemalloc_wait as they should not be * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ __fs_reclaim_release(_THIS_IP_); ret = kthread_freezable_should_stop(&was_frozen); __fs_reclaim_acquire(_THIS_IP_); if (was_frozen || ret) break; /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); /* * If reclaim made no progress for a boost, stop reclaim as * IO cannot be queued and it could be an infinite loop in * extreme circumstances. */ if (nr_boost_reclaim && !nr_reclaimed) break; if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); /* * Restart only if it went through the priority loop all the way, * but cache_trim_mode didn't work. */ if (!sc.nr_reclaimed && sc.priority < 1 && !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) { sc.no_cache_trim_mode = 1; goto restart; } if (!sc.nr_reclaimed) pgdat->kswapd_failures++; out: clear_reclaim_active(pgdat, highest_zoneidx); /* If reclaim was boosted, account for the reclaim done in this pass */ if (boosted) { unsigned long flags; for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; /* Increments are under the zone lock */ zone = pgdat->node_zones + i; spin_lock_irqsave(&zone->lock, flags); zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); spin_unlock_irqrestore(&zone->lock, flags); } /* * As there is now likely space, wakeup kcompact to defragment * pageblocks. */ wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); __fs_reclaim_release(_THIS_IP_); psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller * entered the allocator slow path while kswapd was awake, order will * remain at the higher level. */ return sc.order; } /* * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is * not a valid index then either kswapd runs for first time or kswapd couldn't * sleep after previous reclaim attempt (node is still unbalanced). In that * case return the zone index of the previous kswapd reclaim cycle. */ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, enum zone_type prev_highest_zoneidx) { enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); if (freezing(current) || kthread_should_stop()) return; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* * Try to sleep for a short interval. Note that kcompactd will only be * woken if it is possible to sleep for a short interval. This is * deliberate on the assumption that if reclaim cannot keep an * eligible zone balanced that it's also unlikely that compaction will * succeed. */ if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. * When kswapd is going to sleep, it is reasonable to assume * that pages and compaction may succeed so reset the cache. */ reset_isolation_suitable(pgdat); /* * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { WRITE_ONCE(pgdat->kswapd_highest_zoneidx, kswapd_highest_zoneidx(pgdat, highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); } finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); } /* * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ if (!remaining && prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* * vmstat counters are not perfectly accurate and the estimated * value for counters such as NR_FREE_PAGES can deviate from the * true value by nr_online_cpus * threshold. To avoid the zone * watermarks being breached while under pressure, we reduce the * per-cpu vmstat threshold while kswapd is awake and restore * them before going back to sleep. */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); if (!kthread_should_stop()) schedule(); set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); else count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); } finish_wait(&pgdat->kswapd_wait, &wait); } /* * The background pageout daemon, started as a kernel thread * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity * that frees anything up. This is needed for things like routing * etc, where we otherwise might have all activity going on in * asynchronous contexts that cannot page things out. * * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); /* * Tell the memory management that we're a "memory allocator", * and that if we need more memory we should get access to it * regardless (see "__alloc_pages()"). "kswapd" should * never get caught in the normal page freeing logic. * * (Kswapd normally doesn't need memory anyway, but sometimes * you need a small amount of memory in order to be able to * page out something else, and this flag essentially protects * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). */ tsk->flags |= PF_MEMALLOC | PF_KSWAPD; set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); atomic_set(&pgdat->nr_writeback_throttled, 0); for ( ; ; ) { bool was_frozen; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, highest_zoneidx); /* Read the new order and highest_zoneidx */ alloc_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); if (kthread_freezable_should_stop(&was_frozen)) break; /* * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ if (was_frozen) continue; /* * Reclaim begins at the requested order but if a high-order * reclaim fails then kswapd falls back to reclaiming for * order-0. If that happens, kswapd will consider sleeping * for the order it finished reclaiming at (reclaim_order) * but kcompactd is woken to compact for the original * request (alloc_order). */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); reclaim_order = balance_pgdat(pgdat, alloc_order, highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); return 0; } /* * A zone is low on free memory or too fragmented for high-order memory. If * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim * has failed or is not needed, still wake up kcompactd if only compaction is * needed. */ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; if (!managed_zone(zone)) return; if (!cpuset_zone_allowed(zone, gfp_flags)) return; pgdat = zone->zone_pgdat; curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd * and rely on compaction_suitable() to determine if it's * needed. If it fails, it will defer subsequent attempts to * ratelimit its work. */ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, .reclaim_idx = MAX_NR_ZONES - 1, .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, .hibernation_mode = 1, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); unsigned long nr_reclaimed; unsigned int noreclaim_flag; fs_reclaim_acquire(sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(current, &sc.reclaim_state); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); set_task_reclaim_state(current, NULL); memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ /* * This kswapd start function will be called by init and node-hot-add. */ void __meminit kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); pgdat_kswapd_lock(pgdat); if (!pgdat->kswapd) { pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ pr_err("Failed to start kswapd on node %d,ret=%ld\n", nid, PTR_ERR(pgdat->kswapd)); BUG_ON(system_state < SYSTEM_RUNNING); pgdat->kswapd = NULL; } } pgdat_kswapd_unlock(pgdat); } /* * Called by memory hotplug when all memory in a node is offlined. Caller must * be holding mem_hotplug_begin/done(). */ void __meminit kswapd_stop(int nid) { pg_data_t *pgdat = NODE_DATA(nid); struct task_struct *kswapd; pgdat_kswapd_lock(pgdat); kswapd = pgdat->kswapd; if (kswapd) { kthread_stop(kswapd); pgdat->kswapd = NULL; } pgdat_kswapd_unlock(pgdat); } static int __init kswapd_init(void) { int nid; swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); return 0; } module_init(kswapd_init) #ifdef CONFIG_NUMA /* * Node reclaim mode * * If non-zero call node_reclaim when the number of free pages falls below * the watermarks. */ int node_reclaim_mode __read_mostly; /* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. */ #define NODE_RECLAIM_PRIORITY 4 /* * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = 1; /* * If the number of slab pages in a zone grows beyond this percentage then * slab reclaim needs to occur. */ int sysctl_min_slab_ratio = 5; static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) { unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + node_page_state(pgdat, NR_ACTIVE_FILE); /* * It's possible for there to be more file mapped pages than * accounted for by the pages on the file LRU lists because * tmpfs pages accounted for as ANON can also be FILE_MAPPED */ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { unsigned long nr_pagecache_reclaimable; unsigned long delta = 0; /* * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and node_unmapped_file_pages() provides * a better estimate */ if (node_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); else nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); /* If we can't clean pages, remove dirty pages from consideration */ if (!(node_reclaim_mode & RECLAIM_WRITE)) delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) delta = nr_pagecache_reclaimable; return nr_pagecache_reclaimable - delta; } /* * Try to free up some pages from this node through reclaim. */ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, .reclaim_idx = gfp_zone(gfp_mask), }; unsigned long pflags; trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, sc.gfp_mask); cond_resched(); psi_memstall_enter(&pflags); delayacct_freepages_start(); fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP */ noreclaim_flag = memalloc_noreclaim_save(); set_task_reclaim_state(p, &sc.reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { /* * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. */ do { shrink_node(pgdat, &sc); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } set_task_reclaim_state(p, NULL); memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); psi_memstall_leave(&pflags); delayacct_freepages_end(); trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); return sc.nr_reclaimed >= nr_pages; } int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { int ret; /* * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately * thrown out if the node is overallocated. So we do not reclaim * if less than a specified percentage of the node is used by * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return NODE_RECLAIM_NOSCAN; /* * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible. */ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) return NODE_RECLAIM_NOSCAN; if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) return NODE_RECLAIM_NOSCAN; ret = __node_reclaim(pgdat, gfp_mask, order); clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); return ret; } #endif /** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list * @fbatch: Batch of lru folios to check. * * Checks folios for evictability, if an evictable folio is in the unevictable * lru list, moves it to the appropriate evictable lru list. This function * should be only used for lru folios. */ void check_move_unevictable_folios(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < fbatch->nr; i++) { struct folio *folio = fbatch->folios[i]; int nr_pages = folio_nr_pages(folio); pgscanned += nr_pages; /* block memcg migration while the folio moves between lrus */ if (!folio_test_clear_lru(folio)) continue; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (folio_evictable(folio) && folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); pgrescued += nr_pages; } folio_set_lru(folio); } if (lruvec) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); unlock_page_lruvec_irq(lruvec); } else if (pgscanned) { count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } } EXPORT_SYMBOL_GPL(check_move_unevictable_folios); |
1 1 1 1 1 1 5 5 5 5 4 5 5 5 5 5 5 5 4 4 5 1 1 1 1 1 5 5 5 5 5 5 5 5 5 1 5 4 4 4 4 5 5 5 5 5 5 5 3 4 1 5 5 5 1 1 1 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 | // SPDX-License-Identifier: GPL-2.0-only /* * net/sunrpc/rpc_pipe.c * * Userland/kernel interface for rpcauth_gss. * Code shamelessly plagiarized from fs/nfsd/nfsctl.c * and fs/sysfs/inode.c * * Copyright (c) 2002, Trond Myklebust <trond.myklebust@fys.uio.no> * */ #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/pagemap.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/namei.h> #include <linux/fsnotify.h> #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/utsname.h> #include <asm/ioctls.h> #include <linux/poll.h> #include <linux/wait.h> #include <linux/seq_file.h> #include <linux/sunrpc/clnt.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/cache.h> #include <linux/nsproxy.h> #include <linux/notifier.h> #include "netns.h" #include "sunrpc.h" #define RPCDBG_FACILITY RPCDBG_DEBUG #define NET_NAME(net) ((net == &init_net) ? " (init_net)" : "") static struct file_system_type rpc_pipe_fs_type; static const struct rpc_pipe_ops gssd_dummy_pipe_ops; static struct kmem_cache *rpc_inode_cachep __read_mostly; #define RPC_UPCALL_TIMEOUT (30*HZ) static BLOCKING_NOTIFIER_HEAD(rpc_pipefs_notifier_list); int rpc_pipefs_notifier_register(struct notifier_block *nb) { return blocking_notifier_chain_register(&rpc_pipefs_notifier_list, nb); } EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_register); void rpc_pipefs_notifier_unregister(struct notifier_block *nb) { blocking_notifier_chain_unregister(&rpc_pipefs_notifier_list, nb); } EXPORT_SYMBOL_GPL(rpc_pipefs_notifier_unregister); static void rpc_purge_list(wait_queue_head_t *waitq, struct list_head *head, void (*destroy_msg)(struct rpc_pipe_msg *), int err) { struct rpc_pipe_msg *msg; if (list_empty(head)) return; do { msg = list_entry(head->next, struct rpc_pipe_msg, list); list_del_init(&msg->list); msg->errno = err; destroy_msg(msg); } while (!list_empty(head)); if (waitq) wake_up(waitq); } static void rpc_timeout_upcall_queue(struct work_struct *work) { LIST_HEAD(free_list); struct rpc_pipe *pipe = container_of(work, struct rpc_pipe, queue_timeout.work); void (*destroy_msg)(struct rpc_pipe_msg *); struct dentry *dentry; spin_lock(&pipe->lock); destroy_msg = pipe->ops->destroy_msg; if (pipe->nreaders == 0) { list_splice_init(&pipe->pipe, &free_list); pipe->pipelen = 0; } dentry = dget(pipe->dentry); spin_unlock(&pipe->lock); rpc_purge_list(dentry ? &RPC_I(d_inode(dentry))->waitq : NULL, &free_list, destroy_msg, -ETIMEDOUT); dput(dentry); } ssize_t rpc_pipe_generic_upcall(struct file *filp, struct rpc_pipe_msg *msg, char __user *dst, size_t buflen) { char *data = (char *)msg->data + msg->copied; size_t mlen = min(msg->len - msg->copied, buflen); unsigned long left; left = copy_to_user(dst, data, mlen); if (left == mlen) { msg->errno = -EFAULT; return -EFAULT; } mlen -= left; msg->copied += mlen; msg->errno = 0; return mlen; } EXPORT_SYMBOL_GPL(rpc_pipe_generic_upcall); /** * rpc_queue_upcall - queue an upcall message to userspace * @pipe: upcall pipe on which to queue given message * @msg: message to queue * * Call with an @inode created by rpc_mkpipe() to queue an upcall. * A userspace process may then later read the upcall by performing a * read on an open file for this inode. It is up to the caller to * initialize the fields of @msg (other than @msg->list) appropriately. */ int rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg) { int res = -EPIPE; struct dentry *dentry; spin_lock(&pipe->lock); if (pipe->nreaders) { list_add_tail(&msg->list, &pipe->pipe); pipe->pipelen += msg->len; res = 0; } else if (pipe->flags & RPC_PIPE_WAIT_FOR_OPEN) { if (list_empty(&pipe->pipe)) queue_delayed_work(rpciod_workqueue, &pipe->queue_timeout, RPC_UPCALL_TIMEOUT); list_add_tail(&msg->list, &pipe->pipe); pipe->pipelen += msg->len; res = 0; } dentry = dget(pipe->dentry); spin_unlock(&pipe->lock); if (dentry) { wake_up(&RPC_I(d_inode(dentry))->waitq); dput(dentry); } return res; } EXPORT_SYMBOL_GPL(rpc_queue_upcall); static inline void rpc_inode_setowner(struct inode *inode, void *private) { RPC_I(inode)->private = private; } static void rpc_close_pipes(struct inode *inode) { struct rpc_pipe *pipe = RPC_I(inode)->pipe; int need_release; LIST_HEAD(free_list); inode_lock(inode); spin_lock(&pipe->lock); need_release = pipe->nreaders != 0 || pipe->nwriters != 0; pipe->nreaders = 0; list_splice_init(&pipe->in_upcall, &free_list); list_splice_init(&pipe->pipe, &free_list); pipe->pipelen = 0; pipe->dentry = NULL; spin_unlock(&pipe->lock); rpc_purge_list(&RPC_I(inode)->waitq, &free_list, pipe->ops->destroy_msg, -EPIPE); pipe->nwriters = 0; if (need_release && pipe->ops->release_pipe) pipe->ops->release_pipe(inode); cancel_delayed_work_sync(&pipe->queue_timeout); rpc_inode_setowner(inode, NULL); RPC_I(inode)->pipe = NULL; inode_unlock(inode); } static struct inode * rpc_alloc_inode(struct super_block *sb) { struct rpc_inode *rpci; rpci = alloc_inode_sb(sb, rpc_inode_cachep, GFP_KERNEL); if (!rpci) return NULL; return &rpci->vfs_inode; } static void rpc_free_inode(struct inode *inode) { kmem_cache_free(rpc_inode_cachep, RPC_I(inode)); } static int rpc_pipe_open(struct inode *inode, struct file *filp) { struct rpc_pipe *pipe; int first_open; int res = -ENXIO; inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; first_open = pipe->nreaders == 0 && pipe->nwriters == 0; if (first_open && pipe->ops->open_pipe) { res = pipe->ops->open_pipe(inode); if (res) goto out; } if (filp->f_mode & FMODE_READ) pipe->nreaders++; if (filp->f_mode & FMODE_WRITE) pipe->nwriters++; res = 0; out: inode_unlock(inode); return res; } static int rpc_pipe_release(struct inode *inode, struct file *filp) { struct rpc_pipe *pipe; struct rpc_pipe_msg *msg; int last_close; inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; msg = filp->private_data; if (msg != NULL) { spin_lock(&pipe->lock); msg->errno = -EAGAIN; list_del_init(&msg->list); spin_unlock(&pipe->lock); pipe->ops->destroy_msg(msg); } if (filp->f_mode & FMODE_WRITE) pipe->nwriters --; if (filp->f_mode & FMODE_READ) { pipe->nreaders --; if (pipe->nreaders == 0) { LIST_HEAD(free_list); spin_lock(&pipe->lock); list_splice_init(&pipe->pipe, &free_list); pipe->pipelen = 0; spin_unlock(&pipe->lock); rpc_purge_list(&RPC_I(inode)->waitq, &free_list, pipe->ops->destroy_msg, -EAGAIN); } } last_close = pipe->nwriters == 0 && pipe->nreaders == 0; if (last_close && pipe->ops->release_pipe) pipe->ops->release_pipe(inode); out: inode_unlock(inode); return 0; } static ssize_t rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) { struct inode *inode = file_inode(filp); struct rpc_pipe *pipe; struct rpc_pipe_msg *msg; int res = 0; inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { res = -EPIPE; goto out_unlock; } msg = filp->private_data; if (msg == NULL) { spin_lock(&pipe->lock); if (!list_empty(&pipe->pipe)) { msg = list_entry(pipe->pipe.next, struct rpc_pipe_msg, list); list_move(&msg->list, &pipe->in_upcall); pipe->pipelen -= msg->len; filp->private_data = msg; msg->copied = 0; } spin_unlock(&pipe->lock); if (msg == NULL) goto out_unlock; } /* NOTE: it is up to the callback to update msg->copied */ res = pipe->ops->upcall(filp, msg, buf, len); if (res < 0 || msg->len == msg->copied) { filp->private_data = NULL; spin_lock(&pipe->lock); list_del_init(&msg->list); spin_unlock(&pipe->lock); pipe->ops->destroy_msg(msg); } out_unlock: inode_unlock(inode); return res; } static ssize_t rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *offset) { struct inode *inode = file_inode(filp); int res; inode_lock(inode); res = -EPIPE; if (RPC_I(inode)->pipe != NULL) res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len); inode_unlock(inode); return res; } static __poll_t rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) { struct inode *inode = file_inode(filp); struct rpc_inode *rpci = RPC_I(inode); __poll_t mask = EPOLLOUT | EPOLLWRNORM; poll_wait(filp, &rpci->waitq, wait); inode_lock(inode); if (rpci->pipe == NULL) mask |= EPOLLERR | EPOLLHUP; else if (filp->private_data || !list_empty(&rpci->pipe->pipe)) mask |= EPOLLIN | EPOLLRDNORM; inode_unlock(inode); return mask; } static long rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct rpc_pipe *pipe; int len; switch (cmd) { case FIONREAD: inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { inode_unlock(inode); return -EPIPE; } spin_lock(&pipe->lock); len = pipe->pipelen; if (filp->private_data) { struct rpc_pipe_msg *msg; msg = filp->private_data; len += msg->len - msg->copied; } spin_unlock(&pipe->lock); inode_unlock(inode); return put_user(len, (int __user *)arg); default: return -EINVAL; } } static const struct file_operations rpc_pipe_fops = { .owner = THIS_MODULE, .llseek = no_llseek, .read = rpc_pipe_read, .write = rpc_pipe_write, .poll = rpc_pipe_poll, .unlocked_ioctl = rpc_pipe_ioctl, .open = rpc_pipe_open, .release = rpc_pipe_release, }; static int rpc_show_info(struct seq_file *m, void *v) { struct rpc_clnt *clnt = m->private; rcu_read_lock(); seq_printf(m, "RPC server: %s\n", rcu_dereference(clnt->cl_xprt)->servername); seq_printf(m, "service: %s (%d) version %d\n", clnt->cl_program->name, clnt->cl_prog, clnt->cl_vers); seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO)); seq_printf(m, "port: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PORT)); rcu_read_unlock(); return 0; } static int rpc_info_open(struct inode *inode, struct file *file) { struct rpc_clnt *clnt = NULL; int ret = single_open(file, rpc_show_info, NULL); if (!ret) { struct seq_file *m = file->private_data; spin_lock(&file->f_path.dentry->d_lock); if (!d_unhashed(file->f_path.dentry)) clnt = RPC_I(inode)->private; if (clnt != NULL && refcount_inc_not_zero(&clnt->cl_count)) { spin_unlock(&file->f_path.dentry->d_lock); m->private = clnt; } else { spin_unlock(&file->f_path.dentry->d_lock); single_release(inode, file); ret = -EINVAL; } } return ret; } static int rpc_info_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct rpc_clnt *clnt = (struct rpc_clnt *)m->private; if (clnt) rpc_release_client(clnt); return single_release(inode, file); } static const struct file_operations rpc_info_operations = { .owner = THIS_MODULE, .open = rpc_info_open, .read = seq_read, .llseek = seq_lseek, .release = rpc_info_release, }; /* * Description of fs contents. */ struct rpc_filelist { const char *name; const struct file_operations *i_fop; umode_t mode; }; static struct inode * rpc_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); if (!inode) return NULL; inode->i_ino = get_next_ino(); inode->i_mode = mode; simple_inode_init_ts(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_fop = &simple_dir_operations; inode->i_op = &simple_dir_inode_operations; inc_nlink(inode); break; default: break; } return inode; } static int __rpc_create_common(struct inode *dir, struct dentry *dentry, umode_t mode, const struct file_operations *i_fop, void *private) { struct inode *inode; d_drop(dentry); inode = rpc_get_inode(dir->i_sb, mode); if (!inode) goto out_err; inode->i_ino = iunique(dir->i_sb, 100); if (i_fop) inode->i_fop = i_fop; if (private) rpc_inode_setowner(inode, private); d_add(dentry, inode); return 0; out_err: printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %pd\n", __FILE__, __func__, dentry); dput(dentry); return -ENOMEM; } static int __rpc_create(struct inode *dir, struct dentry *dentry, umode_t mode, const struct file_operations *i_fop, void *private) { int err; err = __rpc_create_common(dir, dentry, S_IFREG | mode, i_fop, private); if (err) return err; fsnotify_create(dir, dentry); return 0; } static int __rpc_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, const struct file_operations *i_fop, void *private) { int err; err = __rpc_create_common(dir, dentry, S_IFDIR | mode, i_fop, private); if (err) return err; inc_nlink(dir); fsnotify_mkdir(dir, dentry); return 0; } static void init_pipe(struct rpc_pipe *pipe) { pipe->nreaders = 0; pipe->nwriters = 0; INIT_LIST_HEAD(&pipe->in_upcall); INIT_LIST_HEAD(&pipe->in_downcall); INIT_LIST_HEAD(&pipe->pipe); pipe->pipelen = 0; INIT_DELAYED_WORK(&pipe->queue_timeout, rpc_timeout_upcall_queue); pipe->ops = NULL; spin_lock_init(&pipe->lock); pipe->dentry = NULL; } void rpc_destroy_pipe_data(struct rpc_pipe *pipe) { kfree(pipe); } EXPORT_SYMBOL_GPL(rpc_destroy_pipe_data); struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags) { struct rpc_pipe *pipe; pipe = kzalloc(sizeof(struct rpc_pipe), GFP_KERNEL); if (!pipe) return ERR_PTR(-ENOMEM); init_pipe(pipe); pipe->ops = ops; pipe->flags = flags; return pipe; } EXPORT_SYMBOL_GPL(rpc_mkpipe_data); static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry, umode_t mode, const struct file_operations *i_fop, void *private, struct rpc_pipe *pipe) { struct rpc_inode *rpci; int err; err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private); if (err) return err; rpci = RPC_I(d_inode(dentry)); rpci->private = private; rpci->pipe = pipe; fsnotify_create(dir, dentry); return 0; } static int __rpc_rmdir(struct inode *dir, struct dentry *dentry) { int ret; dget(dentry); ret = simple_rmdir(dir, dentry); d_drop(dentry); if (!ret) fsnotify_rmdir(dir, dentry); dput(dentry); return ret; } static int __rpc_unlink(struct inode *dir, struct dentry *dentry) { int ret; dget(dentry); ret = simple_unlink(dir, dentry); d_drop(dentry); if (!ret) fsnotify_unlink(dir, dentry); dput(dentry); return ret; } static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); rpc_close_pipes(inode); return __rpc_unlink(dir, dentry); } static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, const char *name) { struct qstr q = QSTR_INIT(name, strlen(name)); struct dentry *dentry = d_hash_and_lookup(parent, &q); if (!dentry) { dentry = d_alloc(parent, &q); if (!dentry) return ERR_PTR(-ENOMEM); } if (d_really_is_negative(dentry)) return dentry; dput(dentry); return ERR_PTR(-EEXIST); } /* * FIXME: This probably has races. */ static void __rpc_depopulate(struct dentry *parent, const struct rpc_filelist *files, int start, int eof) { struct inode *dir = d_inode(parent); struct dentry *dentry; struct qstr name; int i; for (i = start; i < eof; i++) { name.name = files[i].name; name.len = strlen(files[i].name); dentry = d_hash_and_lookup(parent, &name); if (dentry == NULL) continue; if (d_really_is_negative(dentry)) goto next; switch (d_inode(dentry)->i_mode & S_IFMT) { default: BUG(); case S_IFREG: __rpc_unlink(dir, dentry); break; case S_IFDIR: __rpc_rmdir(dir, dentry); } next: dput(dentry); } } static void rpc_depopulate(struct dentry *parent, const struct rpc_filelist *files, int start, int eof) { struct inode *dir = d_inode(parent); inode_lock_nested(dir, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); inode_unlock(dir); } static int rpc_populate(struct dentry *parent, const struct rpc_filelist *files, int start, int eof, void *private) { struct inode *dir = d_inode(parent); struct dentry *dentry; int i, err; inode_lock(dir); for (i = start; i < eof; i++) { dentry = __rpc_lookup_create_exclusive(parent, files[i].name); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_bad; switch (files[i].mode & S_IFMT) { default: BUG(); case S_IFREG: err = __rpc_create(dir, dentry, files[i].mode, files[i].i_fop, private); break; case S_IFDIR: err = __rpc_mkdir(dir, dentry, files[i].mode, NULL, private); } if (err != 0) goto out_bad; } inode_unlock(dir); return 0; out_bad: __rpc_depopulate(parent, files, start, eof); inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", __FILE__, __func__, parent); return err; } static struct dentry *rpc_mkdir_populate(struct dentry *parent, const char *name, umode_t mode, void *private, int (*populate)(struct dentry *, void *), void *args_populate) { struct dentry *dentry; struct inode *dir = d_inode(parent); int error; inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; error = __rpc_mkdir(dir, dentry, mode, NULL, private); if (error != 0) goto out_err; if (populate != NULL) { error = populate(dentry, args_populate); if (error) goto err_rmdir; } out: inode_unlock(dir); return dentry; err_rmdir: __rpc_rmdir(dir, dentry); out_err: dentry = ERR_PTR(error); goto out; } static int rpc_rmdir_depopulate(struct dentry *dentry, void (*depopulate)(struct dentry *)) { struct dentry *parent; struct inode *dir; int error; parent = dget_parent(dentry); dir = d_inode(parent); inode_lock_nested(dir, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); error = __rpc_rmdir(dir, dentry); inode_unlock(dir); dput(parent); return error; } /** * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace * communication * @parent: dentry of directory to create new "pipe" in * @name: name of pipe * @private: private data to associate with the pipe, for the caller's use * @pipe: &rpc_pipe containing input parameters * * Data is made available for userspace to read by calls to * rpc_queue_upcall(). The actual reads will result in calls to * @ops->upcall, which will be called with the file pointer, * message, and userspace buffer to copy to. * * Writes can come at any time, and do not necessarily have to be * responses to upcalls. They will result in calls to @msg->downcall. * * The @private argument passed here will be available to all these methods * from the file pointer, via RPC_I(file_inode(file))->private. */ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, void *private, struct rpc_pipe *pipe) { struct dentry *dentry; struct inode *dir = d_inode(parent); umode_t umode = S_IFIFO | 0600; int err; if (pipe->ops->upcall == NULL) umode &= ~0444; if (pipe->ops->downcall == NULL) umode &= ~0222; inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; err = __rpc_mkpipe_dentry(dir, dentry, umode, &rpc_pipe_fops, private, pipe); if (err) goto out_err; out: inode_unlock(dir); return dentry; out_err: dentry = ERR_PTR(err); printk(KERN_WARNING "%s: %s() failed to create pipe %pd/%s (errno = %d)\n", __FILE__, __func__, parent, name, err); goto out; } EXPORT_SYMBOL_GPL(rpc_mkpipe_dentry); /** * rpc_unlink - remove a pipe * @dentry: dentry for the pipe, as returned from rpc_mkpipe * * After this call, lookups will no longer find the pipe, and any * attempts to read or write using preexisting opens of the pipe will * return -EPIPE. */ int rpc_unlink(struct dentry *dentry) { struct dentry *parent; struct inode *dir; int error = 0; parent = dget_parent(dentry); dir = d_inode(parent); inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); inode_unlock(dir); dput(parent); return error; } EXPORT_SYMBOL_GPL(rpc_unlink); /** * rpc_init_pipe_dir_head - initialise a struct rpc_pipe_dir_head * @pdh: pointer to struct rpc_pipe_dir_head */ void rpc_init_pipe_dir_head(struct rpc_pipe_dir_head *pdh) { INIT_LIST_HEAD(&pdh->pdh_entries); pdh->pdh_dentry = NULL; } EXPORT_SYMBOL_GPL(rpc_init_pipe_dir_head); /** * rpc_init_pipe_dir_object - initialise a struct rpc_pipe_dir_object * @pdo: pointer to struct rpc_pipe_dir_object * @pdo_ops: pointer to const struct rpc_pipe_dir_object_ops * @pdo_data: pointer to caller-defined data */ void rpc_init_pipe_dir_object(struct rpc_pipe_dir_object *pdo, const struct rpc_pipe_dir_object_ops *pdo_ops, void *pdo_data) { INIT_LIST_HEAD(&pdo->pdo_head); pdo->pdo_ops = pdo_ops; pdo->pdo_data = pdo_data; } EXPORT_SYMBOL_GPL(rpc_init_pipe_dir_object); static int rpc_add_pipe_dir_object_locked(struct net *net, struct rpc_pipe_dir_head *pdh, struct rpc_pipe_dir_object *pdo) { int ret = 0; if (pdh->pdh_dentry) ret = pdo->pdo_ops->create(pdh->pdh_dentry, pdo); if (ret == 0) list_add_tail(&pdo->pdo_head, &pdh->pdh_entries); return ret; } static void rpc_remove_pipe_dir_object_locked(struct net *net, struct rpc_pipe_dir_head *pdh, struct rpc_pipe_dir_object *pdo) { if (pdh->pdh_dentry) pdo->pdo_ops->destroy(pdh->pdh_dentry, pdo); list_del_init(&pdo->pdo_head); } /** * rpc_add_pipe_dir_object - associate a rpc_pipe_dir_object to a directory * @net: pointer to struct net * @pdh: pointer to struct rpc_pipe_dir_head * @pdo: pointer to struct rpc_pipe_dir_object * */ int rpc_add_pipe_dir_object(struct net *net, struct rpc_pipe_dir_head *pdh, struct rpc_pipe_dir_object *pdo) { int ret = 0; if (list_empty(&pdo->pdo_head)) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); mutex_lock(&sn->pipefs_sb_lock); ret = rpc_add_pipe_dir_object_locked(net, pdh, pdo); mutex_unlock(&sn->pipefs_sb_lock); } return ret; } EXPORT_SYMBOL_GPL(rpc_add_pipe_dir_object); /** * rpc_remove_pipe_dir_object - remove a rpc_pipe_dir_object from a directory * @net: pointer to struct net * @pdh: pointer to struct rpc_pipe_dir_head * @pdo: pointer to struct rpc_pipe_dir_object * */ void rpc_remove_pipe_dir_object(struct net *net, struct rpc_pipe_dir_head *pdh, struct rpc_pipe_dir_object *pdo) { if (!list_empty(&pdo->pdo_head)) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); mutex_lock(&sn->pipefs_sb_lock); rpc_remove_pipe_dir_object_locked(net, pdh, pdo); mutex_unlock(&sn->pipefs_sb_lock); } } EXPORT_SYMBOL_GPL(rpc_remove_pipe_dir_object); /** * rpc_find_or_alloc_pipe_dir_object * @net: pointer to struct net * @pdh: pointer to struct rpc_pipe_dir_head * @match: match struct rpc_pipe_dir_object to data * @alloc: allocate a new struct rpc_pipe_dir_object * @data: user defined data for match() and alloc() * */ struct rpc_pipe_dir_object * rpc_find_or_alloc_pipe_dir_object(struct net *net, struct rpc_pipe_dir_head *pdh, int (*match)(struct rpc_pipe_dir_object *, void *), struct rpc_pipe_dir_object *(*alloc)(void *), void *data) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe_dir_object *pdo; mutex_lock(&sn->pipefs_sb_lock); list_for_each_entry(pdo, &pdh->pdh_entries, pdo_head) { if (!match(pdo, data)) continue; goto out; } pdo = alloc(data); if (!pdo) goto out; rpc_add_pipe_dir_object_locked(net, pdh, pdo); out: mutex_unlock(&sn->pipefs_sb_lock); return pdo; } EXPORT_SYMBOL_GPL(rpc_find_or_alloc_pipe_dir_object); static void rpc_create_pipe_dir_objects(struct rpc_pipe_dir_head *pdh) { struct rpc_pipe_dir_object *pdo; struct dentry *dir = pdh->pdh_dentry; list_for_each_entry(pdo, &pdh->pdh_entries, pdo_head) pdo->pdo_ops->create(dir, pdo); } static void rpc_destroy_pipe_dir_objects(struct rpc_pipe_dir_head *pdh) { struct rpc_pipe_dir_object *pdo; struct dentry *dir = pdh->pdh_dentry; list_for_each_entry(pdo, &pdh->pdh_entries, pdo_head) pdo->pdo_ops->destroy(dir, pdo); } enum { RPCAUTH_info, RPCAUTH_EOF }; static const struct rpc_filelist authfiles[] = { [RPCAUTH_info] = { .name = "info", .i_fop = &rpc_info_operations, .mode = S_IFREG | 0400, }, }; static int rpc_clntdir_populate(struct dentry *dentry, void *private) { return rpc_populate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF, private); } static void rpc_clntdir_depopulate(struct dentry *dentry) { rpc_depopulate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF); } /** * rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs * @dentry: the parent of new directory * @name: the name of new directory * @rpc_client: rpc client to associate with this directory * * This creates a directory at the given @path associated with * @rpc_clnt, which will contain a file named "info" with some basic * information about the client, together with any "pipes" that may * later be created using rpc_mkpipe(). */ struct dentry *rpc_create_client_dir(struct dentry *dentry, const char *name, struct rpc_clnt *rpc_client) { struct dentry *ret; ret = rpc_mkdir_populate(dentry, name, 0555, NULL, rpc_clntdir_populate, rpc_client); if (!IS_ERR(ret)) { rpc_client->cl_pipedir_objects.pdh_dentry = ret; rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects); } return ret; } /** * rpc_remove_client_dir - Remove a directory created with rpc_create_client_dir() * @rpc_client: rpc_client for the pipe */ int rpc_remove_client_dir(struct rpc_clnt *rpc_client) { struct dentry *dentry = rpc_client->cl_pipedir_objects.pdh_dentry; if (dentry == NULL) return 0; rpc_destroy_pipe_dir_objects(&rpc_client->cl_pipedir_objects); rpc_client->cl_pipedir_objects.pdh_dentry = NULL; return rpc_rmdir_depopulate(dentry, rpc_clntdir_depopulate); } static const struct rpc_filelist cache_pipefs_files[3] = { [0] = { .name = "channel", .i_fop = &cache_file_operations_pipefs, .mode = S_IFREG | 0600, }, [1] = { .name = "content", .i_fop = &content_file_operations_pipefs, .mode = S_IFREG | 0400, }, [2] = { .name = "flush", .i_fop = &cache_flush_operations_pipefs, .mode = S_IFREG | 0600, }, }; static int rpc_cachedir_populate(struct dentry *dentry, void *private) { return rpc_populate(dentry, cache_pipefs_files, 0, 3, private); } static void rpc_cachedir_depopulate(struct dentry *dentry) { rpc_depopulate(dentry, cache_pipefs_files, 0, 3); } struct dentry *rpc_create_cache_dir(struct dentry *parent, const char *name, umode_t umode, struct cache_detail *cd) { return rpc_mkdir_populate(parent, name, umode, NULL, rpc_cachedir_populate, cd); } void rpc_remove_cache_dir(struct dentry *dentry) { rpc_rmdir_depopulate(dentry, rpc_cachedir_depopulate); } /* * populate the filesystem */ static const struct super_operations s_ops = { .alloc_inode = rpc_alloc_inode, .free_inode = rpc_free_inode, .statfs = simple_statfs, }; #define RPCAUTH_GSSMAGIC 0x67596969 /* * We have a single directory with 1 node in it. */ enum { RPCAUTH_lockd, RPCAUTH_mount, RPCAUTH_nfs, RPCAUTH_portmap, RPCAUTH_statd, RPCAUTH_nfsd4_cb, RPCAUTH_cache, RPCAUTH_nfsd, RPCAUTH_gssd, RPCAUTH_RootEOF }; static const struct rpc_filelist files[] = { [RPCAUTH_lockd] = { .name = "lockd", .mode = S_IFDIR | 0555, }, [RPCAUTH_mount] = { .name = "mount", .mode = S_IFDIR | 0555, }, [RPCAUTH_nfs] = { .name = "nfs", .mode = S_IFDIR | 0555, }, [RPCAUTH_portmap] = { .name = "portmap", .mode = S_IFDIR | 0555, }, [RPCAUTH_statd] = { .name = "statd", .mode = S_IFDIR | 0555, }, [RPCAUTH_nfsd4_cb] = { .name = "nfsd4_cb", .mode = S_IFDIR | 0555, }, [RPCAUTH_cache] = { .name = "cache", .mode = S_IFDIR | 0555, }, [RPCAUTH_nfsd] = { .name = "nfsd", .mode = S_IFDIR | 0555, }, [RPCAUTH_gssd] = { .name = "gssd", .mode = S_IFDIR | 0555, }, }; /* * This call can be used only in RPC pipefs mount notification hooks. */ struct dentry *rpc_d_lookup_sb(const struct super_block *sb, const unsigned char *dir_name) { struct qstr dir = QSTR_INIT(dir_name, strlen(dir_name)); return d_hash_and_lookup(sb->s_root, &dir); } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); int rpc_pipefs_init_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); sn->gssd_dummy = rpc_mkpipe_data(&gssd_dummy_pipe_ops, 0); if (IS_ERR(sn->gssd_dummy)) return PTR_ERR(sn->gssd_dummy); mutex_init(&sn->pipefs_sb_lock); sn->pipe_version = -1; return 0; } void rpc_pipefs_exit_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); rpc_destroy_pipe_data(sn->gssd_dummy); } /* * This call will be used for per network namespace operations calls. * Note: Function will be returned with pipefs_sb_lock taken if superblock was * found. This lock have to be released by rpc_put_sb_net() when all operations * will be completed. */ struct super_block *rpc_get_sb_net(const struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); mutex_lock(&sn->pipefs_sb_lock); if (sn->pipefs_sb) return sn->pipefs_sb; mutex_unlock(&sn->pipefs_sb_lock); return NULL; } EXPORT_SYMBOL_GPL(rpc_get_sb_net); void rpc_put_sb_net(const struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); WARN_ON(sn->pipefs_sb == NULL); mutex_unlock(&sn->pipefs_sb_lock); } EXPORT_SYMBOL_GPL(rpc_put_sb_net); static const struct rpc_filelist gssd_dummy_clnt_dir[] = { [0] = { .name = "clntXX", .mode = S_IFDIR | 0555, }, }; static ssize_t dummy_downcall(struct file *filp, const char __user *src, size_t len) { return -EINVAL; } static const struct rpc_pipe_ops gssd_dummy_pipe_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = dummy_downcall, }; /* * Here we present a bogus "info" file to keep rpc.gssd happy. We don't expect * that it will ever use this info to handle an upcall, but rpc.gssd expects * that this file will be there and have a certain format. */ static int rpc_dummy_info_show(struct seq_file *m, void *v) { seq_printf(m, "RPC server: %s\n", utsname()->nodename); seq_printf(m, "service: foo (1) version 0\n"); seq_printf(m, "address: 127.0.0.1\n"); seq_printf(m, "protocol: tcp\n"); seq_printf(m, "port: 0\n"); return 0; } DEFINE_SHOW_ATTRIBUTE(rpc_dummy_info); static const struct rpc_filelist gssd_dummy_info_file[] = { [0] = { .name = "info", .i_fop = &rpc_dummy_info_fops, .mode = S_IFREG | 0400, }, }; /** * rpc_gssd_dummy_populate - create a dummy gssd pipe * @root: root of the rpc_pipefs filesystem * @pipe_data: pipe data created when netns is initialized * * Create a dummy set of directories and a pipe that gssd can hold open to * indicate that it is up and running. */ static struct dentry * rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) { int ret = 0; struct dentry *gssd_dentry; struct dentry *clnt_dentry = NULL; struct dentry *pipe_dentry = NULL; struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name, strlen(files[RPCAUTH_gssd].name)); /* We should never get this far if "gssd" doesn't exist */ gssd_dentry = d_hash_and_lookup(root, &q); if (!gssd_dentry) return ERR_PTR(-ENOENT); ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL); if (ret) { pipe_dentry = ERR_PTR(ret); goto out; } q.name = gssd_dummy_clnt_dir[0].name; q.len = strlen(gssd_dummy_clnt_dir[0].name); clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); if (!clnt_dentry) { __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); pipe_dentry = ERR_PTR(-ENOENT); goto out; } ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL); if (ret) { __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); pipe_dentry = ERR_PTR(ret); goto out; } pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); if (IS_ERR(pipe_dentry)) { __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1); __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); } out: dput(clnt_dentry); dput(gssd_dentry); return pipe_dentry; } static void rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) { struct dentry *clnt_dir = pipe_dentry->d_parent; struct dentry *gssd_dir = clnt_dir->d_parent; dget(pipe_dentry); __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry); __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); dput(pipe_dentry); } static int rpc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct dentry *root, *gssd_dentry; struct net *net = sb->s_fs_info; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = RPCAUTH_GSSMAGIC; sb->s_op = &s_ops; sb->s_d_op = &simple_dentry_operations; sb->s_time_gran = 1; inode = rpc_get_inode(sb, S_IFDIR | 0555); sb->s_root = root = d_make_root(inode); if (!root) return -ENOMEM; if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy); if (IS_ERR(gssd_dentry)) { __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); return PTR_ERR(gssd_dentry); } dprintk("RPC: sending pipefs MOUNT notification for net %x%s\n", net->ns.inum, NET_NAME(net)); mutex_lock(&sn->pipefs_sb_lock); sn->pipefs_sb = sb; err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_MOUNT, sb); if (err) goto err_depopulate; mutex_unlock(&sn->pipefs_sb_lock); return 0; err_depopulate: rpc_gssd_dummy_depopulate(gssd_dentry); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); sn->pipefs_sb = NULL; __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); mutex_unlock(&sn->pipefs_sb_lock); return err; } bool gssd_running(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe = sn->gssd_dummy; return pipe->nreaders || pipe->nwriters; } EXPORT_SYMBOL_GPL(gssd_running); static int rpc_fs_get_tree(struct fs_context *fc) { return get_tree_keyed(fc, rpc_fill_super, get_net(fc->net_ns)); } static void rpc_fs_free_fc(struct fs_context *fc) { if (fc->s_fs_info) put_net(fc->s_fs_info); } static const struct fs_context_operations rpc_fs_context_ops = { .free = rpc_fs_free_fc, .get_tree = rpc_fs_get_tree, }; static int rpc_init_fs_context(struct fs_context *fc) { put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(fc->net_ns->user_ns); fc->ops = &rpc_fs_context_ops; return 0; } static void rpc_kill_sb(struct super_block *sb) { struct net *net = sb->s_fs_info; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); mutex_lock(&sn->pipefs_sb_lock); if (sn->pipefs_sb != sb) { mutex_unlock(&sn->pipefs_sb_lock); goto out; } sn->pipefs_sb = NULL; dprintk("RPC: sending pipefs UMOUNT notification for net %x%s\n", net->ns.inum, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); mutex_unlock(&sn->pipefs_sb_lock); out: kill_litter_super(sb); put_net(net); } static struct file_system_type rpc_pipe_fs_type = { .owner = THIS_MODULE, .name = "rpc_pipefs", .init_fs_context = rpc_init_fs_context, .kill_sb = rpc_kill_sb, }; MODULE_ALIAS_FS("rpc_pipefs"); MODULE_ALIAS("rpc_pipefs"); static void init_once(void *foo) { struct rpc_inode *rpci = (struct rpc_inode *) foo; inode_init_once(&rpci->vfs_inode); rpci->private = NULL; rpci->pipe = NULL; init_waitqueue_head(&rpci->waitq); } int register_rpc_pipefs(void) { int err; rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; err = rpc_clients_notifier_register(); if (err) goto err_notifier; err = register_filesystem(&rpc_pipe_fs_type); if (err) goto err_register; return 0; err_register: rpc_clients_notifier_unregister(); err_notifier: kmem_cache_destroy(rpc_inode_cachep); return err; } void unregister_rpc_pipefs(void) { rpc_clients_notifier_unregister(); unregister_filesystem(&rpc_pipe_fs_type); kmem_cache_destroy(rpc_inode_cachep); } |
71 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | // SPDX-License-Identifier: GPL-2.0-or-later /* * xpress_decompress.c - A decompressor for the XPRESS compression format * (Huffman variant), which can be used in "System Compressed" files. This is * based on the code from wimlib. * * Copyright (C) 2015 Eric Biggers */ #include "decompress_common.h" #include "lib.h" #define XPRESS_NUM_SYMBOLS 512 #define XPRESS_MAX_CODEWORD_LEN 15 #define XPRESS_MIN_MATCH_LEN 3 /* This value is chosen for fast decompression. */ #define XPRESS_TABLEBITS 12 /* Reusable heap-allocated memory for XPRESS decompression */ struct xpress_decompressor { /* The Huffman decoding table */ u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]; /* An array that maps symbols to codeword lengths */ u8 lens[XPRESS_NUM_SYMBOLS]; /* Temporary space for make_huffman_decode_table() */ u16 working_space[2 * (1 + XPRESS_MAX_CODEWORD_LEN) + XPRESS_NUM_SYMBOLS]; }; /* * xpress_allocate_decompressor - Allocate an XPRESS decompressor * * Return the pointer to the decompressor on success, or return NULL and set * errno on failure. */ struct xpress_decompressor *xpress_allocate_decompressor(void) { return kmalloc(sizeof(struct xpress_decompressor), GFP_NOFS); } /* * xpress_decompress - Decompress a buffer of XPRESS-compressed data * * @decompressor: A decompressor that was allocated with * xpress_allocate_decompressor() * @compressed_data: The buffer of data to decompress * @compressed_size: Number of bytes of compressed data * @uncompressed_data: The buffer in which to store the decompressed data * @uncompressed_size: The number of bytes the data decompresses into * * Return 0 on success, or return -1 and set errno on failure. */ int xpress_decompress(struct xpress_decompressor *decompressor, const void *compressed_data, size_t compressed_size, void *uncompressed_data, size_t uncompressed_size) { struct xpress_decompressor *d = decompressor; const u8 * const in_begin = compressed_data; u8 * const out_begin = uncompressed_data; u8 *out_next = out_begin; u8 * const out_end = out_begin + uncompressed_size; struct input_bitstream is; u32 i; /* Read the Huffman codeword lengths. */ if (compressed_size < XPRESS_NUM_SYMBOLS / 2) goto invalid; for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { d->lens[i*2 + 0] = in_begin[i] & 0xF; d->lens[i*2 + 1] = in_begin[i] >> 4; } /* Build a decoding table for the Huffman code. */ if (make_huffman_decode_table(d->decode_table, XPRESS_NUM_SYMBOLS, XPRESS_TABLEBITS, d->lens, XPRESS_MAX_CODEWORD_LEN, d->working_space)) goto invalid; /* Decode the matches and literals. */ init_input_bitstream(&is, in_begin + XPRESS_NUM_SYMBOLS / 2, compressed_size - XPRESS_NUM_SYMBOLS / 2); while (out_next != out_end) { u32 sym; u32 log2_offset; u32 length; u32 offset; sym = read_huffsym(&is, d->decode_table, XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN); if (sym < 256) { /* Literal */ *out_next++ = sym; } else { /* Match */ length = sym & 0xf; log2_offset = (sym >> 4) & 0xf; bitstream_ensure_bits(&is, 16); offset = ((u32)1 << log2_offset) | bitstream_pop_bits(&is, log2_offset); if (length == 0xf) { length += bitstream_read_byte(&is); if (length == 0xf + 0xff) length = bitstream_read_u16(&is); } length += XPRESS_MIN_MATCH_LEN; if (offset > (size_t)(out_next - out_begin)) goto invalid; if (length > (size_t)(out_end - out_next)) goto invalid; out_next = lz_copy(out_next, length, offset, out_end, XPRESS_MIN_MATCH_LEN); } } return 0; invalid: return -1; } /* * xpress_free_decompressor - Free an XPRESS decompressor * * @decompressor: A decompressor that was allocated with * xpress_allocate_decompressor(), or NULL. */ void xpress_free_decompressor(struct xpress_decompressor *decompressor) { kfree(decompressor); } |
46 315 280 280 276 280 230 250 247 62 50 40 78 30 30 25 30 98 99 20 78 20 12 1 1 98 18 72 18 89 90 34 25 4 13 13 15 89 27 34 303 33 27 17 239 29 2 3 240 261 261 260 46 215 31 2 32 33 244 244 244 56 188 31 6 27 27 32 19 225 225 148 148 19 147 42 106 18 19 148 42 106 18 10 138 138 138 148 148 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/unicode.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handler routines for unicode strings */ #include <linux/types.h> #include <linux/nls.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" /* Fold the case of a unicode char, given the 16 bit value */ /* Returns folded char, or 0 if ignorable */ static inline u16 case_fold(u16 c) { u16 tmp; tmp = hfsplus_case_fold_table[c >> 8]; if (tmp) tmp = hfsplus_case_fold_table[tmp + (c & 0xff)]; else tmp = c; return tmp; } /* Compare unicode strings, return values like normal strcmp */ int hfsplus_strcasecmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2) { u16 len1, len2, c1, c2; const hfsplus_unichr *p1, *p2; len1 = be16_to_cpu(s1->length); len2 = be16_to_cpu(s2->length); p1 = s1->unicode; p2 = s2->unicode; while (1) { c1 = c2 = 0; while (len1 && !c1) { c1 = case_fold(be16_to_cpu(*p1)); p1++; len1--; } while (len2 && !c2) { c2 = case_fold(be16_to_cpu(*p2)); p2++; len2--; } if (c1 != c2) return (c1 < c2) ? -1 : 1; if (!c1 && !c2) return 0; } } /* Compare names as a sequence of 16-bit unsigned integers */ int hfsplus_strcmp(const struct hfsplus_unistr *s1, const struct hfsplus_unistr *s2) { u16 len1, len2, c1, c2; const hfsplus_unichr *p1, *p2; int len; len1 = be16_to_cpu(s1->length); len2 = be16_to_cpu(s2->length); p1 = s1->unicode; p2 = s2->unicode; for (len = min(len1, len2); len > 0; len--) { c1 = be16_to_cpu(*p1); c2 = be16_to_cpu(*p2); if (c1 != c2) return c1 < c2 ? -1 : 1; p1++; p2++; } return len1 < len2 ? -1 : len1 > len2 ? 1 : 0; } #define Hangul_SBase 0xac00 #define Hangul_LBase 0x1100 #define Hangul_VBase 0x1161 #define Hangul_TBase 0x11a7 #define Hangul_SCount 11172 #define Hangul_LCount 19 #define Hangul_VCount 21 #define Hangul_TCount 28 #define Hangul_NCount (Hangul_VCount * Hangul_TCount) static u16 *hfsplus_compose_lookup(u16 *p, u16 cc) { int i, s, e; s = 1; e = p[1]; if (!e || cc < p[s * 2] || cc > p[e * 2]) return NULL; do { i = (s + e) / 2; if (cc > p[i * 2]) s = i + 1; else if (cc < p[i * 2]) e = i - 1; else return hfsplus_compose_table + p[i * 2 + 1]; } while (s <= e); return NULL; } int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) { const hfsplus_unichr *ip; struct nls_table *nls = HFSPLUS_SB(sb)->nls; u8 *op; u16 cc, c0, c1; u16 *ce1, *ce2; int i, len, ustrlen, res, compose; op = astr; ip = ustr->unicode; ustrlen = be16_to_cpu(ustr->length); len = *len_p; ce1 = NULL; compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); while (ustrlen > 0) { c0 = be16_to_cpu(*ip++); ustrlen--; /* search for single decomposed char */ if (likely(compose)) ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0); if (ce1) cc = ce1[0]; else cc = 0; if (cc) { /* start of a possibly decomposed Hangul char */ if (cc != 0xffff) goto done; if (!ustrlen) goto same; c1 = be16_to_cpu(*ip) - Hangul_VBase; if (c1 < Hangul_VCount) { /* compose the Hangul char */ cc = (c0 - Hangul_LBase) * Hangul_VCount; cc = (cc + c1) * Hangul_TCount; cc += Hangul_SBase; ip++; ustrlen--; if (!ustrlen) goto done; c1 = be16_to_cpu(*ip) - Hangul_TBase; if (c1 > 0 && c1 < Hangul_TCount) { cc += c1; ip++; ustrlen--; } goto done; } } while (1) { /* main loop for common case of not composed chars */ if (!ustrlen) goto same; c1 = be16_to_cpu(*ip); if (likely(compose)) ce1 = hfsplus_compose_lookup( hfsplus_compose_table, c1); if (ce1) break; switch (c0) { case 0: c0 = 0x2400; break; case '/': c0 = ':'; break; } res = nls->uni2char(c0, op, len); if (res < 0) { if (res == -ENAMETOOLONG) goto out; *op = '?'; res = 1; } op += res; len -= res; c0 = c1; ip++; ustrlen--; } ce2 = hfsplus_compose_lookup(ce1, c0); if (ce2) { i = 1; while (i < ustrlen) { ce1 = hfsplus_compose_lookup(ce2, be16_to_cpu(ip[i])); if (!ce1) break; i++; ce2 = ce1; } cc = ce2[0]; if (cc) { ip += i; ustrlen -= i; goto done; } } same: switch (c0) { case 0: cc = 0x2400; break; case '/': cc = ':'; break; default: cc = c0; } done: res = nls->uni2char(cc, op, len); if (res < 0) { if (res == -ENAMETOOLONG) goto out; *op = '?'; res = 1; } op += res; len -= res; } res = 0; out: *len_p = (char *)op - astr; return res; } /* * Convert one or more ASCII characters into a single unicode character. * Returns the number of ASCII characters corresponding to the unicode char. */ static inline int asc2unichar(struct super_block *sb, const char *astr, int len, wchar_t *uc) { int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc); if (size <= 0) { *uc = '?'; size = 1; } switch (*uc) { case 0x2400: *uc = 0; break; case ':': *uc = '/'; break; } return size; } /* Decomposes a non-Hangul unicode character. */ static u16 *hfsplus_decompose_nonhangul(wchar_t uc, int *size) { int off; off = hfsplus_decompose_table[(uc >> 12) & 0xf]; if (off == 0 || off == 0xffff) return NULL; off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)]; if (!off) return NULL; off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)]; if (!off) return NULL; off = hfsplus_decompose_table[off + (uc & 0xf)]; *size = off & 3; if (*size == 0) return NULL; return hfsplus_decompose_table + (off / 4); } /* * Try to decompose a unicode character as Hangul. Return 0 if @uc is not * precomposed Hangul, otherwise return the length of the decomposition. * * This function was adapted from sample code from the Unicode Standard * Annex #15: Unicode Normalization Forms, version 3.2.0. * * Copyright (C) 1991-2018 Unicode, Inc. All rights reserved. Distributed * under the Terms of Use in http://www.unicode.org/copyright.html. */ static int hfsplus_try_decompose_hangul(wchar_t uc, u16 *result) { int index; int l, v, t; index = uc - Hangul_SBase; if (index < 0 || index >= Hangul_SCount) return 0; l = Hangul_LBase + index / Hangul_NCount; v = Hangul_VBase + (index % Hangul_NCount) / Hangul_TCount; t = Hangul_TBase + index % Hangul_TCount; result[0] = l; result[1] = v; if (t != Hangul_TBase) { result[2] = t; return 3; } return 2; } /* Decomposes a single unicode character. */ static u16 *decompose_unichar(wchar_t uc, int *size, u16 *hangul_buffer) { u16 *result; /* Hangul is handled separately */ result = hangul_buffer; *size = hfsplus_try_decompose_hangul(uc, result); if (*size == 0) result = hfsplus_decompose_nonhangul(uc, size); return result; } int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len) { int size, dsize, decompose; u16 *dstr, outlen = 0; wchar_t c; u16 dhangul[3]; decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); while (outlen < max_unistr_len && len > 0) { size = asc2unichar(sb, astr, len, &c); if (decompose) dstr = decompose_unichar(c, &dsize, dhangul); else dstr = NULL; if (dstr) { if (outlen + dsize > max_unistr_len) break; do { ustr->unicode[outlen++] = cpu_to_be16(*dstr++); } while (--dsize > 0); } else ustr->unicode[outlen++] = cpu_to_be16(c); astr += size; len -= size; } ustr->length = cpu_to_be16(outlen); if (len > 0) return -ENAMETOOLONG; return 0; } /* * Hash a string to an integer as appropriate for the HFS+ filesystem. * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) { struct super_block *sb = dentry->d_sb; const char *astr; const u16 *dstr; int casefold, decompose, size, len; unsigned long hash; wchar_t c; u16 c2; u16 dhangul[3]; casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); hash = init_name_hash(dentry); astr = str->name; len = str->len; while (len > 0) { int dsize; size = asc2unichar(sb, astr, len, &c); astr += size; len -= size; if (decompose) dstr = decompose_unichar(c, &dsize, dhangul); else dstr = NULL; if (dstr) { do { c2 = *dstr++; if (casefold) c2 = case_fold(c2); if (!casefold || c2) hash = partial_name_hash(c2, hash); } while (--dsize > 0); } else { c2 = c; if (casefold) c2 = case_fold(c2); if (!casefold || c2) hash = partial_name_hash(c2, hash); } } str->hash = end_name_hash(hash); return 0; } /* * Compare strings with HFS+ filename ordering. * Composed unicode characters are decomposed and case-folding is performed * if the appropriate bits are (un)set on the superblock. */ int hfsplus_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct super_block *sb = dentry->d_sb; int casefold, decompose, size; int dsize1, dsize2, len1, len2; const u16 *dstr1, *dstr2; const char *astr1, *astr2; u16 c1, c2; wchar_t c; u16 dhangul_1[3], dhangul_2[3]; casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); astr1 = str; len1 = len; astr2 = name->name; len2 = name->len; dsize1 = dsize2 = 0; dstr1 = dstr2 = NULL; while (len1 > 0 && len2 > 0) { if (!dsize1) { size = asc2unichar(sb, astr1, len1, &c); astr1 += size; len1 -= size; if (decompose) dstr1 = decompose_unichar(c, &dsize1, dhangul_1); if (!decompose || !dstr1) { c1 = c; dstr1 = &c1; dsize1 = 1; } } if (!dsize2) { size = asc2unichar(sb, astr2, len2, &c); astr2 += size; len2 -= size; if (decompose) dstr2 = decompose_unichar(c, &dsize2, dhangul_2); if (!decompose || !dstr2) { c2 = c; dstr2 = &c2; dsize2 = 1; } } c1 = *dstr1; c2 = *dstr2; if (casefold) { c1 = case_fold(c1); if (!c1) { dstr1++; dsize1--; continue; } c2 = case_fold(c2); if (!c2) { dstr2++; dsize2--; continue; } } if (c1 < c2) return -1; else if (c1 > c2) return 1; dstr1++; dsize1--; dstr2++; dsize2--; } if (len1 < len2) return -1; if (len1 > len2) return 1; return 0; } |
2368 1360 189 984 151 1008 2434 2325 2307 2413 344 78 2324 66 721 12 91 20 530 584 584 213 13 10 76 153 394 12 384 181 81 137 5 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 | // SPDX-License-Identifier: GPL-2.0 /* * Convert integer string representation to an integer. * If an integer doesn't fit into specified type, -E is returned. * * Integer starts with optional sign. * kstrtou*() functions do not accept sign "-". * * Radix 0 means autodetection: leading "0x" implies radix 16, * leading "0" implies radix 8, otherwise radix is 10. * Autodetection hints work after optional sign, but not before. * * If -E is returned, result is not touched. */ #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/kstrtox.h> #include <linux/math64.h> #include <linux/types.h> #include <linux/uaccess.h> #include "kstrtox.h" noinline const char *_parse_integer_fixup_radix(const char *s, unsigned int *base) { if (*base == 0) { if (s[0] == '0') { if (_tolower(s[1]) == 'x' && isxdigit(s[2])) *base = 16; else *base = 8; } else *base = 10; } if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x') s += 2; return s; } /* * Convert non-negative integer string representation in explicitly given radix * to an integer. A maximum of max_chars characters will be converted. * * Return number of characters consumed maybe or-ed with overflow bit. * If overflow occurs, result integer (incorrect) is still returned. * * Don't you dare use this function. */ noinline unsigned int _parse_integer_limit(const char *s, unsigned int base, unsigned long long *p, size_t max_chars) { unsigned long long res; unsigned int rv; res = 0; rv = 0; while (max_chars--) { unsigned int c = *s; unsigned int lc = _tolower(c); unsigned int val; if ('0' <= c && c <= '9') val = c - '0'; else if ('a' <= lc && lc <= 'f') val = lc - 'a' + 10; else break; if (val >= base) break; /* * Check for overflow only if we are within range of * it in the max base we support (16) */ if (unlikely(res & (~0ull << 60))) { if (res > div_u64(ULLONG_MAX - val, base)) rv |= KSTRTOX_OVERFLOW; } res = res * base + val; rv++; s++; } *p = res; return rv; } noinline unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p) { return _parse_integer_limit(s, base, p, INT_MAX); } static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res) { unsigned long long _res; unsigned int rv; s = _parse_integer_fixup_radix(s, &base); rv = _parse_integer(s, base, &_res); if (rv & KSTRTOX_OVERFLOW) return -ERANGE; if (rv == 0) return -EINVAL; s += rv; if (*s == '\n') s++; if (*s) return -EINVAL; *res = _res; return 0; } /** * kstrtoull - convert a string to an unsigned long long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoull(). Return code must be checked. */ noinline int kstrtoull(const char *s, unsigned int base, unsigned long long *res) { if (s[0] == '+') s++; return _kstrtoull(s, base, res); } EXPORT_SYMBOL(kstrtoull); /** * kstrtoll - convert a string to a long long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign or a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoll(). Return code must be checked. */ noinline int kstrtoll(const char *s, unsigned int base, long long *res) { unsigned long long tmp; int rv; if (s[0] == '-') { rv = _kstrtoull(s + 1, base, &tmp); if (rv < 0) return rv; if ((long long)-tmp > 0) return -ERANGE; *res = -tmp; } else { rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if ((long long)tmp < 0) return -ERANGE; *res = tmp; } return 0; } EXPORT_SYMBOL(kstrtoll); /* Internal, do not use. */ int _kstrtoul(const char *s, unsigned int base, unsigned long *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (unsigned long)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(_kstrtoul); /* Internal, do not use. */ int _kstrtol(const char *s, unsigned int base, long *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (long)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(_kstrtol); /** * kstrtouint - convert a string to an unsigned int * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoul(). Return code must be checked. */ noinline int kstrtouint(const char *s, unsigned int base, unsigned int *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (unsigned int)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtouint); /** * kstrtoint - convert a string to an int * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign or a minus sign. * @base: The number base to use. The maximum supported base is 16. If base is * given as 0, then the base of the string is automatically detected with the * conventional semantics - If it begins with 0x the number will be parsed as a * hexadecimal (case insensitive), if it otherwise begins with 0, it will be * parsed as an octal number. Otherwise it will be parsed as a decimal. * @res: Where to write the result of the conversion on success. * * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtol(). Return code must be checked. */ noinline int kstrtoint(const char *s, unsigned int base, int *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (int)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtoint); noinline int kstrtou16(const char *s, unsigned int base, u16 *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (u16)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtou16); noinline int kstrtos16(const char *s, unsigned int base, s16 *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (s16)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtos16); noinline int kstrtou8(const char *s, unsigned int base, u8 *res) { unsigned long long tmp; int rv; rv = kstrtoull(s, base, &tmp); if (rv < 0) return rv; if (tmp != (u8)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtou8); noinline int kstrtos8(const char *s, unsigned int base, s8 *res) { long long tmp; int rv; rv = kstrtoll(s, base, &tmp); if (rv < 0) return rv; if (tmp != (s8)tmp) return -ERANGE; *res = tmp; return 0; } EXPORT_SYMBOL(kstrtos8); /** * kstrtobool - convert common user inputs into boolean values * @s: input string * @res: result * * This routine returns 0 iff the first character is one of 'YyTt1NnFf0', or * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value * pointed to by res is updated upon finding a match. */ noinline int kstrtobool(const char *s, bool *res) { if (!s) return -EINVAL; switch (s[0]) { case 'y': case 'Y': case 't': case 'T': case '1': *res = true; return 0; case 'n': case 'N': case 'f': case 'F': case '0': *res = false; return 0; case 'o': case 'O': switch (s[1]) { case 'n': case 'N': *res = true; return 0; case 'f': case 'F': *res = false; return 0; default: break; } break; default: break; } return -EINVAL; } EXPORT_SYMBOL(kstrtobool); /* * Since "base" would be a nonsense argument, this open-codes the * _from_user helper instead of using the helper macro below. */ int kstrtobool_from_user(const char __user *s, size_t count, bool *res) { /* Longest string needed to differentiate, newline, terminator */ char buf[4]; count = min(count, sizeof(buf) - 1); if (copy_from_user(buf, s, count)) return -EFAULT; buf[count] = '\0'; return kstrtobool(buf, res); } EXPORT_SYMBOL(kstrtobool_from_user); #define kstrto_from_user(f, g, type) \ int f(const char __user *s, size_t count, unsigned int base, type *res) \ { \ /* sign, base 2 representation, newline, terminator */ \ char buf[1 + sizeof(type) * 8 + 1 + 1]; \ \ count = min(count, sizeof(buf) - 1); \ if (copy_from_user(buf, s, count)) \ return -EFAULT; \ buf[count] = '\0'; \ return g(buf, base, res); \ } \ EXPORT_SYMBOL(f) kstrto_from_user(kstrtoull_from_user, kstrtoull, unsigned long long); kstrto_from_user(kstrtoll_from_user, kstrtoll, long long); kstrto_from_user(kstrtoul_from_user, kstrtoul, unsigned long); kstrto_from_user(kstrtol_from_user, kstrtol, long); kstrto_from_user(kstrtouint_from_user, kstrtouint, unsigned int); kstrto_from_user(kstrtoint_from_user, kstrtoint, int); kstrto_from_user(kstrtou16_from_user, kstrtou16, u16); kstrto_from_user(kstrtos16_from_user, kstrtos16, s16); kstrto_from_user(kstrtou8_from_user, kstrtou8, u8); kstrto_from_user(kstrtos8_from_user, kstrtos8, s8); |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * AEAD: Authenticated Encryption with Associated Data * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_AEAD_H #define _CRYPTO_INTERNAL_AEAD_H #include <crypto/aead.h> #include <crypto/algapi.h> #include <linux/stddef.h> #include <linux/types.h> struct rtattr; struct aead_instance { void (*free)(struct aead_instance *inst); union { struct { char head[offsetof(struct aead_alg, base)]; struct crypto_instance base; } s; struct aead_alg alg; }; }; struct crypto_aead_spawn { struct crypto_spawn base; }; struct aead_queue { struct crypto_queue base; }; static inline void *crypto_aead_ctx(struct crypto_aead *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline void *crypto_aead_ctx_dma(struct crypto_aead *tfm) { return crypto_tfm_ctx_dma(&tfm->base); } static inline struct crypto_instance *aead_crypto_instance( struct aead_instance *inst) { return container_of(&inst->alg.base, struct crypto_instance, alg); } static inline struct aead_instance *aead_instance(struct crypto_instance *inst) { return container_of(&inst->alg, struct aead_instance, alg.base); } static inline struct aead_instance *aead_alg_instance(struct crypto_aead *aead) { return aead_instance(crypto_tfm_alg_instance(&aead->base)); } static inline void *aead_instance_ctx(struct aead_instance *inst) { return crypto_instance_ctx(aead_crypto_instance(inst)); } static inline void *aead_request_ctx(struct aead_request *req) { return req->__ctx; } static inline void *aead_request_ctx_dma(struct aead_request *req) { unsigned int align = crypto_dma_align(); if (align <= crypto_tfm_ctx_alignment()) align = 1; return PTR_ALIGN(aead_request_ctx(req), align); } static inline void aead_request_complete(struct aead_request *req, int err) { crypto_request_complete(&req->base, err); } static inline u32 aead_request_flags(struct aead_request *req) { return req->base.flags; } static inline struct aead_request *aead_request_cast( struct crypto_async_request *req) { return container_of(req, struct aead_request, base); } int crypto_grab_aead(struct crypto_aead_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_aead(struct crypto_aead_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct aead_alg *crypto_spawn_aead_alg( struct crypto_aead_spawn *spawn) { return container_of(spawn->base.alg, struct aead_alg, base); } static inline struct crypto_aead *crypto_spawn_aead( struct crypto_aead_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline void crypto_aead_set_reqsize(struct crypto_aead *aead, unsigned int reqsize) { aead->reqsize = reqsize; } static inline void crypto_aead_set_reqsize_dma(struct crypto_aead *aead, unsigned int reqsize) { reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); aead->reqsize = reqsize; } static inline void aead_init_queue(struct aead_queue *queue, unsigned int max_qlen) { crypto_init_queue(&queue->base, max_qlen); } static inline unsigned int crypto_aead_alg_chunksize(struct aead_alg *alg) { return alg->chunksize; } /** * crypto_aead_chunksize() - obtain chunk size * @tfm: cipher handle * * The block size is set to one for ciphers such as CCM. However, * you still need to provide incremental updates in multiples of * the underlying block size as the IV does not have sub-block * granularity. This is known in this API as the chunk size. * * Return: chunk size in bytes */ static inline unsigned int crypto_aead_chunksize(struct crypto_aead *tfm) { return crypto_aead_alg_chunksize(crypto_aead_alg(tfm)); } int crypto_register_aead(struct aead_alg *alg); void crypto_unregister_aead(struct aead_alg *alg); int crypto_register_aeads(struct aead_alg *algs, int count); void crypto_unregister_aeads(struct aead_alg *algs, int count); int aead_register_instance(struct crypto_template *tmpl, struct aead_instance *inst); #endif /* _CRYPTO_INTERNAL_AEAD_H */ |
5 3 5 3 12 3 3 3 3 3 2 2 3 3 3 3 2 2 5 5 5 5 5 5 2 1 5 5 6 6 1 2 1 1 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 | // SPDX-License-Identifier: GPL-2.0-only /* * Page Attribute Table (PAT) support: handle memory caching attributes in page tables. * * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> * Suresh B Siddha <suresh.b.siddha@intel.com> * * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. * * Basic principles: * * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and * the kernel to set one of a handful of 'caching type' attributes for physical * memory ranges: uncached, write-combining, write-through, write-protected, * and the most commonly used and default attribute: write-back caching. * * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is * a hardware interface to enumerate a limited number of physical memory ranges * and set their caching attributes explicitly, programmed into the CPU via MSRs. * Even modern CPUs have MTRRs enabled - but these are typically not touched * by the kernel or by user-space (such as the X server), we rely on PAT for any * additional cache attribute logic. * * PAT doesn't work via explicit memory ranges, but uses page table entries to add * cache attribute information to the mapped memory range: there's 3 bits used, * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT). * * ( There's a metric ton of finer details, such as compatibility with CPU quirks * that only support 4 types of PAT entries, and interaction with MTRRs, see * below for details. ) */ #include <linux/seq_file.h> #include <linux/memblock.h> #include <linux/debugfs.h> #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/pfn_t.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/rbtree.h> #include <asm/cacheflush.h> #include <asm/cacheinfo.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> #include <asm/fcntl.h> #include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/page.h> #include <asm/msr.h> #include <asm/memtype.h> #include <asm/io.h> #include "memtype.h" #include "../mm_internal.h" #undef pr_fmt #define pr_fmt(fmt) "" fmt static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); static u64 __ro_after_init pat_msr_val; /* * PAT support is enabled by default, but can be disabled for * various user-requested or hardware-forced reasons: */ static void __init pat_disable(const char *msg_reason) { if (pat_disabled) return; pat_disabled = true; pr_info("x86/PAT: %s\n", msg_reason); memory_caching_control &= ~CACHE_PAT; } static int __init nopat(char *str) { pat_disable("PAT support disabled via boot option."); return 0; } early_param("nopat", nopat); bool pat_enabled(void) { return !pat_disabled; } EXPORT_SYMBOL_GPL(pat_enabled); int pat_debug_enable; static int __init pat_debug_setup(char *str) { pat_debug_enable = 1; return 1; } __setup("debugpat", pat_debug_setup); #ifdef CONFIG_X86_PAT /* * X86 PAT uses page flags arch_1 and uncached together to keep track of * memory type of pages that have backing page struct. * * X86 PAT supports 4 different memory types: * - _PAGE_CACHE_MODE_WB * - _PAGE_CACHE_MODE_WC * - _PAGE_CACHE_MODE_UC_MINUS * - _PAGE_CACHE_MODE_WT * * _PAGE_CACHE_MODE_WB is the default type. */ #define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) #define _PGMT_UC_MINUS (1UL << PG_uncached) #define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) static inline enum page_cache_mode get_page_memtype(struct page *pg) { unsigned long pg_flags = pg->flags & _PGMT_MASK; if (pg_flags == _PGMT_WB) return _PAGE_CACHE_MODE_WB; else if (pg_flags == _PGMT_WC) return _PAGE_CACHE_MODE_WC; else if (pg_flags == _PGMT_UC_MINUS) return _PAGE_CACHE_MODE_UC_MINUS; else return _PAGE_CACHE_MODE_WT; } static inline void set_page_memtype(struct page *pg, enum page_cache_mode memtype) { unsigned long memtype_flags; unsigned long old_flags; unsigned long new_flags; switch (memtype) { case _PAGE_CACHE_MODE_WC: memtype_flags = _PGMT_WC; break; case _PAGE_CACHE_MODE_UC_MINUS: memtype_flags = _PGMT_UC_MINUS; break; case _PAGE_CACHE_MODE_WT: memtype_flags = _PGMT_WT; break; case _PAGE_CACHE_MODE_WB: default: memtype_flags = _PGMT_WB; break; } old_flags = READ_ONCE(pg->flags); do { new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags)); } #else static inline enum page_cache_mode get_page_memtype(struct page *pg) { return -1; } static inline void set_page_memtype(struct page *pg, enum page_cache_mode memtype) { } #endif enum { PAT_UC = 0, /* uncached */ PAT_WC = 1, /* Write combining */ PAT_WT = 4, /* Write Through */ PAT_WP = 5, /* Write Protected */ PAT_WB = 6, /* Write Back (default) */ PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ }; #define CM(c) (_PAGE_CACHE_MODE_ ## c) static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, char *msg) { enum page_cache_mode cache; char *cache_mode; switch (pat_val) { case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; default: cache = CM(WB); cache_mode = "WB "; break; } memcpy(msg, cache_mode, 4); return cache; } #undef CM /* * Update the cache mode to pgprot translation tables according to PAT * configuration. * Using lower indices is preferred, so we start with highest index. */ static void __init init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; int i; pat_msg[32] = 0; for (i = 7; i >= 0; i--) { cache = pat_get_cache_mode((pat >> (i * 8)) & 7, pat_msg + 4 * i); update_cache_mode_entry(i, cache); } pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); } void pat_cpu_init(void) { if (!boot_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. We have no way to undo PAT. */ panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); } wrmsrl(MSR_IA32_CR_PAT, pat_msr_val); __flush_tlb_all(); } /** * pat_bp_init - Initialize the PAT MSR value and PAT table * * This function initializes PAT MSR value and PAT table with an OS-defined * value to enable additional cache attributes, WC, WT and WP. * * This function prepares the calls of pat_cpu_init() via cache_cpu_init() * on all CPUs. */ void __init pat_bp_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; #define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \ ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \ ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \ ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56)) if (!IS_ENABLED(CONFIG_X86_PAT)) pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); if (!cpu_feature_enabled(X86_FEATURE_PAT)) pat_disable("PAT not supported by the CPU."); else rdmsrl(MSR_IA32_CR_PAT, pat_msr_val); if (!pat_msr_val) { pat_disable("PAT support disabled by the firmware."); /* * No PAT. Emulate the PAT table that corresponds to the two * cache bits, PWT (Write Through) and PCD (Cache Disable). * This setup is also the same as the BIOS default setup. * * PTE encoding: * * PCD * |PWT PAT * || slot * 00 0 WB : _PAGE_CACHE_MODE_WB * 01 1 WT : _PAGE_CACHE_MODE_WT * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 11 3 UC : _PAGE_CACHE_MODE_UC * * NOTE: When WC or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); } /* * Xen PV doesn't allow to set PAT MSR, but all cache modes are * supported. */ if (pat_disabled || cpu_feature_enabled(X86_FEATURE_XENPV)) { init_cache_modes(pat_msr_val); return; } if ((c->x86_vendor == X86_VENDOR_INTEL) && (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { /* * PAT support with the lower four entries. Intel Pentium 2, * 3, M, and 4 are affected by PAT errata, which makes the * upper four entries unusable. To be on the safe side, we don't * use those. * * PTE encoding: * PAT * |PCD * ||PWT PAT * ||| slot * 000 0 WB : _PAGE_CACHE_MODE_WB * 001 1 WC : _PAGE_CACHE_MODE_WC * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * PAT bit unused * * NOTE: When WT or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); } else { /* * Full PAT support. We put WT in slot 7 to improve * robustness in the presence of errata that might cause * the high PAT bit to be ignored. This way, a buggy slot 7 * access will hit slot 3, and slot 3 is UC, so at worst * we lose performance without causing a correctness issue. * Pentium 4 erratum N46 is an example for such an erratum, * although we try not to use PAT at all on affected CPUs. * * PTE encoding: * PAT * |PCD * ||PWT PAT * ||| slot * 000 0 WB : _PAGE_CACHE_MODE_WB * 001 1 WC : _PAGE_CACHE_MODE_WC * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * 100 4 WB : Reserved * 101 5 WP : _PAGE_CACHE_MODE_WP * 110 6 UC-: Reserved * 111 7 WT : _PAGE_CACHE_MODE_WT * * The reserved slots are unused, but mapped to their * corresponding types in the presence of PAT errata. */ pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); } memory_caching_control |= CACHE_PAT; init_cache_modes(pat_msr_val); #undef PAT } static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ /* * Does intersection of PAT memory type and MTRR memory type and returns * the resulting memory type as PAT understands it. * (Type in pat and mtrr will not have same value) * The intersection is based on "Effective Memory Type" tables in IA-32 * SDM vol 3a */ static unsigned long pat_x_mtrr_type(u64 start, u64 end, enum page_cache_mode req_type) { /* * Look for MTRR hint to get the effective type in case where PAT * request is for WB. */ if (req_type == _PAGE_CACHE_MODE_WB) { u8 mtrr_type, uniform; mtrr_type = mtrr_type_lookup(start, end, &uniform); if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; return _PAGE_CACHE_MODE_WB; } return req_type; } struct pagerange_state { unsigned long cur_pfn; int ram; int not_ram; }; static int pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) { struct pagerange_state *state = arg; state->not_ram |= initial_pfn > state->cur_pfn; state->ram |= total_nr_pages > 0; state->cur_pfn = initial_pfn + total_nr_pages; return state->ram && state->not_ram; } static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { int ret = 0; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; struct pagerange_state state = {start_pfn, 0, 0}; /* * For legacy reasons, physical address range in the legacy ISA * region is tracked as non-RAM. This will allow users of * /dev/mem to map portions of legacy ISA region, even when * some of those portions are listed(or not even listed) with * different e820 types(RAM/reserved/..) */ if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; if (start_pfn < end_pfn) { ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &state, pagerange_is_ram_callback); } return (ret > 0) ? -1 : (state.ram ? 1 : 0); } /* * For RAM pages, we use page flags to mark the pages with appropriate type. * The page flags are limited to four types, WB (default), WC, WT and UC-. * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting * a new memory type is only allowed for a page mapped with the default WB * type. * * Here we do two passes: * - Find the memtype of all the pages in the range, look for any conflicts. * - In case of no conflicts, set the new memtype for pages in the range. */ static int reserve_ram_pages_type(u64 start, u64 end, enum page_cache_mode req_type, enum page_cache_mode *new_type) { struct page *page; u64 pfn; if (req_type == _PAGE_CACHE_MODE_WP) { if (new_type) *new_type = _PAGE_CACHE_MODE_UC_MINUS; return -EINVAL; } if (req_type == _PAGE_CACHE_MODE_UC) { /* We do not support strong UC */ WARN_ON_ONCE(1); req_type = _PAGE_CACHE_MODE_UC_MINUS; } for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { enum page_cache_mode type; page = pfn_to_page(pfn); type = get_page_memtype(page); if (type != _PAGE_CACHE_MODE_WB) { pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", start, end - 1, type, req_type); if (new_type) *new_type = type; return -EBUSY; } } if (new_type) *new_type = req_type; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); set_page_memtype(page, req_type); } return 0; } static int free_ram_pages_type(u64 start, u64 end) { struct page *page; u64 pfn; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); set_page_memtype(page, _PAGE_CACHE_MODE_WB); } return 0; } static u64 sanitize_phys(u64 address) { /* * When changing the memtype for pages containing poison allow * for a "decoy" virtual address (bit 63 clear) passed to * set_memory_X(). __pa() on a "decoy" address results in a * physical address with bit 63 set. * * Decoy addresses are not present for 32-bit builds, see * set_mce_nospec(). */ if (IS_ENABLED(CONFIG_X86_64)) return address & __PHYSICAL_MASK; return address; } /* * req_type typically has one of the: * - _PAGE_CACHE_MODE_WB * - _PAGE_CACHE_MODE_WC * - _PAGE_CACHE_MODE_UC_MINUS * - _PAGE_CACHE_MODE_UC * - _PAGE_CACHE_MODE_WT * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return * available type in new_type in case of no error. In case of any error * it will return a negative return value. */ int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type, enum page_cache_mode *new_type) { struct memtype *entry_new; enum page_cache_mode actual_type; int is_range_ram; int err = 0; start = sanitize_phys(start); /* * The end address passed into this function is exclusive, but * sanitize_phys() expects an inclusive address. */ end = sanitize_phys(end - 1) + 1; if (start >= end) { WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, start, end - 1, cattr_name(req_type)); return -EINVAL; } if (!pat_enabled()) { /* This is identical to page table setting without PAT */ if (new_type) *new_type = req_type; return 0; } /* Low ISA region is always mapped WB in page table. No need to track */ if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) *new_type = _PAGE_CACHE_MODE_WB; return 0; } /* * Call mtrr_lookup to get the type hint. This is an * optimization for /dev/mem mmap'ers into WB memory (BIOS * tools and ACPI tools). Use WB request for WB memory and use * UC_MINUS otherwise. */ actual_type = pat_x_mtrr_type(start, end, req_type); if (new_type) *new_type = actual_type; is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) { err = reserve_ram_pages_type(start, end, req_type, new_type); return err; } else if (is_range_ram < 0) { return -EINVAL; } entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!entry_new) return -ENOMEM; entry_new->start = start; entry_new->end = end; entry_new->type = actual_type; spin_lock(&memtype_lock); err = memtype_check_insert(entry_new, new_type); if (err) { pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n", start, end - 1, cattr_name(entry_new->type), cattr_name(req_type)); kfree(entry_new); spin_unlock(&memtype_lock); return err; } spin_unlock(&memtype_lock); dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", start, end - 1, cattr_name(entry_new->type), cattr_name(req_type), new_type ? cattr_name(*new_type) : "-"); return err; } int memtype_free(u64 start, u64 end) { int is_range_ram; struct memtype *entry_old; if (!pat_enabled()) return 0; start = sanitize_phys(start); end = sanitize_phys(end); /* Low ISA region is always mapped WB. No need to track */ if (x86_platform.is_untracked_pat_range(start, end)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) return free_ram_pages_type(start, end); if (is_range_ram < 0) return -EINVAL; spin_lock(&memtype_lock); entry_old = memtype_erase(start, end); spin_unlock(&memtype_lock); if (IS_ERR(entry_old)) { pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, start, end - 1); return -EINVAL; } kfree(entry_old); dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1); return 0; } /** * lookup_memtype - Looks up the memory type for a physical address * @paddr: physical address of which memory type needs to be looked up * * Only to be called when PAT is enabled * * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS * or _PAGE_CACHE_MODE_WT. */ static enum page_cache_mode lookup_memtype(u64 paddr) { enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB; struct memtype *entry; if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) return rettype; if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { struct page *page; page = pfn_to_page(paddr >> PAGE_SHIFT); return get_page_memtype(page); } spin_lock(&memtype_lock); entry = memtype_lookup(paddr); if (entry != NULL) rettype = entry->type; else rettype = _PAGE_CACHE_MODE_UC_MINUS; spin_unlock(&memtype_lock); return rettype; } /** * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type * of @pfn cannot be overridden by UC MTRR memory type. * * Only to be called when PAT is enabled. * * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC. * Returns false in other cases. */ bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) { enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn)); return cm == _PAGE_CACHE_MODE_UC || cm == _PAGE_CACHE_MODE_UC_MINUS || cm == _PAGE_CACHE_MODE_WC; } EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); /** * memtype_reserve_io - Request a memory type mapping for a region of memory * @start: start (physical address) of the region * @end: end (physical address) of the region * @type: A pointer to memtype, with requested type. On success, requested * or any other compatible type that was available for the region is returned * * On success, returns 0 * On failure, returns non-zero */ int memtype_reserve_io(resource_size_t start, resource_size_t end, enum page_cache_mode *type) { resource_size_t size = end - start; enum page_cache_mode req_type = *type; enum page_cache_mode new_type; int ret; WARN_ON_ONCE(iomem_map_sanity_check(start, size)); ret = memtype_reserve(start, end, req_type, &new_type); if (ret) goto out_err; if (!is_new_memtype_allowed(start, size, req_type, new_type)) goto out_free; if (memtype_kernel_map_sync(start, size, new_type) < 0) goto out_free; *type = new_type; return 0; out_free: memtype_free(start, end); ret = -EBUSY; out_err: return ret; } /** * memtype_free_io - Release a memory type mapping for a region of memory * @start: start (physical address) of the region * @end: end (physical address) of the region */ void memtype_free_io(resource_size_t start, resource_size_t end) { memtype_free(start, end); } #ifdef CONFIG_X86_PAT int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) { enum page_cache_mode type = _PAGE_CACHE_MODE_WC; return memtype_reserve_io(start, start + size, &type); } EXPORT_SYMBOL(arch_io_reserve_memtype_wc); void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) { memtype_free_io(start, start + size); } EXPORT_SYMBOL(arch_io_free_memtype_wc); #endif pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) vma_prot = pgprot_decrypted(vma_prot); return vma_prot; } #ifdef CONFIG_STRICT_DEVMEM /* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { return 1; } #else /* This check is needed to avoid cache aliasing when PAT is enabled */ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { u64 from = ((u64)pfn) << PAGE_SHIFT; u64 to = from + size; u64 cursor = from; if (!pat_enabled()) return 1; while (cursor < to) { if (!devmem_is_allowed(pfn)) return 0; cursor += PAGE_SIZE; pfn++; } return 1; } #endif /* CONFIG_STRICT_DEVMEM */ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; if (!range_is_allowed(pfn, size)) return 0; if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | cachemode2protval(pcm)); return 1; } /* * Change the memory type for the physical address range in kernel identity * mapping space if that range is a part of identity map. */ int memtype_kernel_map_sync(u64 base, unsigned long size, enum page_cache_mode pcm) { unsigned long id_sz; if (base > __pa(high_memory-1)) return 0; /* * Some areas in the middle of the kernel identity range * are not mapped, for example the PCI space. */ if (!page_is_ram(base >> PAGE_SHIFT)) return 0; id_sz = (__pa(high_memory-1) <= base + size) ? __pa(high_memory) - base : size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(pcm), base, (unsigned long long)(base + size-1)); return -EINVAL; } return 0; } /* * Internal interface to reserve a range of physical memory with prot. * Reserved non RAM regions only and after successful memtype_reserve, * this func also keeps identity mapping (if any) in sync with this new prot. */ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, int strict_prot) { int is_ram = 0; int ret; enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot); enum page_cache_mode pcm = want_pcm; is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* * reserve_pfn_range() for RAM pages. We do not refcount to keep * track of number of mappings of RAM pages. We can assert that * the type requested matches the type of first page in the range. */ if (is_ram) { if (!pat_enabled()) return 0; pcm = lookup_memtype(paddr); if (want_pcm != pcm) { pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_pcm), (unsigned long long)paddr, (unsigned long long)(paddr + size - 1), cattr_name(pcm)); *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); } return 0; } ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm); if (ret) return ret; if (pcm != want_pcm) { if (strict_prot || !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { memtype_free(paddr, paddr + size); pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_pcm), (unsigned long long)paddr, (unsigned long long)(paddr + size - 1), cattr_name(pcm)); return -EINVAL; } /* * We allow returning different type than the one requested in * non strict case. */ *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); } if (memtype_kernel_map_sync(paddr, size, pcm) < 0) { memtype_free(paddr, paddr + size); return -EINVAL; } return 0; } /* * Internal interface to free a range of physical memory. * Frees non RAM regions only. */ static void free_pfn_range(u64 paddr, unsigned long size) { int is_ram; is_ram = pat_pagerange_is_ram(paddr, paddr + size); if (is_ram == 0) memtype_free(paddr, paddr + size); } static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, pgprot_t *pgprot) { unsigned long prot; VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT)); /* * We need the starting PFN and cachemode used for track_pfn_remap() * that covered the whole VMA. For most mappings, we can obtain that * information from the page tables. For COW mappings, we might now * suddenly have anon folios mapped and follow_phys() will fail. * * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to * detect the PFN. If we need the cachemode as well, we're out of luck * for now and have to fail fork(). */ if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) { if (pgprot) *pgprot = __pgprot(prot); return 0; } if (is_cow_mapping(vma->vm_flags)) { if (pgprot) return -EINVAL; *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return 0; } WARN_ON_ONCE(1); return -EINVAL; } /* * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. */ int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; if (vma->vm_flags & VM_PAT) { if (get_pat_info(vma, &paddr, &pgprot)) return -EINVAL; /* reserve the whole chunk covered by vma. */ return reserve_pfn_range(paddr, vma_size, &pgprot, 1); } return 0; } /* * prot is passed in as a parameter for the new mapping. If the vma has * a linear pfn mapping for the entire range, or no vma is provided, * reserve the entire pfn + size range with single reserve_pfn_range * call. */ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ if (!vma || (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); if (ret == 0 && vma) vm_flags_set(vma, VM_PAT); return ret; } if (!pat_enabled()) return 0; /* * For anything smaller than the vma size we set prot based on the * lookup. */ pcm = lookup_memtype(paddr); /* Check memtype for the remaining pages */ while (size > PAGE_SIZE) { size -= PAGE_SIZE; paddr += PAGE_SIZE; if (pcm != lookup_memtype(paddr)) return -EINVAL; } *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; } void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) { enum page_cache_mode pcm; if (!pat_enabled()) return; /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); } /* * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked) { resource_size_t paddr; if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ paddr = (resource_size_t)pfn << PAGE_SHIFT; if (!paddr && !size) { if (get_pat_info(vma, &paddr, NULL)) return; size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); if (vma) { if (mm_wr_locked) vm_flags_clear(vma, VM_PAT); else __vm_flags_mod(vma, 0, VM_PAT); } } /* * untrack_pfn_clear is called if the following situation fits: * * 1) while mremapping a pfnmap for a new region, with the old vma after * its pfnmap page table has been removed. The new vma has a new pfnmap * to the same pfn & cache type with VM_PAT set. * 2) while duplicating vm area, the new vma fails to copy the pgtable from * old vma. */ void untrack_pfn_clear(struct vm_area_struct *vma) { vm_flags_clear(vma, VM_PAT); } pgprot_t pgprot_writecombine(pgprot_t prot) { return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); } EXPORT_SYMBOL_GPL(pgprot_writecombine); pgprot_t pgprot_writethrough(pgprot_t prot) { return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WT)); } EXPORT_SYMBOL_GPL(pgprot_writethrough); #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* * We are allocating a temporary printout-entry to be passed * between seq_start()/next() and seq_show(): */ static struct memtype *memtype_get_idx(loff_t pos) { struct memtype *entry_print; int ret; entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!entry_print) return NULL; spin_lock(&memtype_lock); ret = memtype_copy_nth_element(entry_print, pos); spin_unlock(&memtype_lock); /* Free it on error: */ if (ret) { kfree(entry_print); return NULL; } return entry_print; } static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos == 0) { ++*pos; seq_puts(seq, "PAT memtype list:\n"); } return memtype_get_idx(*pos); } static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { kfree(v); ++*pos; return memtype_get_idx(*pos); } static void memtype_seq_stop(struct seq_file *seq, void *v) { kfree(v); } static int memtype_seq_show(struct seq_file *seq, void *v) { struct memtype *entry_print = (struct memtype *)v; seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n", entry_print->start, entry_print->end, cattr_name(entry_print->type)); return 0; } static const struct seq_operations memtype_seq_ops = { .start = memtype_seq_start, .next = memtype_seq_next, .stop = memtype_seq_stop, .show = memtype_seq_show, }; static int memtype_seq_open(struct inode *inode, struct file *file) { return seq_open(file, &memtype_seq_ops); } static const struct file_operations memtype_fops = { .open = memtype_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static int __init pat_memtype_list_init(void) { if (pat_enabled()) { debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, NULL, &memtype_fops); } return 0; } late_initcall(pat_memtype_list_init); #endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ |
47 8 40 40 10 2 11 43 1 3 1 1 1 2 1 1 1 1 7 90 113 7 107 30 2 2 2 11 10 15 1 27 14 44 7 36 2 1 40 21 7 14 18 3 4 26 14 46 2 21 22 204 8 197 6 1 15 5 5 1 5 7 4 20 3 23 23 6 3 3 16 4 13 5 12 4 13 24 24 19 7 24 14 7 74 2 1 51 26 1 70 76 1 1 54 25 7 7 14 1 13 13 13 25 8 16 2 18 32 9 21 29 5 48 2 56 5 53 48 28 10 17 24 55 2 1 1 91 8 82 3 1 1 82 46 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 | // SPDX-License-Identifier: GPL-2.0+ /* * 2002-10-15 Posix Clocks & timers * by George Anzinger george@mvista.com * Copyright (C) 2002 2003 by MontaVista Software. * * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. * Copyright (C) 2004 Boris Hu * * These are all the functions necessary to implement POSIX clocks & timers */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/mutex.h> #include <linux/sched/task.h> #include <linux/uaccess.h> #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/hash.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/syscalls.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/export.h> #include <linux/hashtable.h> #include <linux/compat.h> #include <linux/nospec.h> #include <linux/time_namespace.h> #include "timekeeping.h" #include "posix-timers.h" static struct kmem_cache *posix_timers_cache; /* * Timers are managed in a hash table for lockless lookup. The hash key is * constructed from current::signal and the timer ID and the timer is * matched against current::signal and the timer ID when walking the hash * bucket list. * * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ static DEFINE_HASHTABLE(posix_timers_hashtable, 9); static DEFINE_SPINLOCK(hash_lock); static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); #define lock_timer(tid, flags) \ ({ struct k_itimer *__timr; \ __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ __timr; \ }) static int hash(struct signal_struct *sig, unsigned int nr) { return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); } static struct k_itimer *__posix_timers_find(struct hlist_head *head, struct signal_struct *sig, timer_t id) { struct k_itimer *timer; hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; } return NULL; } static struct k_itimer *posix_timer_by_id(timer_t id) { struct signal_struct *sig = current->signal; struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; return __posix_timers_find(head, sig, id); } static int posix_timer_add(struct k_itimer *timer) { struct signal_struct *sig = current->signal; struct hlist_head *head; unsigned int cnt, id; /* * FIXME: Replace this by a per signal struct xarray once there is * a plan to handle the resulting CRIU regression gracefully. */ for (cnt = 0; cnt <= INT_MAX; cnt++) { spin_lock(&hash_lock); id = sig->next_posix_timer_id; /* Write the next ID back. Clamp it to the positive space */ sig->next_posix_timer_id = (id + 1) & INT_MAX; head = &posix_timers_hashtable[hash(sig, id)]; if (!__posix_timers_find(head, sig, id)) { hlist_add_head_rcu(&timer->t_hash, head); spin_unlock(&hash_lock); return id; } spin_unlock(&hash_lock); } /* POSIX return code when no timer ID could be allocated */ return -EAGAIN; } static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { spin_unlock_irqrestore(&timr->it_lock, flags); } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; } static ktime_t posix_get_realtime_ktime(clockid_t which_clock) { return ktime_get_real(); } static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) { return do_sys_settimeofday64(tp, NULL); } static int posix_clock_realtime_adj(const clockid_t which_clock, struct __kernel_timex *t) { return do_adjtimex(t); } static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); timens_add_monotonic(tp); return 0; } static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) { return ktime_get(); } static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); timens_add_monotonic(tp); return 0; } static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_real_ts64(tp); return 0; } static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_ts64(tp); timens_add_monotonic(tp); return 0; } static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) { *tp = ktime_to_timespec64(KTIME_LOW_RES); return 0; } static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); timens_add_boottime(tp); return 0; } static ktime_t posix_get_boottime_ktime(const clockid_t which_clock) { return ktime_get_boottime(); } static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; } static ktime_t posix_get_tai_ktime(clockid_t which_clock) { return ktime_get_clocktai(); } static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = hrtimer_resolution; return 0; } static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); /* * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. Clamp the overrun value to INT_MAX */ static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) { s64 sum = timr->it_overrun_last + (s64)baseval; return sum > (s64)INT_MAX ? INT_MAX : (int)sum; } static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), timr->it_interval); hrtimer_restart(timer); } /* * This function is called from the signal delivery code if * info->si_sys_private is not zero, which indicates that the timer has to * be rearmed. Restart the timer and update info::si_overrun. */ void posixtimer_rearm(struct kernel_siginfo *info) { struct k_itimer *timr; unsigned long flags; timr = lock_timer(info->si_tid, &flags); if (!timr) return; if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { timr->kclock->timer_rearm(timr); timr->it_active = 1; timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1LL; ++timr->it_requeue_pending; info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); } unlock_timer(timr, flags); } int posix_timer_event(struct k_itimer *timr, int si_private) { enum pid_type type; int ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). * * If dequeue_signal() sees the "right" value of * si_sys_private it calls posixtimer_rearm(). * We re-queue ->sigq and drop ->it_lock(). * posixtimer_rearm() locks the timer * and re-schedules it while ->sigq is pending. * Not really bad, but not that we want. */ timr->sigq->info.si_sys_private = si_private; type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; ret = send_sigqueue(timr->sigq, timr->it_pid, type); /* If we failed to send the signal the timer stops. */ return ret > 0; } /* * This function gets called when a POSIX.1b interval timer expires from * the HRTIMER interrupt (soft interrupt on RT kernels). * * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI * based timers. */ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { enum hrtimer_restart ret = HRTIMER_NORESTART; struct k_itimer *timr; unsigned long flags; int si_private = 0; timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); timr->it_active = 0; if (timr->it_interval != 0) si_private = ++timr->it_requeue_pending; if (posix_timer_event(timr, si_private)) { /* * The signal was not queued due to SIG_IGN. As a * consequence the timer is not going to be rearmed from * the signal delivery path. But as a real signal handler * can be installed later the timer must be rearmed here. */ if (timr->it_interval != 0) { ktime_t now = hrtimer_cb_get_time(timer); /* * FIXME: What we really want, is to stop this * timer completely and restart it in case the * SIG_IGN is removed. This is a non trivial * change to the signal handling code. * * For now let timers with an interval less than a * jiffie expire every jiffie and recheck for a * valid signal handler. * * This avoids interrupt starvation in case of a * very small interval, which would expire the * timer immediately again. * * Moving now ahead of time by one jiffie tricks * hrtimer_forward() to expire the timer later, * while it still maintains the overrun accuracy * for the price of a slight inconsistency in the * timer_gettime() case. This is at least better * than a timer storm. * * Only required when high resolution timers are * enabled as the periodic tick based timers are * automatically aligned to the next tick. */ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) { ktime_t kj = TICK_NSEC; if (timr->it_interval < kj) now = ktime_add(now, kj); } timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; timr->it_active = 1; } } unlock_timer(timr, flags); return ret; } static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); struct task_struct *rtn; switch (event->sigev_notify) { case SIGEV_SIGNAL | SIGEV_THREAD_ID: pid = find_vpid(event->sigev_notify_thread_id); rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; fallthrough; case SIGEV_SIGNAL: case SIGEV_THREAD: if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) return NULL; fallthrough; case SIGEV_NONE: return pid; default: return NULL; } } static struct k_itimer * alloc_posix_timer(void) { struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); return NULL; } clear_siginfo(&tmr->sigq->info); return tmr; } static void k_itimer_rcu_free(struct rcu_head *head) { struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); kmem_cache_free(posix_timers_cache, tmr); } static void posix_timer_free(struct k_itimer *tmr) { put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); call_rcu(&tmr->rcu, k_itimer_rcu_free); } static void posix_timer_unhash_and_free(struct k_itimer *tmr) { spin_lock(&hash_lock); hlist_del_rcu(&tmr->t_hash); spin_unlock(&hash_lock); posix_timer_free(tmr); } static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); return 0; } /* Create a POSIX.1b interval timer. */ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; if (!kc) return -EINVAL; if (!kc->timer_create) return -EOPNOTSUPP; new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) return -EAGAIN; spin_lock_init(&new_timer->it_lock); /* * Add the timer to the hash table. The timer is not yet valid * because new_timer::it_signal is still NULL. The timer id is also * not yet visible to user space. */ new_timer_id = posix_timer_add(new_timer); if (new_timer_id < 0) { posix_timer_free(new_timer); return new_timer_id; } new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { rcu_read_lock(); new_timer->it_pid = get_pid(good_sigevent(event)); rcu_read_unlock(); if (!new_timer->it_pid) { error = -EINVAL; goto out; } new_timer->it_sigev_notify = event->sigev_notify; new_timer->sigq->info.si_signo = event->sigev_signo; new_timer->sigq->info.si_value = event->sigev_value; } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq->info.si_signo = SIGALRM; memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); new_timer->sigq->info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; goto out; } /* * After succesful copy out, the timer ID is visible to user space * now but not yet valid because new_timer::signal is still NULL. * * Complete the initialization with the clock specific create * callback. */ error = kc->timer_create(new_timer); if (error) goto out; spin_lock_irq(¤t->sighand->siglock); /* This makes the timer valid in the hash table */ WRITE_ONCE(new_timer->it_signal, current->signal); list_add(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); /* * After unlocking sighand::siglock @new_timer is subject to * concurrent removal and cannot be touched anymore */ return 0; out: posix_timer_unhash_and_free(new_timer); return error; } SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { if (timer_event_spec) { sigevent_t event; if (copy_from_user(&event, timer_event_spec, sizeof (event))) return -EFAULT; return do_timer_create(which_clock, &event, created_timer_id); } return do_timer_create(which_clock, NULL, created_timer_id); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, struct compat_sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { if (timer_event_spec) { sigevent_t event; if (get_compat_sigevent(&event, timer_event_spec)) return -EFAULT; return do_timer_create(which_clock, &event, created_timer_id); } return do_timer_create(which_clock, NULL, created_timer_id); } #endif static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* * timer_t could be any type >= int and we want to make sure any * @timer_id outside positive int range fails lookup. */ if ((unsigned long long)timer_id > INT_MAX) return NULL; /* * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where * timr::it_signal == NULL. timer::it_signal is only set after the * rest of the initialization succeeded. * * Timer destruction happens in steps: * 1) Set timr::it_signal to NULL with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Call RCU for removal after the grace period * * Holding rcu_read_lock() accross the lookup ensures that * the timer cannot be freed. * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id * can't change, but timr::it_signal becomes NULL during * destruction. */ rcu_read_lock(); timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. */ if (timr->it_signal == current->signal) { rcu_read_unlock(); return timr; } spin_unlock_irqrestore(&timr->it_lock, *flags); } rcu_read_unlock(); return NULL; } static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) { struct hrtimer *timer = &timr->it.real.timer; return __hrtimer_expires_remaining_adjusted(timer, now); } static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) { struct hrtimer *timer = &timr->it.real.timer; return hrtimer_forward(timer, now, timr->it_interval); } /* * Get the time remaining on a POSIX.1b interval timer. * * Two issues to handle here: * * 1) The timer has a requeue pending. The return value must appear as * if the timer has been requeued right now. * * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued * into the hrtimer queue and therefore never expired. Emulate expiry * here taking #1 into account. */ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; bool sig_none; sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); } else if (!timr->it_active) { /* * SIGEV_NONE oneshot timers are never queued and therefore * timr->it_active is always false. The check below * vs. remaining time will handle this case. * * For all other timers there is nothing to update here, so * return. */ if (!sig_none) return; } now = kc->clock_get_ktime(timr->it_clock); /* * If this is an interval timer and either has requeue pending or * is a SIGEV_NONE timer move the expiry time forward by intervals, * so expiry is > now. */ if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); /* * As @now is retrieved before a possible timer_forward() and * cannot be reevaluated by the compiler @remaining is based on the * same @now value. Therefore @remaining is consistent vs. @now. * * Consequently all interval timers, i.e. @iv > 0, cannot have a * remaining time <= 0 because timer_forward() guarantees to move * them forward so that the next timer expiry is > @now. */ if (remaining <= 0) { /* * A single shot SIGEV_NONE timer must return 0, when it is * expired! Timers which have a real signal delivery mode * must return a remaining time greater than 0 because the * signal has not yet been delivered. */ if (!sig_none) cur_setting->it_value.tv_nsec = 1; } else { cur_setting->it_value = ktime_to_timespec64(remaining); } } static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { const struct k_clock *kc; struct k_itimer *timr; unsigned long flags; int ret = 0; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; memset(setting, 0, sizeof(*setting)); kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else kc->timer_get(timr, setting); unlock_timer(timr, flags); return ret; } /* Get the time remaining on a POSIX.1b interval timer. */ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct __kernel_itimerspec __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { if (put_itimerspec64(&cur_setting, setting)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { if (put_old_itimerspec32(&cur_setting, setting)) ret = -EFAULT; } return ret; } #endif /** * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer * @timer_id: The timer ID which identifies the timer * * The "overrun count" of a timer is one plus the number of expiration * intervals which have elapsed between the first expiry, which queues the * signal and the actual signal delivery. On signal delivery the "overrun * count" is calculated and cached, so it can be returned directly here. * * As this is relative to the last queued signal the returned overrun count * is meaningless outside of the signal delivery path and even there it * does not accurately reflect the current state when user space evaluates * it. * * Returns: * -EINVAL @timer_id is invalid * 1..INT_MAX The number of overruns related to the last delivered signal */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { struct k_itimer *timr; unsigned long flags; int overrun; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; overrun = timer_overrun_to_int(timr, 0); unlock_timer(timr, flags); return overrun; } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct hrtimer *timer = &timr->it.real.timer; enum hrtimer_mode mode; mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_init(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. */ if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; if (!absolute) expires = ktime_add_safe(expires, timer->base->get_time()); hrtimer_set_expires(timer, expires); if (!sigev_none) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } static int common_hrtimer_try_to_cancel(struct k_itimer *timr) { return hrtimer_try_to_cancel(&timr->it.real.timer); } static void common_timer_wait_running(struct k_itimer *timer) { hrtimer_cancel_wait_running(&timer->it.real.timer); } /* * On PREEMPT_RT this prevents priority inversion and a potential livelock * against the ksoftirqd thread in case that ksoftirqd gets preempted while * executing a hrtimer callback. * * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this * just results in a cpu_relax(). * * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this * prevents spinning on an eventually scheduled out task and a livelock * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. */ static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags) { const struct k_clock *kc = READ_ONCE(timer->kclock); timer_t timer_id = READ_ONCE(timer->it_id); /* Prevent kfree(timer) after dropping the lock */ rcu_read_lock(); unlock_timer(timer, *flags); /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ if (!WARN_ON_ONCE(!kc->timer_wait_running)) kc->timer_wait_running(timer); rcu_read_unlock(); /* Relock the timer. It might be not longer hashed. */ return lock_timer(timer_id, flags); } /* Set a POSIX.1b interval timer. */ int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) { const struct k_clock *kc = timr->kclock; bool sigev_none; ktime_t expires; if (old_setting) common_timer_get(timr, old_setting); /* Prevent rearming by clearing the interval */ timr->it_interval = 0; /* * Careful here. On SMP systems the timer expiry function could be * active and spinning on timr->it_lock. */ if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; timr->it_active = 0; timr->it_requeue_pending = (timr->it_requeue_pending + 2) & ~REQUEUE_PENDING; timr->it_overrun_last = 0; /* Switch off the timer when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); if (flags & TIMER_ABSTIME) expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); timr->it_active = !sigev_none; return 0; } static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { const struct k_clock *kc; struct k_itimer *timr; unsigned long flags; int error = 0; if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); timr = lock_timer(timer_id, &flags); retry: if (!timr) return -EINVAL; kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); if (error == TIMER_RETRY) { // We already got the old time... old_spec64 = NULL; /* Unlocks and relocks the timer if it still exists */ timr = timer_wait_running(timr, &flags); goto retry; } unlock_timer(timr, flags); return error; } /* Set a POSIX.1b interval timer */ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct __kernel_itimerspec __user *, new_setting, struct __kernel_itimerspec __user *, old_setting) { struct itimerspec64 new_spec, old_spec, *rtn; int error = 0; if (!new_setting) return -EINVAL; if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; rtn = old_setting ? &old_spec : NULL; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { if (put_itimerspec64(&old_spec, old_setting)) error = -EFAULT; } return error; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags, struct old_itimerspec32 __user *, new, struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; int error = 0; if (!new) return -EINVAL; if (get_old_itimerspec32(&new_spec, new)) return -EFAULT; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { if (put_old_itimerspec32(&old_spec, old)) error = -EFAULT; } return error; } #endif int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; timer->it_interval = 0; if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; timer->it_active = 0; return 0; } static inline int timer_delete_hook(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; return kc->timer_del(timer); } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; unsigned long flags; timer = lock_timer(timer_id, &flags); retry_delete: if (!timer) return -EINVAL; if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { /* Unlocks and relocks the timer if it still exists */ timer = timer_wait_running(timer, &flags); goto retry_delete; } spin_lock(¤t->sighand->siglock); list_del(&timer->list); spin_unlock(¤t->sighand->siglock); /* * A concurrent lookup could check timer::it_signal lockless. It * will reevaluate with timer::it_lock held and observe the NULL. */ WRITE_ONCE(timer->it_signal, NULL); unlock_timer(timer, flags); posix_timer_unhash_and_free(timer); return 0; } /* * Delete a timer if it is armed, remove it from the hash and schedule it * for RCU freeing. */ static void itimer_delete(struct k_itimer *timer) { unsigned long flags; /* * irqsave is required to make timer_wait_running() work. */ spin_lock_irqsave(&timer->it_lock, flags); retry_delete: /* * Even if the timer is not longer accessible from other tasks * it still might be armed and queued in the underlying timer * mechanism. Worse, that timer mechanism might run the expiry * function concurrently. */ if (timer_delete_hook(timer) == TIMER_RETRY) { /* * Timer is expired concurrently, prevent livelocks * and pointless spinning on RT. * * timer_wait_running() drops timer::it_lock, which opens * the possibility for another task to delete the timer. * * That's not possible here because this is invoked from * do_exit() only for the last thread of the thread group. * So no other task can access and delete that timer. */ if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) return; goto retry_delete; } list_del(&timer->list); /* * Setting timer::it_signal to NULL is technically not required * here as nothing can access the timer anymore legitimately via * the hash table. Set it to NULL nevertheless so that all deletion * paths are consistent. */ WRITE_ONCE(timer->it_signal, NULL); spin_unlock_irqrestore(&timer->it_lock, flags); posix_timer_unhash_and_free(timer); } /* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. */ void exit_itimers(struct task_struct *tsk) { struct list_head timers; struct k_itimer *tmr; if (list_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ spin_lock_irq(&tsk->sighand->siglock); list_replace_init(&tsk->signal->posix_timers, &timers); spin_unlock_irq(&tsk->sighand->siglock); /* The timers are not longer accessible via tsk::signal */ while (!list_empty(&timers)) { tmr = list_first_entry(&timers, struct k_itimer, list); itimer_delete(tmr); } } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp; if (!kc || !kc->clock_set) return -EINVAL; if (get_timespec64(&new_tp, tp)) return -EFAULT; /* * Permission checks have to be done inside the clock specific * setter callback. */ return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp; int error; if (!kc) return -EINVAL; error = kc->clock_get_timespec(which_clock, &kernel_tp); if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; return error; } int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx) { const struct k_clock *kc = clockid_to_kclock(which_clock); if (!kc) return -EINVAL; if (!kc->clock_adj) return -EOPNOTSUPP; return kc->clock_adj(which_clock, ktx); } SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, struct __kernel_timex __user *, utx) { struct __kernel_timex ktx; int err; if (copy_from_user(&ktx, utx, sizeof(ktx))) return -EFAULT; err = do_clock_adjtime(which_clock, &ktx); if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) return -EFAULT; return err; } /** * sys_clock_getres - Get the resolution of a clock * @which_clock: The clock to get the resolution for * @tp: Pointer to a a user space timespec64 for storage * * POSIX defines: * * "The clock_getres() function shall return the resolution of any * clock. Clock resolutions are implementation-defined and cannot be set by * a process. If the argument res is not NULL, the resolution of the * specified clock shall be stored in the location pointed to by res. If * res is NULL, the clock resolution is not returned. If the time argument * of clock_settime() is not a multiple of res, then the value is truncated * to a multiple of res." * * Due to the various hardware constraints the real resolution can vary * wildly and even change during runtime when the underlying devices are * replaced. The kernel also can use hardware devices with different * resolutions for reading the time and for arming timers. * * The kernel therefore deviates from the POSIX spec in various aspects: * * 1) The resolution returned to user space * * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI, * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALAREM and CLOCK_MONOTONIC_RAW * the kernel differentiates only two cases: * * I) Low resolution mode: * * When high resolution timers are disabled at compile or runtime * the resolution returned is nanoseconds per tick, which represents * the precision at which timers expire. * * II) High resolution mode: * * When high resolution timers are enabled the resolution returned * is always one nanosecond independent of the actual resolution of * the underlying hardware devices. * * For CLOCK_*_ALARM the actual resolution depends on system * state. When system is running the resolution is the same as the * resolution of the other clocks. During suspend the actual * resolution is the resolution of the underlying RTC device which * might be way less precise than the clockevent device used during * running state. * * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution * returned is always nanoseconds per tick. * * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution * returned is always one nanosecond under the assumption that the * underlying scheduler clock has a better resolution than nanoseconds * per tick. * * For dynamic POSIX clocks (PTP devices) the resolution returned is * always one nanosecond. * * 2) Affect on sys_clock_settime() * * The kernel does not truncate the time which is handed in to * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only * affects the presicion of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. @tp contains the resolution * -EINVAL @which_clock is not a valid clock ID * -EFAULT Copying the resolution to @tp faulted * -ENODEV Dynamic POSIX clock is not backed by a device * -EOPNOTSUPP Dynamic POSIX clock does not support getres() */ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp; int error; if (!kc) return -EINVAL; error = kc->clock_getres(which_clock, &rtn_tp); if (!error && tp && put_timespec64(&rtn_tp, tp)) error = -EFAULT; return error; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock, struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; if (!kc || !kc->clock_set) return -EINVAL; if (get_old_timespec32(&ts, tp)) return -EFAULT; return kc->clock_set(which_clock, &ts); } SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; int err; if (!kc) return -EINVAL; err = kc->clock_get_timespec(which_clock, &ts); if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; return err; } SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, struct old_timex32 __user *, utp) { struct __kernel_timex ktx; int err; err = get_old_timex32(&ktx, utp); if (err) return err; err = do_clock_adjtime(which_clock, &ktx); if (err >= 0 && put_old_timex32(utp, &ktx)) return -EFAULT; return err; } SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; int err; if (!kc) return -EINVAL; err = kc->clock_getres(which_clock, &ts); if (!err && tp && put_old_timespec32(&ts, tp)) return -EFAULT; return err; } #endif /* * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI */ static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } /* * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME * * Absolute nanosleeps for these clocks are time-namespace adjusted. */ static int common_nsleep_timens(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); if (flags & TIMER_ABSTIME) texp = timens_ktime_to_host(which_clock, texp); return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -EOPNOTSUPP; if (get_timespec64(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return kc->nsleep(which_clock, flags, &t); } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -EOPNOTSUPP; if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return kc->nsleep(which_clock, flags, &t); } #endif static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_realtime_timespec, .clock_get_ktime = posix_get_realtime_ktime, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_timespec, .clock_get_ktime = posix_get_monotonic_ktime, .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic_raw = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_raw, }; static const struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get_timespec = posix_get_realtime_coarse, }; static const struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get_timespec = posix_get_monotonic_coarse, }; static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, .clock_get_ktime = posix_get_tai_ktime, .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, .clock_get_ktime = posix_get_boottime_ktime, .clock_get_timespec = posix_get_boottime_timespec, .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME] = &clock_realtime, [CLOCK_MONOTONIC] = &clock_monotonic, [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, [CLOCK_BOOTTIME] = &clock_boottime, [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, }; static const struct k_clock *clockid_to_kclock(const clockid_t id) { clockid_t idx = id; if (id < 0) { return (id & CLOCKFD_MASK) == CLOCKFD ? &clock_posix_dynamic : &clock_posix_cpu; } if (id >= ARRAY_SIZE(posix_clocks)) return NULL; return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } |
33 112 112 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PTRACE_H #define _LINUX_PTRACE_H #include <linux/compiler.h> /* For unlikely. */ #include <linux/sched.h> /* For struct task_struct. */ #include <linux/sched/signal.h> /* For send_sig(), same_thread_group(), etc. */ #include <linux/err.h> /* for IS_ERR_VALUE */ #include <linux/bug.h> /* For BUG_ON. */ #include <linux/pid_namespace.h> /* For task_active_pid_ns. */ #include <uapi/linux/ptrace.h> #include <linux/seccomp.h> /* Add sp to seccomp_data, as seccomp is user API, we don't want to modify it */ struct syscall_info { __u64 sp; struct seccomp_data data; }; extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); /* * Ptrace flags * * The owner ship rules for task->ptrace which holds the ptrace * flags is simple. When a task is running it owns it's task->ptrace * flags. When the a task is stopped the ptracer owns task->ptrace. */ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ #define PT_EVENT_FLAG(event) (1 << (PT_OPT_FLAG_SHIFT + (event))) #define PT_TRACESYSGOOD PT_EVENT_FLAG(0) #define PT_TRACE_FORK PT_EVENT_FLAG(PTRACE_EVENT_FORK) #define PT_TRACE_VFORK PT_EVENT_FLAG(PTRACE_EVENT_VFORK) #define PT_TRACE_CLONE PT_EVENT_FLAG(PTRACE_EVENT_CLONE) #define PT_TRACE_EXEC PT_EVENT_FLAG(PTRACE_EVENT_EXEC) #define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE) #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) #define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) #define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) extern long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len); extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern void ptrace_disable(struct task_struct *); extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern int ptrace_notify(int exit_code, unsigned long message); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 #define PTRACE_MODE_FSCREDS 0x08 #define PTRACE_MODE_REALCREDS 0x10 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) /** * ptrace_may_access - check whether the caller is permitted to access * a target task. * @task: target task * @mode: selects type of access and caller credentials * * Returns true on success, false on denial. * * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must * be set in @mode to specify whether the access was requested through * a filesystem syscall (should use effective capabilities and fsuid * of the caller) or through an explicit syscall such as * process_vm_writev or ptrace (and should use the real credentials). */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); } static inline void ptrace_unlink(struct task_struct *child) { if (unlikely(child->ptrace)) __ptrace_unlink(child); } int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long data); int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, unsigned long data); /** * ptrace_parent - return the task that is tracing the given task * @task: task to consider * * Returns %NULL if no one is tracing @task, or the &struct task_struct * pointer to its tracer. * * Must called under rcu_read_lock(). The pointer returned might be kept * live only by RCU. During exec, this may be called with task_lock() held * on @task, still held from when check_unsafe_exec() was called. */ static inline struct task_struct *ptrace_parent(struct task_struct *task) { if (unlikely(task->ptrace)) return rcu_dereference(task->parent); return NULL; } /** * ptrace_event_enabled - test whether a ptrace event is enabled * @task: ptracee of interest * @event: %PTRACE_EVENT_* to test * * Test whether @event is enabled for ptracee @task. * * Returns %true if @event is enabled, %false otherwise. */ static inline bool ptrace_event_enabled(struct task_struct *task, int event) { return task->ptrace & PT_EVENT_FLAG(event); } /** * ptrace_event - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @message: value for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @message * to the ptrace parent. * * Called without locks. */ static inline void ptrace_event(int event, unsigned long message) { if (unlikely(ptrace_event_enabled(current, event))) { ptrace_notify((event << 8) | SIGTRAP, message); } else if (event == PTRACE_EVENT_EXEC) { /* legacy EXEC report via SIGTRAP */ if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED) send_sig(SIGTRAP, current, 0); } } /** * ptrace_event_pid - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @pid: process identifier for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @pid * to the ptrace parent. @pid is reported as the pid_t seen from the * ptrace parent's pid namespace. * * Called without locks. */ static inline void ptrace_event_pid(int event, struct pid *pid) { /* * FIXME: There's a potential race if a ptracer in a different pid * namespace than parent attaches between computing message below and * when we acquire tasklist_lock in ptrace_stop(). If this happens, * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. */ unsigned long message = 0; struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(rcu_dereference(current->parent)); if (ns) message = pid_nr_ns(pid, ns); rcu_read_unlock(); ptrace_event(event, message); } /** * ptrace_init_task - initialize ptrace state for a new child * @child: new child task * @ptrace: true if child should be ptrace'd by parent's tracer * * This is called immediately after adding @child to its parent's children * list. @ptrace is false in the normal case, and true to ptrace @child. * * Called with current's siglock and write_lock_irq(&tasklist_lock) held. */ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); else sigaddset(&child->pending.signal, SIGSTOP); } else child->ptracer_cred = NULL; } /** * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped * @task: task in %EXIT_DEAD state * * Called with write_lock(&tasklist_lock) held. */ static inline void ptrace_release_task(struct task_struct *task) { BUG_ON(!list_empty(&task->ptraced)); ptrace_unlink(task); BUG_ON(!list_empty(&task->ptrace_entry)); } #ifndef force_successful_syscall_return /* * System call handlers that, upon successful completion, need to return a * negative value should call force_successful_syscall_return() right before * returning. On architectures where the syscall convention provides for a * separate error flag (e.g., alpha, ia64, ppc{,64}, sparc{,64}, possibly * others), this macro can be used to ensure that the error flag will not get * set. On architectures which do not support a separate error flag, the macro * is a no-op and the spurious error condition needs to be filtered out by some * other means (e.g., in user-level, by passing an extra argument to the * syscall handler, or something along those lines). */ #define force_successful_syscall_return() do { } while (0) #endif #ifndef is_syscall_success /* * On most systems we can tell if a syscall is a success based on if the retval * is an error value. On some systems like ia64 and powerpc they have different * indicators of success/failure and must define their own. */ #define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) #endif /* * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__. * * These do-nothing inlines are used when the arch does not * implement single-step. The kerneldoc comments are here * to document the interface for all arch definitions. */ #ifndef arch_has_single_step /** * arch_has_single_step - does this CPU support user-mode single-step? * * If this is defined, then there must be function declarations or * inlines for user_enable_single_step() and user_disable_single_step(). * arch_has_single_step() should evaluate to nonzero iff the machine * supports instruction single-step for user mode. * It can be a constant or it can test a CPU feature bit. */ #define arch_has_single_step() (0) /** * user_enable_single_step - single-step in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_single_step() has returned nonzero. * Set @task so that when it returns to user mode, it will trap after the * next single instruction executes. If arch_has_block_step() is defined, * this must clear the effects of user_enable_block_step() too. */ static inline void user_enable_single_step(struct task_struct *task) { BUG(); /* This can never be called. */ } /** * user_disable_single_step - cancel user-mode single-step * @task: either current or a task stopped in %TASK_TRACED * * Clear @task of the effects of user_enable_single_step() and * user_enable_block_step(). This can be called whether or not either * of those was ever called on @task, and even if arch_has_single_step() * returned zero. */ static inline void user_disable_single_step(struct task_struct *task) { } #else extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); #endif /* arch_has_single_step */ #ifndef arch_has_block_step /** * arch_has_block_step - does this CPU support user-mode block-step? * * If this is defined, then there must be a function declaration or inline * for user_enable_block_step(), and arch_has_single_step() must be defined * too. arch_has_block_step() should evaluate to nonzero iff the machine * supports step-until-branch for user mode. It can be a constant or it * can test a CPU feature bit. */ #define arch_has_block_step() (0) /** * user_enable_block_step - step until branch in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_block_step() has returned nonzero, * and will never be called when single-instruction stepping is being used. * Set @task so that when it returns to user mode, it will trap after the * next branch or trap taken. */ static inline void user_enable_block_step(struct task_struct *task) { BUG(); /* This can never be called. */ } #else extern void user_enable_block_step(struct task_struct *); #endif /* arch_has_block_step */ #ifdef ARCH_HAS_USER_SINGLE_STEP_REPORT extern void user_single_step_report(struct pt_regs *regs); #else static inline void user_single_step_report(struct pt_regs *regs) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = SI_USER; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } #endif #ifndef arch_ptrace_stop_needed /** * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called * * This is called with the siglock held, to decide whether or not it's * necessary to release the siglock and call arch_ptrace_stop(). It can be * defined to a constant if arch_ptrace_stop() is never required, or always * is. On machines where this makes sense, it should be defined to a quick * test to optimize out calling arch_ptrace_stop() when it would be * superfluous. For example, if the thread has not been back to user mode * since the last stop, the thread state might indicate that nothing needs * to be done. * * This is guaranteed to be invoked once before a task stops for ptrace and * may include arch-specific operations necessary prior to a ptrace stop. */ #define arch_ptrace_stop_needed() (0) #endif #ifndef arch_ptrace_stop /** * arch_ptrace_stop - Do machine-specific work before stopping for ptrace * * This is called with no locks held when arch_ptrace_stop_needed() has * just returned nonzero. It is allowed to block, e.g. for user memory * access. The arch can have machine-specific work to be done before * ptrace stops. On ia64, register backing store gets written back to user * memory here. Since this can be costly (requires dropping the siglock), * we only do it when the arch requires it for this particular stop, as * indicated by arch_ptrace_stop_needed(). */ #define arch_ptrace_stop() do { } while (0) #endif #ifndef current_pt_regs #define current_pt_regs() task_pt_regs(current) #endif #ifndef current_user_stack_pointer #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif #ifndef exception_ip #define exception_ip(x) instruction_pointer(x) #endif extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); /* * ptrace report for syscall entry and exit looks identical. */ static inline int ptrace_report_syscall(unsigned long message) { int ptrace = current->ptrace; int signr; if (!(ptrace & PT_PTRACED)) return 0; signr = ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0), message); /* * this isn't the same as continuing with a signal, but it will do * for normal use. strace only continues with a signal if the * stopping signal is not SIGTRAP. -brl */ if (signr) send_sig(signr, current, 1); return fatal_signal_pending(current); } /** * ptrace_report_syscall_entry - task is about to attempt a system call * @regs: user register state of current task * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just * entered the kernel for a system call. Full user register state is * available here. Changing the values in @regs can affect the system * call number and arguments to be tried. It is safe to block here, * preventing the system call from beginning. * * Returns zero normally, or nonzero if the calling arch code should abort * the system call. That must prevent normal entry so no system call is * made. If @task ever returns to user mode after this, its register state * is unspecified, but should be something harmless like an %ENOSYS error * return. It should preserve enough information so that syscall_rollback() * can work (see asm-generic/syscall.h). * * Called without locks, just after entering kernel mode. */ static inline __must_check int ptrace_report_syscall_entry( struct pt_regs *regs) { return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY); } /** * ptrace_report_syscall_exit - task has just finished a system call * @regs: user register state of current task * @step: nonzero if simulating single-step or block-step * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when * the current task has just finished an attempted system call. Full * user register state is available here. It is safe to block here, * preventing signals from being processed. * * If @step is nonzero, this report is also in lieu of the normal * trap that would follow the system call instruction because * user_enable_block_step() or user_enable_single_step() was used. * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. * * Called without locks, just before checking for pending signals. */ static inline void ptrace_report_syscall_exit(struct pt_regs *regs, int step) { if (step) user_single_step_report(regs); else ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT); } #endif |
8 2 7 16 16 16 16 16 16 16 11 11 15 38 1 6 5 22 16 3 7 7 5 9 14 6 10 13 3 16 16 16 16 16 4 4 6 7 7 1 8 2 3 3 4 4 9 9 3 3 3 3 3 6 6 6 6 3 6 5 2 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 16 4 1 2 14 14 14 1 547 1 10 1 2 4 4 13 3 6 10 21 6 22 5 10 16 16 11 6 11 3 1 2 22 22 26 11 16 2 1 8 1 1 1 42 20 2 52 9 48 1 1 11 1 8 9 1 17 1 6 1 8 1 2 3 3 21 1 5 1 4 18 1 14 11 1 22 1 73 73 1 27 26 11 11 4 3 1 16 1 4 6 2 10 1 3 6 1 16 3 2 5 11 5 2 6 15 15 1 6 11 2 13 5 20 20 3 3 2 13 5 2 1 7 19 12 4 2 13 27 27 2 2 2 1 1 7 7 1 1 17 17 154 2 1573 1395 536 398 26 154 2 4 4 4 4 14 4 14 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 | // SPDX-License-Identifier: GPL-2.0 /* Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #include <linux/nexthop.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/arp.h> #include <net/ipv6_stubs.h> #include <net/lwtunnel.h> #include <net/ndisc.h> #include <net/nexthop.h> #include <net/route.h> #include <net/sock.h> #define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ) #define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */ static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); #define NH_DEV_HASHBITS 8 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) #define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS | \ NHA_OP_FLAG_DUMP_HW_STATS) static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_GROUP] = { .type = NLA_BINARY }, [NHA_GROUP_TYPE] = { .type = NLA_U16 }, [NHA_BLACKHOLE] = { .type = NLA_FLAG }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_GATEWAY] = { .type = NLA_BINARY }, [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_RES_GROUP] = { .type = NLA_NESTED }, [NHA_HW_STATS_ENABLE] = NLA_POLICY_MAX(NLA_U32, true), }; static const struct nla_policy rtm_nh_policy_get[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_policy_del[] = { [NHA_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_OIF] = { .type = NLA_U32 }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_FDB] = { .type = NLA_FLAG }, [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32, NHA_OP_FLAGS_DUMP_ALL), }; static const struct nla_policy rtm_nh_res_policy_new[] = { [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_dump_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_OIF] = { .type = NLA_U32 }, [NHA_MASTER] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, }; static const struct nla_policy rtm_nh_policy_get_bucket[] = { [NHA_ID] = { .type = NLA_U32 }, [NHA_RES_BUCKET] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, }; static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; } static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, const struct nh_info *nhi) { nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4; else if (nh_info->gw_family == AF_INET6) nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6; nh_info->id = nhi->nh_parent->id; nh_info->is_reject = nhi->reject_nh; nh_info->is_fdb = nhi->fdb_nh; nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate; } static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; __nh_notifier_single_info_init(info->nh, nhi); return 0; } static void nh_notifier_single_info_fini(struct nh_notifier_info *info) { kfree(info->nh); } static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; info->type = NH_NOTIFIER_INFO_TYPE_GRP; info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh), GFP_KERNEL); if (!info->nh_grp) return -ENOMEM; info->nh_grp->num_nh = num_nh; info->nh_grp->is_fdb = nhg->fdb_nh; info->nh_grp->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, nhi); } return 0; } static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); u16 num_nh_buckets = res_table->num_nh_buckets; unsigned long size; u16 i; info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; size = struct_size(info->nh_res_table, nhs, num_nh_buckets); info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!info->nh_res_table) return -ENOMEM; info->nh_res_table->num_nh_buckets = num_nh_buckets; info->nh_res_table->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; struct nh_info *nhi; nhge = rtnl_dereference(bucket->nh_entry); nhi = rtnl_dereference(nhge->nh->nh_info); __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], nhi); } return 0; } static int nh_notifier_grp_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) return nh_notifier_mpath_info_init(info, nhg); else if (nhg->resilient) return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) kfree(info->nh_grp); else if (nhg->resilient) vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } static void nh_notifier_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { if (nh->is_group) nh_notifier_grp_info_fini(info, nh); else nh_notifier_single_info_fini(info); } static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_info_init(&info, nh); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static int nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, bool force, unsigned int *p_idle_timer_ms) { struct nh_res_table *res_table; struct nh_group *nhg; struct nexthop *nh; int err = 0; /* When 'force' is false, nexthop bucket replacement is performed * because the bucket was deemed to be idle. In this case, capable * listeners can choose to perform an atomic replacement: The bucket is * only replaced if it is inactive. However, if the idle timer interval * is smaller than the interval in which a listener is querying * buckets' activity from the device, then atomic replacement should * not be tried. Pass the idle timer value to listeners, so that they * could determine which type of replacement to perform. */ if (force) { *p_idle_timer_ms = 0; return 0; } rcu_read_lock(); nh = nexthop_find_by_id(info->net, info->id); if (!nh) { err = -EINVAL; goto out; } nhg = rcu_dereference(nh->nh_grp); res_table = rcu_dereference(nhg->res_table); *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); out: rcu_read_unlock(); return err; } static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi) { unsigned int idle_timer_ms; int err; err = nh_notifier_res_bucket_idle_timer_get(info, force, &idle_timer_ms); if (err) return err; info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), GFP_KERNEL); if (!info->nh_res_bucket) return -ENOMEM; info->nh_res_bucket->bucket_index = bucket_index; info->nh_res_bucket->idle_timer_ms = idle_timer_ms; info->nh_res_bucket->force = force; __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); return 0; } static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) { kfree(info->nh_res_bucket); } static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nhg_id, }; int err; if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, oldi, newi); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_BUCKET_REPLACE, &info); nh_notifier_res_bucket_info_fini(&info); return notifier_to_errno(err); } /* There are three users of RES_TABLE, and NHs etc. referenced from there: * * 1) a collection of callbacks for NH maintenance. This operates under * RTNL, * 2) the delayed work that gradually balances the resilient table, * 3) and nexthop_select_path(), operating under RCU. * * Both the delayed work and the RTNL block are writers, and need to * maintain mutual exclusion. Since there are only two and well-known * writers for each table, the RTNL code can make sure it has exclusive * access thus: * * - Have the DW operate without locking; * - synchronously cancel the DW; * - do the writing; * - if the write was not actually a delete, call upkeep, which schedules * DW again if necessary. * * The functions that are always called from the RTNL context use * rtnl_dereference(). The functions that can also be called from the DW do * a raw dereference and rely on the above mutual exclusion scheme. */ #define nh_res_dereference(p) (rcu_dereference_raw(p)) static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nexthop *old_nh, struct nexthop *new_nh, struct netlink_ext_ack *extack) { struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); struct nh_info *newi = nh_res_dereference(new_nh->nh_info); return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, force, oldi, newi, extack); } static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nh->id, }; struct nh_group *nhg; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; /* At this point, the nexthop buckets are still not populated. Only * emit a notification with the logical nexthops, so that a listener * could potentially veto it in case of unsupported configuration. */ nhg = rtnl_dereference(nh->nh_grp); err = nh_notifier_mpath_info_init(&info, nhg); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, &info); kfree(info.nh_grp); return notifier_to_errno(err); } static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; err = nh_notifier_info_init(&info, nh); if (err) return err; err = nb->notifier_call(nb, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; return (val ^ (val >> NH_DEV_HASHBITS) ^ (val >> (NH_DEV_HASHBITS * 2))) & mask; } static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) { struct net_device *dev = nhi->fib_nhc.nhc_dev; struct hlist_head *head; unsigned int hash; WARN_ON(!dev); hash = nh_dev_hashfn(dev->ifindex); head = &net->nexthop.devhash[hash]; hlist_add_head(&nhi->dev_hash, head); } static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; nhg = rcu_dereference_raw(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; WARN_ON(!list_empty(&nhge->nh_list)); free_percpu(nhge->stats); nexthop_put(nhge->nh); } WARN_ON(nhg->spare == nhg); if (nhg->resilient) vfree(rcu_dereference_raw(nhg->res_table)); kfree(nhg->spare); kfree(nhg); } static void nexthop_free_single(struct nexthop *nh) { struct nh_info *nhi; nhi = rcu_dereference_raw(nh->nh_info); switch (nhi->family) { case AF_INET: fib_nh_release(nh->net, &nhi->fib_nh); break; case AF_INET6: ipv6_stub->fib6_nh_release(&nhi->fib6_nh); break; } kfree(nhi); } void nexthop_free_rcu(struct rcu_head *head) { struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) nexthop_free_group(nh); else nexthop_free_single(nh); kfree(nh); } EXPORT_SYMBOL_GPL(nexthop_free_rcu); static struct nexthop *nexthop_alloc(void) { struct nexthop *nh; nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); if (nh) { INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); } return nh; } static struct nh_group *nexthop_grp_alloc(u16 num_nh) { struct nh_group *nhg; nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; return nhg; } static void nh_res_table_upkeep_dw(struct work_struct *work); static struct nh_res_table * nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) { const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; struct nh_res_table *res_table; unsigned long size; size = struct_size(res_table, nh_buckets, num_nh_buckets); res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!res_table) return NULL; res_table->net = net; res_table->nhg_id = nhg_id; INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); INIT_LIST_HEAD(&res_table->uw_nh_entries); res_table->idle_timer = cfg->nh_grp_res_idle_timer; res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; res_table->num_nh_buckets = num_nh_buckets; return res_table; } static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) ; } /* no reference taken; rcu lock or rtnl must be held */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id) { struct rb_node **pp, *parent = NULL, *next; pp = &net->nexthop.rb_root.rb_node; while (1) { struct nexthop *nh; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (id < nh->id) pp = &next->rb_left; else if (id > nh->id) pp = &next->rb_right; else return nh; } return NULL; } EXPORT_SYMBOL_GPL(nexthop_find_by_id); /* used for auto id allocation; called with rtnl held */ static u32 nh_find_unused_id(struct net *net) { u32 id_start = net->nexthop.last_id_allocated; while (1) { net->nexthop.last_id_allocated++; if (net->nexthop.last_id_allocated == id_start) break; if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) return net->nexthop.last_id_allocated; } return 0; } static void nh_res_time_set_deadline(unsigned long next_time, unsigned long *deadline) { if (time_before(next_time, *deadline)) *deadline = next_time; } static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) { if (list_empty(&res_table->uw_nh_entries)) return 0; return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); } static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); struct nlattr *nest; nest = nla_nest_start(skb, NHA_RES_GROUP); if (!nest) return -EMSGSIZE; if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, res_table->num_nh_buckets) || nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, jiffies_to_clock_t(res_table->idle_timer)) || nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, jiffies_to_clock_t(res_table->unbalanced_timer)) || nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, nh_res_table_unbalanced_time(res_table), NHA_RES_GROUP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge) { struct nh_grp_entry_stats *cpu_stats; cpu_stats = get_cpu_ptr(nhge->stats); u64_stats_update_begin(&cpu_stats->syncp); u64_stats_inc(&cpu_stats->packets); u64_stats_update_end(&cpu_stats->syncp); put_cpu_ptr(cpu_stats); } static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge, u64 *ret_packets) { int i; *ret_packets = 0; for_each_possible_cpu(i) { struct nh_grp_entry_stats *cpu_stats; unsigned int start; u64 packets; cpu_stats = per_cpu_ptr(nhge->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); packets = u64_stats_read(&cpu_stats->packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); *ret_packets += packets; } } static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); info->id = nh->id; info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS; info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats, stats, nhg->num_nh), GFP_KERNEL); if (!info->nh_grp_hw_stats) return -ENOMEM; info->nh_grp_hw_stats->num_nh = nhg->num_nh; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; info->nh_grp_hw_stats->stats[i].id = nhge->nh->id; } return 0; } static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info) { kfree(info->nh_grp_hw_stats); } void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, unsigned int nh_idx, u64 delta_packets) { info->hw_stats_used = true; info->stats[nh_idx].packets += delta_packets; } EXPORT_SYMBOL(nh_grp_hw_stats_report_delta); static void nh_grp_hw_stats_apply_update(struct nexthop *nh, struct nh_notifier_info *info) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets; } } static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used) { struct nh_notifier_info info = { .net = nh->net, }; struct net *net = nh->net; int err; if (nexthop_notifiers_is_empty(net)) { *hw_stats_used = false; return 0; } err = nh_notifier_grp_hw_stats_init(&info, nh); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, &info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ nh_grp_hw_stats_apply_update(nh, &info); *hw_stats_used = info.nh_grp_hw_stats->hw_stats_used; nh_notifier_grp_hw_stats_fini(&info); return notifier_to_errno(err); } static int nla_put_nh_group_stats_entry(struct sk_buff *skb, struct nh_grp_entry *nhge, u32 op_flags) { struct nlattr *nest; u64 packets; nh_grp_entry_stats_read(nhge, &packets); nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) || nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS, packets + nhge->packets_hw)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW, nhge->packets_hw)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh, u32 op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nlattr *nest; bool hw_stats_used; int err; int i; if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats)) goto err_out; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nhg->hw_stats) { err = nh_grp_hw_stats_update(nh, &hw_stats_used); if (err) goto out; if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used)) goto err_out; } nest = nla_nest_start(skb, NHA_GROUP_STATS); if (!nest) goto err_out; for (i = 0; i < nhg->num_nh; i++) if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i], op_flags)) goto cancel_out; nla_nest_end(skb, nest); return 0; cancel_out: nla_nest_cancel(skb, nest); err_out: err = -EMSGSIZE; out: return err; } static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, u32 op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; int i; if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; nla = nla_reserve(skb, NHA_GROUP, len); if (!nla) goto nla_put_failure; p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { p->id = nhg->nh_entries[i].nh->id; p->weight = nhg->nh_entries[i].weight - 1; p += 1; } if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_STATS && (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) || nla_put_nh_group_stats(skb, nh, op_flags))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, int event, u32 portid, u32 seq, unsigned int nlflags, u32 op_flags) { struct fib6_nh *fib6_nh; struct fib_nh *fib_nh; struct nlmsghdr *nlh; struct nh_info *nhi; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = nh->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; if (nla_put_nh_group(skb, nh, op_flags)) goto nla_put_failure; goto out; } nhi = rtnl_dereference(nh->nh_info); nhm->nh_family = nhi->family; if (nhi->reject_nh) { if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; } else if (nhi->fdb_nh) { if (nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) goto nla_put_failure; } nhm->nh_scope = nhi->fib_nhc.nhc_scope; switch (nhi->family) { case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; case AF_INET6: fib6_nh = &nhi->fib6_nh; if (fib6_nh->fib_nh_gw_family && nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) goto nla_put_failure; break; } if (nhi->fib_nhc.nhc_lwtstate && lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, NHA_ENCAP, NHA_ENCAP_TYPE) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) { return nla_total_size(0) + /* NHA_RES_GROUP */ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ } static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; size_t tot = nla_total_size(sz) + nla_total_size(2); /* NHA_GROUP_TYPE */ if (nhg->resilient) tot += nh_nlmsg_size_grp_res(nhg); return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); size_t sz; /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE * are mutually exclusive */ sz = nla_total_size(4); /* NHA_OIF */ switch (nhi->family) { case AF_INET: if (nhi->fib_nh.fib_nh_gw_family) sz += nla_total_size(4); /* NHA_GATEWAY */ break; case AF_INET6: /* NHA_GATEWAY */ if (nhi->fib6_nh.fib_nh_gw_family) sz += nla_total_size(sizeof(const struct in6_addr)); break; } if (nhi->fib_nhc.nhc_lwtstate) { sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ } return sz; } static size_t nh_nlmsg_size(struct nexthop *nh) { size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh); else sz += nh_nlmsg_size_single(nh); return sz; } static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) { unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); if (!skb) goto errout; err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0); if (err < 0) { /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, info->nlh, gfp_any()); return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) { return (unsigned long)atomic_long_read(&bucket->used_time); } static unsigned long nh_res_bucket_idle_point(const struct nh_res_table *res_table, const struct nh_res_bucket *bucket, unsigned long now) { unsigned long time = nh_res_bucket_used_time(bucket); /* Bucket was not used since it was migrated. The idle time is now. */ if (time == bucket->migrated_time) return now; return time + res_table->idle_timer; } static unsigned long nh_res_table_unb_point(const struct nh_res_table *res_table) { return res_table->unbalanced_since + res_table->unbalanced_timer; } static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, struct nh_res_bucket *bucket) { unsigned long now = jiffies; atomic_long_set(&bucket->used_time, (long)now); bucket->migrated_time = now; } static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) { atomic_long_set(&bucket->used_time, (long)jiffies); } static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) { unsigned long used_time = nh_res_bucket_used_time(bucket); return jiffies_delta_to_clock_t(jiffies - used_time); } static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, struct nh_res_bucket *bucket, u16 bucket_index, int event, u32 portid, u32 seq, unsigned int nlflags, struct netlink_ext_ack *extack) { struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nlmsghdr *nlh; struct nlattr *nest; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = bucket->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; nest = nla_nest_start(skb, NHA_RES_BUCKET); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, nh_res_bucket_idle_time(bucket), NHA_RES_BUCKET_PAD)) goto nla_put_failure_nest; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure_nest: nla_nest_cancel(skb, nest); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void nexthop_bucket_notify(struct nh_res_table *res_table, u16 bucket_index) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nexthop *nh = nhge->nh_parent; struct sk_buff *skb; int err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto errout; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); /* Nesting groups within groups is not supported. */ if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, "Hash-threshold group can not be a nexthop within a group"); return false; } if (nhg->resilient) { NL_SET_ERR_MSG(extack, "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); if (nhi->reject_nh && npaths > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } *is_fdb = nhi->fdb_nh; } return true; } static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, struct netlink_ext_ack *extack) { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); return -EINVAL; } return 0; } static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; u8 nhg_fdb = 0; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, "Invalid length for nexthop group attribute"); return -EINVAL; } /* convert len to number of nexthop ids */ len /= sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { if (nhg[i].resvd1 || nhg[i].resvd2) { NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); return -EINVAL; } if (nhg[i].weight > 254) { NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } for (j = i + 1; j < len; ++j) { if (nhg[i].id == nhg[j].id) { NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); return -EINVAL; } } } if (tb[NHA_FDB]) nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } } for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; switch (i) { case NHA_HW_STATS_ENABLE: case NHA_FDB: continue; case NHA_RES_GROUP: if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) continue; break; } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; } return 0; } static bool ipv6_good_nh(const struct fib6_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool ipv4_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool nexthop_is_good_nh(const struct nexthop *nh) { struct nh_info *nhi = rcu_dereference(nh->nh_info); switch (nhi->family) { case AF_INET: return ipv4_good_nh(&nhi->fib_nh); case AF_INET6: return ipv6_good_nh(&nhi->fib6_nh); } return false; } static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { int i; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } WARN_ON_ONCE(1); return NULL; } static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nh_grp_entry *nhge0 = NULL; int i; if (nhg->fdb_nh) return nexthop_select_path_fdb(nhg, hash); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ if (!nexthop_is_good_nh(nhge->nh)) continue; if (!nhge0) nhge0 = nhge; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } if (!nhge0) nhge0 = &nhg->nh_entries[0]; nh_grp_entry_stats_inc(nhge0); return nhge0->nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) { struct nh_res_table *res_table = rcu_dereference(nhg->res_table); u16 bucket_index = hash % res_table->num_nh_buckets; struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; /* nexthop_select_path() is expected to return a non-NULL value, so * skip protocol validation and just hand out whatever there is. */ bucket = &res_table->nh_buckets[bucket_index]; nh_res_bucket_set_busy(bucket); nhge = rcu_dereference(bucket->nh_entry); nh_grp_entry_stats_inc(nhge); return nhge->nh; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; if (!nh->is_group) return nh; nhg = rcu_dereference(nh->nh_grp); if (nhg->hash_threshold) return nexthop_select_path_hthr(nhg, hash); else if (nhg->resilient) return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; } EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg) { struct nh_info *nhi; int err; if (nh->is_group) { struct nh_group *nhg; int i; nhg = rcu_dereference_rtnl(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhi = rcu_dereference_rtnl(nhge->nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } } else { nhi = rcu_dereference_rtnl(nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); static int check_src_addr(const struct in6_addr *saddr, struct netlink_ext_ack *extack) { if (!ipv6_addr_any(saddr)) { NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); return -EINVAL; } return 0; } int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source * routing it can not use nexthop objects. mlxsw also does not allow * fib6_src on routes. */ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) return -EINVAL; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; } if (is_fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); return -EINVAL; } return 0; no_v4_nh: NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); return -EINVAL; } EXPORT_SYMBOL_GPL(fib6_check_nexthop); /* if existing nexthop has ipv6 routes linked to it, need * to verify this new spec works with ipv6 */ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib6_info *f6i; if (list_empty(&old->f6i_list)) return 0; list_for_each_entry(f6i, &old->f6i_list, nh_list) { if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) return -EINVAL; } return fib6_check_nexthop(new, NULL, extack); } static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); return -EINVAL; } if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); return -EINVAL; } return 0; } /* Invoked by fib add code to verify nexthop by id is ok with * config for prefix; parts of fib_check_nh not done when nexthop * object is used. */ int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { struct nh_info *nhi; int err = 0; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } /* all nexthops in a group have the same scope */ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); err = nexthop_check_scope(nhi, scope, extack); } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } err = nexthop_check_scope(nhi, scope, extack); } out: return err; } static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib_info *fi; list_for_each_entry(fi, &old->fi_list, nh_list) { int err; err = fib_check_nexthop(new, fi->fib_scope, extack); if (err) return err; } return 0; } static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets == nhge->res.wants_buckets; } static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets > nhge->res.wants_buckets; } static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets < nhge->res.wants_buckets; } static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) { return list_empty(&res_table->uw_nh_entries); } static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) { struct nh_grp_entry *nhge; if (bucket->occupied) { nhge = nh_res_dereference(bucket->nh_entry); nhge->res.count_buckets--; bucket->occupied = false; } } static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, struct nh_grp_entry *nhge) { nh_res_bucket_unset_nh(bucket); bucket->occupied = true; rcu_assign_pointer(bucket->nh_entry, nhge); nhge->res.count_buckets++; } static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, struct nh_res_bucket *bucket, unsigned long *deadline, bool *force) { unsigned long now = jiffies; struct nh_grp_entry *nhge; unsigned long idle_point; if (!bucket->occupied) { /* The bucket is not occupied, its NHGE pointer is either * NULL or obsolete. We _have to_ migrate: set force. */ *force = true; return true; } nhge = nh_res_dereference(bucket->nh_entry); /* If the bucket is populated by an underweight or balanced * nexthop, do not migrate. */ if (!nh_res_nhge_is_ow(nhge)) return false; /* At this point we know that the bucket is populated with an * overweight nexthop. It needs to be migrated to a new nexthop if * the idle timer of unbalanced timer expired. */ idle_point = nh_res_bucket_idle_point(res_table, bucket, now); if (time_after_eq(now, idle_point)) { /* The bucket is idle. We _can_ migrate: unset force. */ *force = false; return true; } /* Unbalanced timer of 0 means "never force". */ if (res_table->unbalanced_timer) { unsigned long unb_point; unb_point = nh_res_table_unb_point(res_table); if (time_after(now, unb_point)) { /* The bucket is not idle, but the unbalanced timer * expired. We _can_ migrate, but set force anyway, * so that drivers know to ignore activity reports * from the HW. */ *force = true; return true; } nh_res_time_set_deadline(unb_point, deadline); } nh_res_time_set_deadline(idle_point, deadline); return false; } static bool nh_res_bucket_migrate(struct nh_res_table *res_table, u16 bucket_index, bool notify, bool notify_nl, bool force) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *new_nhge; struct netlink_ext_ack extack; int err; new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, struct nh_grp_entry, res.uw_nh_entry); if (WARN_ON_ONCE(!new_nhge)) /* If this function is called, "bucket" is either not * occupied, or it belongs to a next hop that is * overweight. In either case, there ought to be a * corresponding underweight next hop. */ return false; if (notify) { struct nh_grp_entry *old_nhge; old_nhge = nh_res_dereference(bucket->nh_entry); err = call_nexthop_res_bucket_notifiers(res_table->net, res_table->nhg_id, bucket_index, force, old_nhge->nh, new_nhge->nh, &extack); if (err) { pr_err_ratelimited("%s\n", extack._msg); if (!force) return false; /* It is not possible to veto a forced replacement, so * just clear the hardware flags from the nexthop * bucket to indicate to user space that this bucket is * not correctly populated in hardware. */ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); } } nh_res_bucket_set_nh(bucket, new_nhge); nh_res_bucket_set_idle(res_table, bucket); if (notify_nl) nexthop_bucket_notify(res_table, bucket_index); if (nh_res_nhge_is_balanced(new_nhge)) list_del(&new_nhge->res.uw_nh_entry); return true; } #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) static void nh_res_table_upkeep(struct nh_res_table *res_table, bool notify, bool notify_nl) { unsigned long now = jiffies; unsigned long deadline; u16 i; /* Deadline is the next time that upkeep should be run. It is the * earliest time at which one of the buckets might be migrated. * Start at the most pessimistic estimate: either unbalanced_timer * from now, or if there is none, idle_timer from now. For each * encountered time point, call nh_res_time_set_deadline() to * refine the estimate. */ if (res_table->unbalanced_timer) deadline = now + res_table->unbalanced_timer; else deadline = now + res_table->idle_timer; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; bool force; if (nh_res_bucket_should_migrate(res_table, bucket, &deadline, &force)) { if (!nh_res_bucket_migrate(res_table, i, notify, notify_nl, force)) { unsigned long idle_point; /* A driver can override the migration * decision if the HW reports that the * bucket is actually not idle. Therefore * remark the bucket as busy again and * update the deadline. */ nh_res_bucket_set_busy(bucket); idle_point = nh_res_bucket_idle_point(res_table, bucket, now); nh_res_time_set_deadline(idle_point, &deadline); } } } /* If the group is still unbalanced, schedule the next upkeep to * either the deadline computed above, or the minimum deadline, * whichever comes later. */ if (!nh_res_table_is_balanced(res_table)) { unsigned long now = jiffies; unsigned long min_deadline; min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; if (time_before(deadline, min_deadline)) deadline = min_deadline; queue_delayed_work(system_power_efficient_wq, &res_table->upkeep_dw, deadline - now); } } static void nh_res_table_upkeep_dw(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work); struct nh_res_table *res_table; res_table = container_of(dw, struct nh_res_table, upkeep_dw); nh_res_table_upkeep(res_table, true, true); } static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) { cancel_delayed_work_sync(&res_table->upkeep_dw); } static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { int prev_upper_bound = 0; int total = 0; int w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; int upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; if (nh_res_nhge_is_uw(nhge)) { if (list_empty(&res_table->uw_nh_entries)) res_table->unbalanced_since = jiffies; list_add(&nhge->res.uw_nh_entry, &res_table->uw_nh_entries); } } } /* Migrate buckets in res_table so that they reference NHGE's from NHG with * the right NH ID. Set those buckets that do not have a corresponding NHGE * entry in NHG as not occupied. */ static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, struct nh_group *nhg) { u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; bool found = false; int j; for (j = 0; j < nhg->num_nh; j++) { struct nh_grp_entry *nhge = &nhg->nh_entries[j]; if (nhge->nh->id == id) { nh_res_bucket_set_nh(bucket, nhge); found = true; break; } } if (!found) nh_res_bucket_unset_nh(bucket); } } static void replace_nexthop_grp_res(struct nh_group *oldg, struct nh_group *newg) { /* For NH group replacement, the new NHG might only have a stub * hash table with 0 buckets, because the number of buckets was not * specified. For NH removal, oldg and newg both reference the same * res_table. So in any case, in the following, we want to work * with oldg->res_table. */ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); nh_res_table_cancel_upkeep(old_res_table); nh_res_table_migrate_buckets(old_res_table, newg); nh_res_group_rebalance(newg, old_res_table); if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) old_res_table->unbalanced_since = prev_unbalanced_since; nh_res_table_upkeep(old_res_table, true, false); } static void nh_hthr_group_rebalance(struct nh_group *nhg) { int total = 0; int w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; int upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; atomic_set(&nhge->hthr.upper_bound, upper_bound); } } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; int i, j, err; WARN_ON(!nh); nhg = rtnl_dereference(nhp->nh_grp); newg = nhg->spare; /* last entry, keep it visible and remove the parent */ if (nhg->num_nh == 1) { remove_nexthop(net, nhp, nlinfo); return; } newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; newg->hash_threshold = nhg->hash_threshold; newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the one getting removed */ nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { struct nh_info *nhi; /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) newg->has_v4 = true; list_del(&nhges[i].nh_list); new_nhges[j].stats = nhges[i].stats; new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; new_nhges[j].weight = nhges[i].weight; list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); j++; } if (newg->hash_threshold) nh_hthr_group_rebalance(newg); else if (newg->resilient) replace_nexthop_grp_res(nhg, newg); rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); free_percpu(nhge->stats); nexthop_put(nhge->nh); /* Removal of a NH from a resilient group is notified through * bucket notifications. */ if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); if (err) pr_err("%s\n", extack._msg); } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (WARN_ON(!nhge->nh)) continue; list_del_init(&nhge->nh_list); } if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); nh_res_table_cancel_upkeep(res_table); } } /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { struct fib6_info *f6i, *tmp; bool do_flush = false; struct fib_info *fi; list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; } if (do_flush) fib_flush(net); /* ip6_del_rt removes the entry from this list hence the _safe */ list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); } } static void __remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { __remove_nexthop_fib(net, nh); if (nh->is_group) { remove_nexthop_group(nh, nlinfo); } else { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (nhi->fib_nhc.nhc_dev) hlist_del(&nhi->dev_hash); remove_nexthop_from_groups(net, nh, nlinfo); } } static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); if (nlinfo) nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); __remove_nexthop(net, nh, nlinfo); nh_base_seq_inc(net); nexthop_put(nh); } /* if any FIB entries reference this nexthop, any dst entries * need to be regenerated */ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh, struct nexthop *replaced_nh) { struct fib6_info *f6i; struct nh_group *nhg; int i; if (!list_empty(&nh->fi_list)) rt_cache_flush(net); list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_update_sernum(net, f6i); /* if an IPv6 group was replaced, we have to release all old * dsts to make sure all refcounts are released */ if (!replaced_nh->is_group) return; nhg = rtnl_dereference(replaced_nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info); if (nhi->family == AF_INET6) ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); } } static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_res_table *tmp_table = NULL; struct nh_res_table *new_res_table; struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); return -EINVAL; } oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); if (newg->hash_threshold != oldg->hash_threshold) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL; } if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; } else if (newg->resilient) { new_res_table = rtnl_dereference(newg->res_table); old_res_table = rtnl_dereference(oldg->res_table); /* Accept if num_nh_buckets was not given, but if it was * given, demand that the value be correct. */ if (cfg->nh_grp_res_has_num_buckets && cfg->nh_grp_res_num_buckets != old_res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); return -EINVAL; } /* Emit a pre-replace notification so that listeners could veto * a potentially unsupported configuration. Otherwise, * individual bucket replacement notifications would need to be * vetoed, which is something that should only happen if the * bucket is currently active. */ err = call_nexthop_res_table_notifiers(net, new, extack); if (err) return err; if (cfg->nh_grp_res_has_idle_timer) old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; if (cfg->nh_grp_res_has_unbalanced_timer) old_res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; replace_nexthop_grp_res(oldg, newg); tmp_table = new_res_table; rcu_assign_pointer(newg->res_table, old_res_table); rcu_assign_pointer(newg->spare->res_table, old_res_table); } /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); /* Make sure concurrent readers are not using 'oldg' anymore. */ synchronize_net(); if (newg->resilient) { rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; rcu_assign_pointer(new->nh_grp, oldg); return 0; } static void nh_group_v4_update(struct nh_group *nhg) { struct nh_grp_entry *nhges; bool has_v4 = false; int i; nhges = nhg->nh_entries; for (i = 0; i < nhg->num_nh; i++) { struct nh_info *nhi; nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) has_v4 = true; } nhg->has_v4 = has_v4; } static int replace_nexthop_single_notify_res(struct net *net, struct nh_res_table *res_table, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { u32 nhg_id = res_table->nhg_id; int err; u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) { err = __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, oldi, newi, extack); if (err) goto err_notify; } } return 0; err_notify: while (i-- > 0) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, newi, oldi, extack); } return err; } static int replace_nexthop_single_notify(struct net *net, struct nexthop *group_nh, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); struct nh_res_table *res_table; if (nhg->hash_threshold) { return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, group_nh, extack); } else if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); return replace_nexthop_single_notify_res(net, res_table, old, oldi, newi, extack); } return -EINVAL; } static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; struct nh_grp_entry *nhge; int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; /* Hardware flags were set on 'old' as 'new' is not in the red-black * tree. Therefore, inherit the flags from 'old' to 'new'. */ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; old_protocol = old->protocol; old_nh_flags = old->nh_flags; old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); /* Send a replace notification for all the groups using the nexthop. */ list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, extack); if (err) goto err_notify; } /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. */ if (oldi->family == AF_INET && newi->family == AF_INET6) { list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; nhg = rtnl_dereference(nhp->nh_grp); nh_group_v4_update(nhg); } } return 0; err_notify: rcu_assign_pointer(new->nh_info, newi); rcu_assign_pointer(old->nh_info, oldi); old->nh_flags = old_nh_flags; old->protocol = old_protocol; oldi->nh_parent = old; newi->nh_parent = new; list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct fib6_info *f6i; if (!list_empty(&nh->fi_list)) { struct fib_info *fi; /* expectation is a few fib_info per nexthop and then * a lot of routes per fib_info. So mark the fib_info * and then walk the fib tables once */ list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = true; fib_info_notify_update(net, info); list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = false; } list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_rt_update(net, f6i, info); } /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries * linked to this nexthop and for all groups that the nexthop * is a member of */ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct nh_grp_entry *nhge; __nexthop_replace_notify(net, nh, info); list_for_each_entry(nhge, &nh->grp_list, nh_list) __nexthop_replace_notify(net, nhge->nh_parent, info); } static int replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; int err; /* check that existing FIB entries are ok with the * new nexthop definition */ err = fib_check_nh_list(old, new, extack); if (err) return err; err = fib6_check_nh_list(old, new, extack); if (err) return err; if (!new->is_group) { struct nh_info *nhi = rtnl_dereference(new->nh_info); new_is_reject = nhi->reject_nh; } list_for_each_entry(nhge, &old->grp_list, nh_list) { /* if new nexthop is a blackhole, any groups using this * nexthop cannot have more than 1 path */ if (new_is_reject && nexthop_num_path(nhge->nh_parent) > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); return -EINVAL; } err = fib_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; err = fib6_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; } if (old->is_group) err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); if (!err) { nh_rt_cache_flush(net, old, new); __remove_nexthop(net, new, NULL); nexthop_put(new); } return err; } /* called with rtnl_lock held */ static int insert_nexthop(struct net *net, struct nexthop *new_nh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct rb_node **pp, *parent = NULL, *next; struct rb_root *root = &net->nexthop.rb_root; bool replace = !!(cfg->nlflags & NLM_F_REPLACE); bool create = !!(cfg->nlflags & NLM_F_CREATE); u32 new_id = new_nh->id; int replace_notify = 0; int rc = -EEXIST; pp = &root->rb_node; while (1) { struct nexthop *nh; next = *pp; if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (new_id < nh->id) { pp = &next->rb_left; } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; } goto out; } else { /* id already exists and not a replace */ goto out; } } if (replace && !create) { NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); rc = -ENOENT; goto out; } if (new_nh->is_group) { struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); struct nh_res_table *res_table; if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); /* Not passing the number of buckets is OK when * replacing, but not when creating a new group. */ if (!cfg->nh_grp_res_has_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); rc = -EINVAL; goto out; } nh_res_group_rebalance(nhg, res_table); /* Do not send bucket notifications, we do full * notification below. */ nh_res_table_upkeep(res_table, false, false); } } rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); /* The initial insertion is a full notification for hash-threshold as * well as resilient groups. */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); if (replace_notify && READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } return rc; } /* rtnl */ /* remove all nexthops tied to a device being deleted */ static void nexthop_flush_dev(struct net_device *dev, unsigned long event) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev != dev) continue; if (nhi->reject_nh && (event == NETDEV_DOWN || event == NETDEV_CHANGE)) continue; remove_nexthop(net, nhi->nh_parent, NULL); } } /* rtnl; called when net namespace is deleted */ static void flush_all_nexthops(struct net *net) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; struct nexthop *nh; while ((node = rb_first(root))) { nh = rb_entry(node, struct nexthop, rb_node); remove_nexthop(net, nh, NULL); cond_resched(); } } static struct nexthop *nexthop_create_group(struct net *net, struct nh_config *cfg) { struct nlattr *grps_attr = cfg->nh_grp; struct nexthop_grp *entry = nla_data(grps_attr); u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; int err; int i; if (WARN_ON(!num_nh)) return ERR_PTR(-EINVAL); nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nh->is_group = 1; nhg = nexthop_grp_alloc(num_nh); if (!nhg) { kfree(nh); return ERR_PTR(-ENOMEM); } /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); if (!nhg->spare) { kfree(nhg); kfree(nh); return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; for (i = 0; i < nhg->num_nh; ++i) { struct nexthop *nhe; struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); if (!nexthop_get(nhe)) { err = -ENOENT; goto out_no_nh; } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) nhg->has_v4 = true; nhg->nh_entries[i].stats = netdev_alloc_pcpu_stats(struct nh_grp_entry_stats); if (!nhg->nh_entries[i].stats) { err = -ENOMEM; nexthop_put(nhe); goto out_no_nh; } nhg->nh_entries[i].nh = nhe; nhg->nh_entries[i].weight = entry[i].weight + 1; list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->hash_threshold = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { struct nh_res_table *res_table; res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); if (!res_table) { err = -ENOMEM; goto out_no_nh; } rcu_assign_pointer(nhg->spare->res_table, res_table); rcu_assign_pointer(nhg->res_table, res_table); nhg->resilient = true; nhg->is_multipath = true; } WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); if (nhg->hash_threshold) nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; if (cfg->nh_hw_stats) nhg->hw_stats = true; rcu_assign_pointer(nh->nh_grp, nhg); return nh; out_no_nh: for (i--; i >= 0; --i) { list_del(&nhg->nh_entries[i].nh_list); free_percpu(nhg->nh_entries[i].stats); nexthop_put(nhg->nh_entries[i].nh); } kfree(nhg->spare); kfree(nhg); kfree(nh); return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib_nh *fib_nh = &nhi->fib_nh; struct fib_config fib_cfg = { .fc_oif = cfg->nh_ifindex, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { fib_nh_release(net, fib_nh); goto out; } if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } out: return err; } static int nh_create_ipv6(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib6_nh *fib6_nh = &nhi->fib6_nh; struct fib6_config fib6_cfg = { .fc_table = l3mdev_fib_table(cfg->dev), .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, }; int err; if (!ipv6_addr_any(&cfg->gw.ipv6)) fib6_cfg.fc_flags |= RTF_GATEWAY; /* sets nh_dev if successful */ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, extack); if (err) { /* IPv6 is not enabled, don't call fib6_nh_release */ if (err == -EAFNOSUPPORT) goto out; ipv6_stub->fib6_nh_release(fib6_nh); } else { nh->nh_flags = fib6_nh->fib_nh_flags; } out: return err; } static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; struct nexthop *nh; int err = 0; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); if (!nhi) { kfree(nh); return ERR_PTR(-ENOMEM); } nh->nh_flags = cfg->nh_flags; nh->net = net; nhi->nh_parent = nh; nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; } switch (cfg->nh_family) { case AF_INET: err = nh_create_ipv4(net, nh, nhi, cfg, extack); break; case AF_INET6: err = nh_create_ipv6(net, nh, nhi, cfg, extack); break; } if (err) { kfree(nhi); kfree(nh); return ERR_PTR(err); } /* add the entry to the device based hash */ if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); return nh; } /* called with rtnl lock held */ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nexthop *nh; int err; if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); return ERR_PTR(-EINVAL); } if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { NL_SET_ERR_MSG(extack, "No unused id"); return ERR_PTR(-EINVAL); } } if (cfg->nh_grp) nh = nexthop_create_group(net, cfg); else nh = nexthop_create(net, cfg, extack); if (IS_ERR(nh)) return nh; refcount_set(&nh->refcnt, 1); nh->id = cfg->nh_id; nh->protocol = cfg->nh_protocol; nh->net = net; err = insert_nexthop(net, nh, cfg, extack); if (err) { __remove_nexthop(net, nh, NULL); nexthop_put(nh); nh = ERR_PTR(err); } return nh; } static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, unsigned long *timer_p, bool *has_p, struct netlink_ext_ack *extack) { unsigned long timer; u32 value; if (!attr) { *timer_p = fallback; *has_p = false; return 0; } value = nla_get_u32(attr); timer = clock_t_to_jiffies(value); if (timer == ~0UL) { NL_SET_ERR_MSG(extack, "Timer value too large"); return -EINVAL; } *timer_p = timer; *has_p = true; return 0; } static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; int err; if (res) { err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_policy_new) - 1, res, rtm_nh_res_policy_new, extack); if (err < 0) return err; } if (tb[NHA_RES_GROUP_BUCKETS]) { cfg->nh_grp_res_num_buckets = nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); cfg->nh_grp_res_has_num_buckets = true; if (!cfg->nh_grp_res_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); return -EINVAL; } } err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], NH_RES_DEFAULT_IDLE_TIMER, &cfg->nh_grp_res_idle_timer, &cfg->nh_grp_res_has_idle_timer, extack); if (err) return err; return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], NH_RES_DEFAULT_UNBALANCED_TIMER, &cfg->nh_grp_res_unbalanced_timer, &cfg->nh_grp_res_has_unbalanced_timer, extack); } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; err = nlmsg_parse(nlh, sizeof(*nhm), tb, ARRAY_SIZE(rtm_nh_policy_new) - 1, rtm_nh_policy_new, extack); if (err < 0) return err; err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); goto out; } if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); goto out; } switch (nhm->nh_family) { case AF_INET: case AF_INET6: break; case AF_UNSPEC: if (tb[NHA_GROUP]) break; fallthrough; default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; cfg->nlinfo.nlh = nlh; cfg->nlinfo.nl_net = net; cfg->nh_family = nhm->nh_family; cfg->nh_protocol = nhm->nh_protocol; cfg->nh_flags = nhm->nh_flags; if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); if (tb[NHA_FDB]) { if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); goto out; } if (nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); goto out; } cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); } if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); goto out; } cfg->nh_grp = tb[NHA_GROUP]; cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; if (tb[NHA_GROUP_TYPE]) cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), cfg->nh_grp_type, extack); if (err) goto out; if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], cfg, extack); if (tb[NHA_HW_STATS_ENABLE]) cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]); /* no other attributes should be set */ goto out; } if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } cfg->nh_blackhole = 1; err = 0; goto out; } if (!cfg->nh_fdb && !tb[NHA_OIF]) { NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } if (!cfg->nh_fdb && tb[NHA_OIF]) { cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); if (cfg->nh_ifindex) cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); if (!cfg->dev) { NL_SET_ERR_MSG(extack, "Invalid device index"); goto out; } else if (!(cfg->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } else if (!netif_carrier_ok(cfg->dev)) { NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); err = -ENETDOWN; goto out; } } err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; switch (cfg->nh_family) { case AF_INET: if (nla_len(gwa) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv4 = nla_get_be32(gwa); break; case AF_INET6: if (nla_len(gwa) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv6 = nla_get_in6_addr(gwa); break; default: NL_SET_ERR_MSG(extack, "Unknown address family for gateway"); goto out; } } else { /* device only nexthop (no gateway) */ if (cfg->nh_flags & RTNH_F_ONLINK) { NL_SET_ERR_MSG(extack, "ONLINK flag can not be set for nexthop without a gateway"); goto out; } } if (tb[NHA_ENCAP]) { cfg->nh_encap = tb[NHA_ENCAP]; if (!tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); goto out; } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; } else if (tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); goto out; } if (tb[NHA_HW_STATS_ENABLE]) { NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops"); goto out; } err = 0; out: return err; } /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); if (!err) { nh = nexthop_add(net, &cfg, extack); if (IS_ERR(nh)) err = PTR_ERR(nh); } return err; } static int nh_valid_get_del_req(const struct nlmsghdr *nlh, struct nlattr **tb, u32 *id, u32 *op_flags, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); return -EINVAL; } *id = nla_get_u32(tb[NHA_ID]); if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (op_flags) { if (tb[NHA_OP_FLAGS]) *op_flags = nla_get_u32(tb[NHA_OP_FLAGS]); else *op_flags = 0; } return 0; } /* rtnl */ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)]; struct net *net = sock_net(skb->sk); struct nl_info nlinfo = { .nlh = nlh, .nl_net = net, .portid = NETLINK_CB(skb).portid, }; struct nexthop *nh; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack); if (err) return err; nh = nexthop_find_by_id(net, id); if (!nh) return -ENOENT; remove_nexthop(net, nh, &nlinfo); return 0; } /* rtnl */ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; struct net *net = sock_net(in_skb->sk); struct sk_buff *skb = NULL; struct nexthop *nh; u32 op_flags; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack); if (err) return err; err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto out; err = -ENOENT; nh = nexthop_find_by_id(net, id); if (!nh) goto errout_free; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, op_flags); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; errout_free: kfree_skb(skb); goto out; } struct nh_dump_filter { u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; u32 res_bucket_nh_id; u32 op_flags; }; static bool nh_dump_filtered(struct nexthop *nh, struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; if (filter->group_filter && !nh->is_group) return true; if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) return true; nhi = rtnl_dereference(nh->nh_info); if (family && nhi->family != family) return true; dev = nhi->fib_nhc.nhc_dev; if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); if (!master || master->ifindex != filter->master_idx) return true; } return false; } static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct nh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nhmsg *nhm; u32 idx; if (tb[NHA_OIF]) { idx = nla_get_u32(tb[NHA_OIF]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } filter->dev_idx = idx; } if (tb[NHA_MASTER]) { idx = nla_get_u32(tb[NHA_MASTER]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid master device index"); return -EINVAL; } filter->master_idx = idx; } filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); return -EINVAL; } return 0; } static int nh_valid_dump_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump) - 1, rtm_nh_policy_dump, cb->extack); if (err < 0) return err; if (tb[NHA_OP_FLAGS]) filter->op_flags = nla_get_u32(tb[NHA_OP_FLAGS]); else filter->op_flags = 0; return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_nh_ctx { u32 idx; }; static struct rtm_dump_nh_ctx * rtm_dump_nh_ctx(struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } static int rtm_dump_walk_nexthops(struct sk_buff *skb, struct netlink_callback *cb, struct rb_root *root, struct rtm_dump_nh_ctx *ctx, int (*nh_cb)(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data), void *data) { struct rb_node *node; int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); if (nh->id < s_idx) continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; } return 0; } static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_dump_filter *filter = data; if (nh_dump_filtered(nh, filter, nhm->nh_family)) return 0; return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags); } /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; struct nh_dump_filter filter = {}; int err; err = nh_valid_dump_req(cb->nlh, &filter, cb); if (err < 0) return err; err = rtm_dump_walk_nexthops(skb, cb, root, ctx, &rtm_dump_nexthop_cb, &filter); cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static struct nexthop * nexthop_find_group_resilient(struct net *net, u32 id, struct netlink_ext_ack *extack) { struct nh_group *nhg; struct nexthop *nh; nh = nexthop_find_by_id(net, id); if (!nh) return ERR_PTR(-ENOENT); if (!nh->is_group) { NL_SET_ERR_MSG(extack, "Not a nexthop group"); return ERR_PTR(-EINVAL); } nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) { NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); return ERR_PTR(-EINVAL); } return nh; } static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, struct netlink_ext_ack *extack) { u32 idx; if (attr) { idx = nla_get_u32(attr); if (!idx) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } *nh_id_p = idx; } else { *nh_id_p = 0; } return 0; } static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, rtm_nh_policy_dump_bucket, NULL); if (err < 0) return err; err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); if (err) return err; if (tb[NHA_RES_BUCKET]) { size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; err = nla_parse_nested(res_tb, max, tb[NHA_RES_BUCKET], rtm_nh_res_bucket_policy_dump, cb->extack); if (err < 0) return err; err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], &filter->res_bucket_nh_id, cb->extack); if (err) return err; } return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; }; static struct rtm_dump_res_bucket_ctx * rtm_dump_res_bucket_ctx(struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } struct rtm_dump_nexthop_bucket_data { struct rtm_dump_res_bucket_ctx *ctx; struct nh_dump_filter filter; }; static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, struct rtm_dump_nexthop_bucket_data *dd) { u32 portid = NETLINK_CB(cb->skb).portid; struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_res_table *res_table; struct nh_group *nhg; u16 bucket_index; int err; nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; bucket_index < res_table->num_nh_buckets; bucket_index++) { struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; bucket = &res_table->nh_buckets[bucket_index]; nhge = rtnl_dereference(bucket->nh_entry); if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) continue; if (dd->filter.res_bucket_nh_id && dd->filter.res_bucket_nh_id != nhge->nh->id) continue; dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); if (err) return err; } dd->ctx->bucket_index = 0; return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct rtm_dump_nexthop_bucket_data *dd = data; struct nh_group *nhg; if (!nh->is_group) return 0; nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) return 0; return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); } /* rtnl */ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; struct net *net = sock_net(skb->sk); struct nexthop *nh; int err; err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); if (err) return err; if (dd.filter.nh_id) { nh = nexthop_find_group_resilient(net, dd.filter.nh_id, cb->extack); if (IS_ERR(nh)) return PTR_ERR(nh); err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); } else { struct rb_root *root = &net->nexthop.rb_root; err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, &rtm_dump_nexthop_bucket_cb, &dd); } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; int err; err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, res, rtm_nh_res_bucket_policy_get, extack); if (err < 0) return err; if (!tb[NHA_RES_BUCKET_INDEX]) { NL_SET_ERR_MSG(extack, "Bucket index is missing"); return -EINVAL; } *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); return 0; } static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, u32 *id, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, rtm_nh_policy_get_bucket, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, id, NULL, extack); if (err) return err; if (!tb[NHA_RES_BUCKET]) { NL_SET_ERR_MSG(extack, "Bucket information is missing"); return -EINVAL; } err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], bucket_index, extack); if (err) return err; return 0; } /* rtnl */ static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nh_res_table *res_table; struct sk_buff *skb = NULL; struct nh_group *nhg; struct nexthop *nh; u16 bucket_index; int err; u32 id; err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); if (err) return err; nh = nexthop_find_group_resilient(net, id, extack); if (IS_ERR(nh)) return PTR_ERR(nh); nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); if (bucket_index >= res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); return -ENOENT; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], bucket_index, RTM_NEWNEXTHOPBUCKET, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, extack); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: kfree_skb(skb); return err; } static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev == dev) { if (nhi->family == AF_INET) fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu); } } } /* rtnl */ static int nh_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_info_ext *info_ext; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: info_ext = ptr; nexthop_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(dev_net(dev)); break; } return NOTIFY_DONE; } static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; static int nexthops_dump(struct net *net, struct notifier_block *nb, enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; int err = 0; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } return err; } int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; rtnl_lock(); err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, nb); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(register_nexthop_notifier); int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); if (!err) nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); return err; } EXPORT_SYMBOL(__unregister_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = __unregister_nexthop_notifier(net, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) { struct nexthop *nexthop; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop) goto out; nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) nexthop->nh_flags |= RTNH_F_OFFLOAD; if (trap) nexthop->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_set_hw_flags); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap) { struct nh_res_table *res_table; struct nh_res_bucket *bucket; struct nexthop *nexthop; struct nh_group *nhg; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; if (bucket_index >= nhg->res_table->num_nh_buckets) goto out; res_table = rcu_dereference(nhg->res_table); bucket = &res_table->nh_buckets[bucket_index]; bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) bucket->nh_flags |= RTNH_F_OFFLOAD; if (trap) bucket->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity) { struct nh_res_table *res_table; struct nexthop *nexthop; struct nh_group *nhg; u16 i; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; /* Instead of silently ignoring some buckets, demand that the sizes * be the same. */ res_table = rcu_dereference(nhg->res_table); if (num_buckets != res_table->num_nh_buckets) goto out; for (i = 0; i < num_buckets; i++) { if (test_bit(i, activity)) nh_res_bucket_set_busy(&res_table->nh_buckets[i]); } out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_res_grp_activity_update); static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) flush_all_nexthops(net); } static void __net_exit nexthop_net_exit(struct net *net) { kfree(net->nexthop.devhash); net->nexthop.devhash = NULL; } static int __net_init nexthop_net_init(struct net *net) { size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; net->nexthop.rb_root = RB_ROOT; net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit = nexthop_net_exit, .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, }; static int __init nexthop_init(void) { register_pernet_subsys(&nexthop_net_ops); register_netdevice_notifier(&nh_netdev_notifier); rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, rtm_dump_nexthop, 0); rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket, rtm_dump_nexthop_bucket, 0); return 0; } subsys_initcall(nexthop_init); |
6 1 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | // SPDX-License-Identifier: GPL-2.0-only /* * ebt_mark * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * July, 2002 * */ /* The mark target can be used in any chain, * I believe adding a mangle table just for marking is total overkill. * Marking a frame doesn't really change anything in the frame anyway. */ #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_mark_t.h> static unsigned int ebt_mark_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_mark_t_info *info = par->targinfo; int action = info->target & -16; if (action == MARK_SET_VALUE) skb->mark = info->mark; else if (action == MARK_OR_VALUE) skb->mark |= info->mark; else if (action == MARK_AND_VALUE) skb->mark &= info->mark; else skb->mark ^= info->mark; return info->target | ~EBT_VERDICT_BITS; } static int ebt_mark_tg_check(const struct xt_tgchk_param *par) { const struct ebt_mark_t_info *info = par->targinfo; int tmp; tmp = info->target | ~EBT_VERDICT_BITS; if (BASE_CHAIN && tmp == EBT_RETURN) return -EINVAL; if (ebt_invalid_target(tmp)) return -EINVAL; tmp = info->target & ~EBT_VERDICT_BITS; if (tmp != MARK_SET_VALUE && tmp != MARK_OR_VALUE && tmp != MARK_AND_VALUE && tmp != MARK_XOR_VALUE) return -EINVAL; return 0; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ebt_mark_t_info { compat_ulong_t mark; compat_uint_t target; }; static void mark_tg_compat_from_user(void *dst, const void *src) { const struct compat_ebt_mark_t_info *user = src; struct ebt_mark_t_info *kern = dst; kern->mark = user->mark; kern->target = user->target; } static int mark_tg_compat_to_user(void __user *dst, const void *src) { struct compat_ebt_mark_t_info __user *user = dst; const struct ebt_mark_t_info *kern = src; if (put_user(kern->mark, &user->mark) || put_user(kern->target, &user->target)) return -EFAULT; return 0; } #endif static struct xt_target ebt_mark_tg_reg __read_mostly = { .name = "mark", .revision = 0, .family = NFPROTO_BRIDGE, .target = ebt_mark_tg, .checkentry = ebt_mark_tg_check, .targetsize = sizeof(struct ebt_mark_t_info), #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ebt_mark_t_info), .compat_from_user = mark_tg_compat_from_user, .compat_to_user = mark_tg_compat_to_user, #endif .me = THIS_MODULE, }; static int __init ebt_mark_init(void) { return xt_register_target(&ebt_mark_tg_reg); } static void __exit ebt_mark_fini(void) { xt_unregister_target(&ebt_mark_tg_reg); } module_init(ebt_mark_init); module_exit(ebt_mark_fini); MODULE_DESCRIPTION("Ebtables: Packet mark modification"); MODULE_LICENSE("GPL"); |
438 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PIPE_FS_I_H #define _LINUX_PIPE_FS_I_H #define PIPE_DEF_BUFFERS 16 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ #define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */ #define PIPE_BUF_FLAG_CAN_MERGE 0x10 /* can merge buffers */ #define PIPE_BUF_FLAG_WHOLE 0x20 /* read() must return entire buffer or error */ #ifdef CONFIG_WATCH_QUEUE #define PIPE_BUF_FLAG_LOSS 0x40 /* Message loss happened after this buffer */ #endif /** * struct pipe_buffer - a linux kernel pipe buffer * @page: the page containing the data for the pipe buffer * @offset: offset of data inside the @page * @len: length of data inside the @page * @ops: operations associated with this buffer. See @pipe_buf_operations. * @flags: pipe buffer flags. See above. * @private: private data owned by the ops. **/ struct pipe_buffer { struct page *page; unsigned int offset, len; const struct pipe_buf_operations *ops; unsigned int flags; unsigned long private; }; /** * struct pipe_inode_info - a linux kernel pipe * @mutex: mutex protecting the whole thing * @rd_wait: reader wait point in case of empty pipe * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs * @tmp_page: cached released page * @readers: number of current readers of this pipe * @writers: number of current writers of this pipe * @files: number of struct file referring this pipe (protected by ->i_lock) * @r_counter: reader counter * @w_counter: writer counter * @poll_usage: is this pipe used for epoll, which has crazy wakeups? * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers * @user: the user who created this pipe * @watch_queue: If this pipe is a watch_queue, this is the stuff for that **/ struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; unsigned int head; unsigned int tail; unsigned int max_usage; unsigned int ring_size; unsigned int nr_accounted; unsigned int readers; unsigned int writers; unsigned int files; unsigned int r_counter; unsigned int w_counter; bool poll_usage; #ifdef CONFIG_WATCH_QUEUE bool note_loss; #endif struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; struct user_struct *user; #ifdef CONFIG_WATCH_QUEUE struct watch_queue *watch_queue; #endif }; /* * Note on the nesting of these functions: * * ->confirm() * ->try_steal() * * That is, ->try_steal() must be called on a confirmed buffer. See below for * the meaning of each operation. Also see the kerneldoc in fs/pipe.c for the * pipe and generic variants of these hooks. */ struct pipe_buf_operations { /* * ->confirm() verifies that the data in the pipe buffer is there * and that the contents are good. If the pages in the pipe belong * to a file system, we may need to wait for IO completion in this * hook. Returns 0 for good, or a negative error value in case of * error. If not present all pages are considered good. */ int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *); /* * When the contents of this pipe buffer has been completely * consumed by a reader, ->release() is called. */ void (*release)(struct pipe_inode_info *, struct pipe_buffer *); /* * Attempt to take ownership of the pipe buffer and its contents. * ->try_steal() returns %true for success, in which case the contents * of the pipe (the buf->page) is locked and now completely owned by the * caller. The page may then be transferred to a different mapping, the * most often used case is insertion into different file address space * cache. */ bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *); /* * Get a reference to the pipe buffer. */ bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; /** * pipe_has_watch_queue - Check whether the pipe is a watch_queue, * i.e. it was created with O_NOTIFICATION_PIPE * @pipe: The pipe to check * * Return: true if pipe is a watch queue, false otherwise. */ static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe) { #ifdef CONFIG_WATCH_QUEUE return pipe->watch_queue != NULL; #else return false; #endif } /** * pipe_empty - Return true if the pipe is empty * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ static inline bool pipe_empty(unsigned int head, unsigned int tail) { return head == tail; } /** * pipe_occupancy - Return number of slots used in the pipe * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { return head - tail; } /** * pipe_full - Return true if the pipe is full * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer * @limit: The maximum amount of slots available. */ static inline bool pipe_full(unsigned int head, unsigned int tail, unsigned int limit) { return pipe_occupancy(head, tail) >= limit; } /** * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring * @pipe: The pipe to access * @slot: The slot of interest */ static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, unsigned int slot) { return &pipe->bufs[slot & (pipe->ring_size - 1)]; } /** * pipe_head_buf - Return the pipe buffer at the head of the pipe ring * @pipe: The pipe to access */ static inline struct pipe_buffer *pipe_head_buf(const struct pipe_inode_info *pipe) { return pipe_buf(pipe, pipe->head); } /** * pipe_buf_get - get a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to * * Return: %true if the reference was successfully obtained. */ static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return buf->ops->get(pipe, buf); } /** * pipe_buf_release - put a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to put a reference to */ static inline void pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { const struct pipe_buf_operations *ops = buf->ops; buf->ops = NULL; ops->release(pipe, buf); } /** * pipe_buf_confirm - verify contents of the pipe buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to confirm */ static inline int pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!buf->ops->confirm) return 0; return buf->ops->confirm(pipe, buf); } /** * pipe_buf_try_steal - attempt to take ownership of a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal */ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!buf->ops->try_steal) return false; return buf->ops->try_steal(pipe, buf); } static inline void pipe_discard_from(struct pipe_inode_info *pipe, unsigned int old_head) { unsigned int mask = pipe->ring_size - 1; while (pipe->head > old_head) pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]); } /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ #define PIPE_SIZE PAGE_SIZE /* Pipe lock and unlock operations */ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); /* Wait for a pipe to be readable/writable while dropping the pipe lock */ void pipe_wait_readable(struct pipe_inode_info *); void pipe_wait_writable(struct pipe_inode_info *); struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); bool generic_pipe_buf_try_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new); bool too_many_pipe_buffers_soft(unsigned long user_bufs); bool too_many_pipe_buffers_hard(unsigned long user_bufs); bool pipe_is_unprivileged_user(void); /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots); long pipe_fcntl(struct file *, unsigned int, unsigned int arg); struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice); int create_pipe_files(struct file **, int); unsigned int round_pipe_size(unsigned int size); #endif |
30 5 69 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H /* * If more than 16 keys are ever supported, a thorough audit * will be necessary to ensure that the types that store key * numbers and masks have sufficient capacity. */ #define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { return cpu_feature_enabled(X86_FEATURE_OSPKE); } /* * Try to dedicate one of the protection keys to be used as an * execute-only protection key. */ extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); } extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey); static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return 0; return __arch_override_mprotect_pkey(vma, prot, pkey); } #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) #define mm_set_pkey_allocated(mm, pkey) do { \ mm_pkey_allocation_map(mm) |= (1U << pkey); \ } while (0) #define mm_set_pkey_free(mm, pkey) do { \ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ } while (0) static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned * from pkey_alloc() or pkey 0 which is allocated * implicitly when the mm is created. */ if (pkey < 0) return false; if (pkey >= arch_max_pkey()) return false; /* * The exec-only pkey is set in the allocation map, but * is not available to any of the user interfaces like * mprotect_pkey(). */ if (pkey == mm->context.execute_only_pkey) return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } /* * Returns a positive, 4-bit key on success, or -1 on failure. */ static inline int mm_pkey_alloc(struct mm_struct *mm) { /* * Note: this is the one and only place we make sure * that the pkey is valid as far as the hardware is * concerned. The rest of the kernel trusts that * only good, valid pkeys come out of here. */ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); int ret; /* * Are we out of pkeys? We must handle this specially * because ffz() behavior is undefined if there are no * zeros. */ if (mm_pkey_allocation_map(mm) == all_pkeys_mask) return -1; ret = ffz(mm_pkey_allocation_map(mm)); mm_set_pkey_allocated(mm, ret); return ret; } static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; mm_set_pkey_free(mm, pkey); return 0; } static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3; return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; } #endif /*_ASM_X86_PKEYS_H */ |
2 2 2 1 1 1 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 | // SPDX-License-Identifier: GPL-2.0-only /* * QNX6 file system, Linux implementation. * * Version : 1.0.0 * * History : * * 01-02-2012 by Kai Bankett (chaosman@ontika.net) : first release. * 16-02-2012 pagemap extension by Al Viro * */ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/crc32.h> #include <linux/mpage.h> #include <linux/fs_parser.h> #include <linux/fs_context.h> #include "qnx6.h" static const struct super_operations qnx6_sops; static void qnx6_put_super(struct super_block *sb); static struct inode *qnx6_alloc_inode(struct super_block *sb); static void qnx6_free_inode(struct inode *inode); static int qnx6_reconfigure(struct fs_context *fc); static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf); static int qnx6_show_options(struct seq_file *seq, struct dentry *root); static const struct super_operations qnx6_sops = { .alloc_inode = qnx6_alloc_inode, .free_inode = qnx6_free_inode, .put_super = qnx6_put_super, .statfs = qnx6_statfs, .show_options = qnx6_show_options, }; static int qnx6_show_options(struct seq_file *seq, struct dentry *root) { struct super_block *sb = root->d_sb; struct qnx6_sb_info *sbi = QNX6_SB(sb); if (sbi->s_mount_opt & QNX6_MOUNT_MMI_FS) seq_puts(seq, ",mmi_fs"); return 0; } static int qnx6_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; sync_filesystem(sb); fc->sb_flags |= SB_RDONLY; return 0; } static unsigned qnx6_get_devblock(struct super_block *sb, __fs32 block) { struct qnx6_sb_info *sbi = QNX6_SB(sb); return fs32_to_cpu(sbi, block) + sbi->s_blks_off; } static unsigned qnx6_block_map(struct inode *inode, unsigned iblock); static int qnx6_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { unsigned phys; pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n", inode->i_ino, (unsigned long)iblock); phys = qnx6_block_map(inode, iblock); if (phys) { /* logical block is before EOF */ map_bh(bh, inode->i_sb, phys); } return 0; } static int qnx6_check_blockptr(__fs32 ptr) { if (ptr == ~(__fs32)0) { pr_err("hit unused blockpointer.\n"); return 0; } return 1; } static int qnx6_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, qnx6_get_block); } static void qnx6_readahead(struct readahead_control *rac) { mpage_readahead(rac, qnx6_get_block); } /* * returns the block number for the no-th element in the tree * inodebits requred as there are multiple inodes in one inode block */ static unsigned qnx6_block_map(struct inode *inode, unsigned no) { struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); struct qnx6_inode_info *ei = QNX6_I(inode); unsigned block = 0; struct buffer_head *bh; __fs32 ptr; int levelptr; int ptrbits = sbi->s_ptrbits; int bitdelta; u32 mask = (1 << ptrbits) - 1; int depth = ei->di_filelevels; int i; bitdelta = ptrbits * depth; levelptr = no >> bitdelta; if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { pr_err("Requested file block number (%u) too big.", no); return 0; } block = qnx6_get_devblock(s, ei->di_block_ptr[levelptr]); for (i = 0; i < depth; i++) { bh = sb_bread(s, block); if (!bh) { pr_err("Error reading block (%u)\n", block); return 0; } bitdelta -= ptrbits; levelptr = (no >> bitdelta) & mask; ptr = ((__fs32 *)bh->b_data)[levelptr]; if (!qnx6_check_blockptr(ptr)) return 0; block = qnx6_get_devblock(s, ptr); brelse(bh); } return block; } static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct qnx6_sb_info *sbi = QNX6_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = fs32_to_cpu(sbi, sbi->sb->sb_num_blocks); buf->f_bfree = fs32_to_cpu(sbi, sbi->sb->sb_free_blocks); buf->f_files = fs32_to_cpu(sbi, sbi->sb->sb_num_inodes); buf->f_ffree = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX6_LONG_NAME_MAX; buf->f_fsid = u64_to_fsid(id); return 0; } /* * Check the root directory of the filesystem to make sure * it really _is_ a qnx6 filesystem, and to check the size * of the directory entry. */ static const char *qnx6_checkroot(struct super_block *s) { static char match_root[2][3] = {".\0\0", "..\0"}; int i, error = 0; struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; struct page *page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return "error reading root directory"; kmap(page); dir_entry = page_address(page); for (i = 0; i < 2; i++) { /* maximum 3 bytes - due to match_root limitation */ if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) error = 1; } qnx6_put_page(page); if (error) return "error reading root directory."; return NULL; } #ifdef CONFIG_QNX6FS_DEBUG void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s) { struct qnx6_sb_info *sbi = QNX6_SB(s); pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic)); pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum)); pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial)); pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags)); pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize)); pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes)); pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes)); pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks)); pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks)); pr_debug("inode_levels: %02x\n", sb->Inode.levels); } #endif enum { Opt_mmifs }; struct qnx6_context { unsigned long s_mount_opts; }; static const struct fs_parameter_spec qnx6_param_spec[] = { fsparam_flag ("mmi_fs", Opt_mmifs), {} }; static int qnx6_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct qnx6_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, qnx6_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_mmifs: ctx->s_mount_opts |= QNX6_MOUNT_MMI_FS; break; default: return -EINVAL; } return 0; } static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, int offset, int silent) { struct qnx6_sb_info *sbi = QNX6_SB(s); struct buffer_head *bh; struct qnx6_super_block *sb; /* Check the superblock signatures start with the first superblock */ bh = sb_bread(s, offset); if (!bh) { pr_err("unable to read the first superblock\n"); return NULL; } sb = (struct qnx6_super_block *)bh->b_data; if (fs32_to_cpu(sbi, sb->sb_magic) != QNX6_SUPER_MAGIC) { sbi->s_bytesex = BYTESEX_BE; if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { /* we got a big endian fs */ pr_debug("fs got different endianness.\n"); return bh; } else sbi->s_bytesex = BYTESEX_LE; if (!silent) { if (offset == 0) { pr_err("wrong signature (magic) in superblock #1.\n"); } else { pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n", offset * s->s_blocksize); } } brelse(bh); return NULL; } return bh; } static struct inode *qnx6_private_inode(struct super_block *s, struct qnx6_root_node *p); static int qnx6_fill_super(struct super_block *s, struct fs_context *fc) { struct buffer_head *bh1 = NULL, *bh2 = NULL; struct qnx6_super_block *sb1 = NULL, *sb2 = NULL; struct qnx6_sb_info *sbi; struct qnx6_context *ctx = fc->fs_private; struct inode *root; const char *errmsg; struct qnx6_sb_info *qs; int ret = -EINVAL; u64 offset; int bootblock_offset = QNX6_BOOTBLOCK_SIZE; int silent = fc->sb_flags & SB_SILENT; qs = kzalloc(sizeof(struct qnx6_sb_info), GFP_KERNEL); if (!qs) return -ENOMEM; s->s_fs_info = qs; qs->s_mount_opt = ctx->s_mount_opts; /* Superblock always is 512 Byte long */ if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { pr_err("unable to set blocksize\n"); goto outnobh; } if (qs->s_mount_opt == QNX6_MOUNT_MMI_FS) { sb1 = qnx6_mmi_fill_super(s, silent); if (sb1) goto mmi_success; else goto outnobh; } sbi = QNX6_SB(s); sbi->s_bytesex = BYTESEX_LE; /* Check the superblock signatures start with the first superblock */ bh1 = qnx6_check_first_superblock(s, bootblock_offset / QNX6_SUPERBLOCK_SIZE, silent); if (!bh1) { /* try again without bootblock offset */ bh1 = qnx6_check_first_superblock(s, 0, silent); if (!bh1) { pr_err("unable to read the first superblock\n"); goto outnobh; } /* seems that no bootblock at partition start */ bootblock_offset = 0; } sb1 = (struct qnx6_super_block *)bh1->b_data; #ifdef CONFIG_QNX6FS_DEBUG qnx6_superblock_debug(sb1, s); #endif /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { pr_err("superblock #1 checksum error\n"); goto out; } /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ brelse(bh1); bh1 = sb_bread(s, bootblock_offset >> s->s_blocksize_bits); if (!bh1) goto outnobh; sb1 = (struct qnx6_super_block *)bh1->b_data; /* calculate second superblock blocknumber */ offset = fs32_to_cpu(sbi, sb1->sb_num_blocks) + (bootblock_offset >> s->s_blocksize_bits) + (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); /* set bootblock offset */ sbi->s_blks_off = (bootblock_offset >> s->s_blocksize_bits) + (QNX6_SUPERBLOCK_AREA >> s->s_blocksize_bits); /* next the second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { pr_err("superblock #2 checksum error\n"); goto out; } if (fs64_to_cpu(sbi, sb1->sb_serial) >= fs64_to_cpu(sbi, sb2->sb_serial)) { /* superblock #1 active */ sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); pr_info("superblock #2 active\n"); } mmi_success: /* sanity check - limit maximum indirect pointer levels */ if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { pr_err("too many inode levels (max %i, sb %i)\n", QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); goto out; } if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { pr_err("too many longfilename levels (max %i, sb %i)\n", QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); goto out; } s->s_op = &qnx6_sops; s->s_magic = QNX6_SUPER_MAGIC; s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ s->s_time_min = 0; s->s_time_max = U32_MAX; /* ease the later tree level calculations */ sbi = QNX6_SB(s); sbi->s_ptrbits = ilog2(s->s_blocksize / 4); sbi->inodes = qnx6_private_inode(s, &sb1->Inode); if (!sbi->inodes) goto out; sbi->longfile = qnx6_private_inode(s, &sb1->Longfile); if (!sbi->longfile) goto out1; /* prefetch root inode */ root = qnx6_iget(s, QNX6_ROOT_INO); if (IS_ERR(root)) { pr_err("get inode failed\n"); ret = PTR_ERR(root); goto out2; } ret = -ENOMEM; s->s_root = d_make_root(root); if (!s->s_root) goto out2; ret = -EINVAL; errmsg = qnx6_checkroot(s); if (errmsg != NULL) { if (!silent) pr_err("%s\n", errmsg); goto out3; } return 0; out3: dput(s->s_root); s->s_root = NULL; out2: iput(sbi->longfile); out1: iput(sbi->inodes); out: brelse(bh1); brelse(bh2); outnobh: kfree(qs); s->s_fs_info = NULL; return ret; } static void qnx6_put_super(struct super_block *sb) { struct qnx6_sb_info *qs = QNX6_SB(sb); brelse(qs->sb_buf); iput(qs->longfile); iput(qs->inodes); kfree(qs); sb->s_fs_info = NULL; return; } static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, qnx6_get_block); } static const struct address_space_operations qnx6_aops = { .read_folio = qnx6_read_folio, .readahead = qnx6_readahead, .bmap = qnx6_bmap }; static struct inode *qnx6_private_inode(struct super_block *s, struct qnx6_root_node *p) { struct inode *inode = new_inode(s); if (inode) { struct qnx6_inode_info *ei = QNX6_I(inode); struct qnx6_sb_info *sbi = QNX6_SB(s); inode->i_size = fs64_to_cpu(sbi, p->size); memcpy(ei->di_block_ptr, p->ptr, sizeof(p->ptr)); ei->di_filelevels = p->levels; inode->i_mode = S_IFREG | S_IRUSR; /* probably wrong */ inode->i_mapping->a_ops = &qnx6_aops; } return inode; } struct inode *qnx6_iget(struct super_block *sb, unsigned ino) { struct qnx6_sb_info *sbi = QNX6_SB(sb); struct qnx6_inode_entry *raw_inode; struct inode *inode; struct qnx6_inode_info *ei; struct address_space *mapping; struct page *page; u32 n, offs; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; ei = QNX6_I(inode); inode->i_mode = 0; if (ino == 0) { pr_err("bad inode number on dev %s: %u is out of range\n", sb->s_id, ino); iget_failed(inode); return ERR_PTR(-EIO); } n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS); offs = (ino - 1) & (~PAGE_MASK >> QNX6_INODE_SIZE_BITS); mapping = sbi->inodes->i_mapping; page = read_mapping_page(mapping, n, NULL); if (IS_ERR(page)) { pr_err("major problem: unable to read inode from dev %s\n", sb->s_id); iget_failed(inode); return ERR_CAST(page); } kmap(page); raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs; inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode); i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid)); inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size); inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->di_mtime), 0); inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->di_atime), 0); inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0); /* calc blocks based on 512 byte blocksize */ inode->i_blocks = (inode->i_size + 511) >> 9; memcpy(&ei->di_block_ptr, &raw_inode->di_block_ptr, sizeof(raw_inode->di_block_ptr)); ei->di_filelevels = raw_inode->di_filelevels; if (S_ISREG(inode->i_mode)) { inode->i_fop = &generic_ro_fops; inode->i_mapping->a_ops = &qnx6_aops; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &qnx6_dir_inode_operations; inode->i_fop = &qnx6_dir_operations; inode->i_mapping->a_ops = &qnx6_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx6_aops; } else init_special_inode(inode, inode->i_mode, 0); qnx6_put_page(page); unlock_new_inode(inode); return inode; } static struct kmem_cache *qnx6_inode_cachep; static struct inode *qnx6_alloc_inode(struct super_block *sb) { struct qnx6_inode_info *ei; ei = alloc_inode_sb(sb, qnx6_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void qnx6_free_inode(struct inode *inode) { kmem_cache_free(qnx6_inode_cachep, QNX6_I(inode)); } static void init_once(void *foo) { struct qnx6_inode_info *ei = (struct qnx6_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int init_inodecache(void) { qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(qnx6_inode_cachep); } static int qnx6_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, qnx6_fill_super); } static void qnx6_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations qnx6_context_ops = { .parse_param = qnx6_parse_param, .get_tree = qnx6_get_tree, .reconfigure = qnx6_reconfigure, .free = qnx6_free_fc, }; static int qnx6_init_fs_context(struct fs_context *fc) { struct qnx6_context *ctx; ctx = kzalloc(sizeof(struct qnx6_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->ops = &qnx6_context_ops; fc->fs_private = ctx; return 0; } static struct file_system_type qnx6_fs_type = { .owner = THIS_MODULE, .name = "qnx6", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = qnx6_init_fs_context, .parameters = qnx6_param_spec, }; MODULE_ALIAS_FS("qnx6"); static int __init init_qnx6_fs(void) { int err; err = init_inodecache(); if (err) return err; err = register_filesystem(&qnx6_fs_type); if (err) { destroy_inodecache(); return err; } pr_info("QNX6 filesystem 1.0.0 registered.\n"); return 0; } static void __exit exit_qnx6_fs(void) { unregister_filesystem(&qnx6_fs_type); destroy_inodecache(); } module_init(init_qnx6_fs) module_exit(exit_qnx6_fs) MODULE_LICENSE("GPL"); |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "checksum.h" #include "errcode.h" #include "super.h" #include "super-io.h" #include <linux/crc32c.h> #include <linux/crypto.h> #include <linux/xxhash.h> #include <linux/key.h> #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/algapi.h> #include <crypto/chacha.h> #include <crypto/hash.h> #include <crypto/poly1305.h> #include <crypto/skcipher.h> #include <keys/user-type.h> /* * bch2_checksum state is an abstraction of the checksum state calculated over different pages. * it features page merging without having the checksum algorithm lose its state. * for native checksum aglorithms (like crc), a default seed value will do. * for hash-like algorithms, a state needs to be stored */ struct bch2_checksum_state { union { u64 seed; struct xxh64_state h64state; }; unsigned int type; }; static void bch2_checksum_init(struct bch2_checksum_state *state) { switch (state->type) { case BCH_CSUM_none: case BCH_CSUM_crc32c: case BCH_CSUM_crc64: state->seed = 0; break; case BCH_CSUM_crc32c_nonzero: state->seed = U32_MAX; break; case BCH_CSUM_crc64_nonzero: state->seed = U64_MAX; break; case BCH_CSUM_xxhash: xxh64_reset(&state->h64state, 0); break; default: BUG(); } } static u64 bch2_checksum_final(const struct bch2_checksum_state *state) { switch (state->type) { case BCH_CSUM_none: case BCH_CSUM_crc32c: case BCH_CSUM_crc64: return state->seed; case BCH_CSUM_crc32c_nonzero: return state->seed ^ U32_MAX; case BCH_CSUM_crc64_nonzero: return state->seed ^ U64_MAX; case BCH_CSUM_xxhash: return xxh64_digest(&state->h64state); default: BUG(); } } static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) { switch (state->type) { case BCH_CSUM_none: return; case BCH_CSUM_crc32c_nonzero: case BCH_CSUM_crc32c: state->seed = crc32c(state->seed, data, len); break; case BCH_CSUM_crc64_nonzero: case BCH_CSUM_crc64: state->seed = crc64_be(state->seed, data, len); break; case BCH_CSUM_xxhash: xxh64_update(&state->h64state, data, len); break; default: BUG(); } } static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct nonce nonce, struct scatterlist *sg, size_t len) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); int ret; skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ret = crypto_skcipher_encrypt(req); if (ret) pr_err("got error %i from crypto_skcipher_encrypt()", ret); return ret; } static inline int do_encrypt(struct crypto_sync_skcipher *tfm, struct nonce nonce, void *buf, size_t len) { if (!is_vmalloc_addr(buf)) { struct scatterlist sg; sg_init_table(&sg, 1); sg_set_page(&sg, is_vmalloc_addr(buf) ? vmalloc_to_page(buf) : virt_to_page(buf), len, offset_in_page(buf)); return do_encrypt_sg(tfm, nonce, &sg, len); } else { unsigned pages = buf_pages(buf, len); struct scatterlist *sg; size_t orig_len = len; int ret, i; sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); if (!sg) return -BCH_ERR_ENOMEM_do_encrypt; sg_init_table(sg, pages); for (i = 0; i < pages; i++) { unsigned offset = offset_in_page(buf); unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset); sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); buf += pg_len; len -= pg_len; } ret = do_encrypt_sg(tfm, nonce, sg, orig_len); kfree(sg); return ret; } } int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, void *buf, size_t len) { struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); int ret; ret = PTR_ERR_OR_ZERO(chacha20); if (ret) { pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret)); return ret; } ret = crypto_skcipher_setkey(&chacha20->base, (void *) key, sizeof(*key)); if (ret) { pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret)); goto err; } ret = do_encrypt(chacha20, nonce, buf, len); err: crypto_free_sync_skcipher(chacha20); return ret; } static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, struct nonce nonce) { u8 key[POLY1305_KEY_SIZE]; int ret; nonce.d[3] ^= BCH_NONCE_POLY; memset(key, 0, sizeof(key)); ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); if (ret) return ret; desc->tfm = c->poly1305; crypto_shash_init(desc); crypto_shash_update(desc, key, sizeof(key)); return 0; } struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, struct nonce nonce, const void *data, size_t len) { switch (type) { case BCH_CSUM_none: case BCH_CSUM_crc32c_nonzero: case BCH_CSUM_crc64_nonzero: case BCH_CSUM_crc32c: case BCH_CSUM_xxhash: case BCH_CSUM_crc64: { struct bch2_checksum_state state; state.type = type; bch2_checksum_init(&state); bch2_checksum_update(&state, data, len); return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } case BCH_CSUM_chacha20_poly1305_80: case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; gen_poly_key(c, desc, nonce); crypto_shash_update(desc, data, len); crypto_shash_final(desc, digest); memcpy(&ret, digest, bch_crc_bytes[type]); return ret; } default: BUG(); } } int bch2_encrypt(struct bch_fs *c, unsigned type, struct nonce nonce, void *data, size_t len) { if (!bch2_csum_type_is_encryption(type)) return 0; return do_encrypt(c->chacha20, nonce, data, len); } static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, struct nonce nonce, struct bio *bio, struct bvec_iter *iter) { struct bio_vec bv; switch (type) { case BCH_CSUM_none: return (struct bch_csum) { 0 }; case BCH_CSUM_crc32c_nonzero: case BCH_CSUM_crc64_nonzero: case BCH_CSUM_crc32c: case BCH_CSUM_xxhash: case BCH_CSUM_crc64: { struct bch2_checksum_state state; state.type = type; bch2_checksum_init(&state); #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; bch2_checksum_update(&state, p, bv.bv_len); kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); #endif return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } case BCH_CSUM_chacha20_poly1305_80: case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; gen_poly_key(c, desc, nonce); #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; crypto_shash_update(desc, p, bv.bv_len); kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) crypto_shash_update(desc, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); #endif crypto_shash_final(desc, digest); memcpy(&ret, digest, bch_crc_bytes[type]); return ret; } default: BUG(); } } struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, struct nonce nonce, struct bio *bio) { struct bvec_iter iter = bio->bi_iter; return __bch2_checksum_bio(c, type, nonce, bio, &iter); } int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, struct nonce nonce, struct bio *bio) { struct bio_vec bv; struct bvec_iter iter; struct scatterlist sgl[16], *sg = sgl; size_t bytes = 0; int ret = 0; if (!bch2_csum_type_is_encryption(type)) return 0; sg_init_table(sgl, ARRAY_SIZE(sgl)); bio_for_each_segment(bv, bio, iter) { if (sg == sgl + ARRAY_SIZE(sgl)) { sg_mark_end(sg - 1); ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); if (ret) return ret; nonce = nonce_add(nonce, bytes); bytes = 0; sg_init_table(sgl, ARRAY_SIZE(sgl)); sg = sgl; } sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); bytes += bv.bv_len; } sg_mark_end(sg - 1); return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); } struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, struct bch_csum b, size_t b_len) { struct bch2_checksum_state state; state.type = type; bch2_checksum_init(&state); state.seed = le64_to_cpu(a.lo); BUG_ON(!bch2_checksum_mergeable(type)); while (b_len) { unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE); bch2_checksum_update(&state, page_address(ZERO_PAGE(0)), page_len); b_len -= page_len; } a.lo = cpu_to_le64(bch2_checksum_final(&state)); a.lo ^= b.lo; a.hi ^= b.hi; return a; } int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, struct bversion version, struct bch_extent_crc_unpacked crc_old, struct bch_extent_crc_unpacked *crc_a, struct bch_extent_crc_unpacked *crc_b, unsigned len_a, unsigned len_b, unsigned new_csum_type) { struct bvec_iter iter = bio->bi_iter; struct nonce nonce = extent_nonce(version, crc_old); struct bch_csum merged = { 0 }; struct crc_split { struct bch_extent_crc_unpacked *crc; unsigned len; unsigned csum_type; struct bch_csum csum; } splits[3] = { { crc_a, len_a, new_csum_type, { 0 }}, { crc_b, len_b, new_csum_type, { 0 } }, { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } }, }, *i; bool mergeable = crc_old.csum_type == new_csum_type && bch2_checksum_mergeable(new_csum_type); unsigned crc_nonce = crc_old.nonce; BUG_ON(len_a + len_b > bio_sectors(bio)); BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); BUG_ON(crc_is_compressed(crc_old)); BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != bch2_csum_type_is_encryption(new_csum_type)); for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { iter.bi_size = i->len << 9; if (mergeable || i->crc) i->csum = __bch2_checksum_bio(c, i->csum_type, nonce, bio, &iter); else bio_advance_iter(bio, &iter, i->len << 9); nonce = nonce_add(nonce, i->len << 9); } if (mergeable) for (i = splits; i < splits + ARRAY_SIZE(splits); i++) merged = bch2_checksum_merge(new_csum_type, merged, i->csum, i->len << 9); else merged = bch2_checksum_bio(c, crc_old.csum_type, extent_nonce(version, crc_old), bio); if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" "expected %0llx:%0llx got %0llx:%0llx (old type ", __func__, crc_old.csum.hi, crc_old.csum.lo, merged.hi, merged.lo); bch2_prt_csum_type(&buf, crc_old.csum_type); prt_str(&buf, " new type "); bch2_prt_csum_type(&buf, new_csum_type); prt_str(&buf, ")"); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); return -EIO; } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { if (i->crc) *i->crc = (struct bch_extent_crc_unpacked) { .csum_type = i->csum_type, .compression_type = crc_old.compression_type, .compressed_size = i->len, .uncompressed_size = i->len, .offset = 0, .live_size = i->len, .nonce = crc_nonce, .csum = i->csum, }; if (bch2_csum_type_is_encryption(new_csum_type)) crc_nonce += i->len; } return 0; } /* BCH_SB_FIELD_crypt: */ static int bch2_sb_crypt_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&crypt->field), sizeof(*crypt)); return -BCH_ERR_invalid_sb_crypt; } if (BCH_CRYPT_KDF_TYPE(crypt)) { prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); return -BCH_ERR_invalid_sb_crypt; } return 0; } static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); prt_newline(out); prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); prt_newline(out); prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); prt_newline(out); prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_crypt = { .validate = bch2_sb_crypt_validate, .to_text = bch2_sb_crypt_to_text, }; #ifdef __KERNEL__ static int __bch2_request_key(char *key_description, struct bch_key *key) { struct key *keyring_key; const struct user_key_payload *ukp; int ret; keyring_key = request_key(&key_type_user, key_description, NULL); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); down_read(&keyring_key->sem); ukp = dereference_key_locked(keyring_key); if (ukp->datalen == sizeof(*key)) { memcpy(key, ukp->data, ukp->datalen); ret = 0; } else { ret = -EINVAL; } up_read(&keyring_key->sem); key_put(keyring_key); return ret; } #else #include <keyutils.h> static int __bch2_request_key(char *key_description, struct bch_key *key) { key_serial_t key_id; key_id = request_key("user", key_description, NULL, KEY_SPEC_SESSION_KEYRING); if (key_id >= 0) goto got_key; key_id = request_key("user", key_description, NULL, KEY_SPEC_USER_KEYRING); if (key_id >= 0) goto got_key; key_id = request_key("user", key_description, NULL, KEY_SPEC_USER_SESSION_KEYRING); if (key_id >= 0) goto got_key; return -errno; got_key: if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) return -1; return 0; } #include "crypto.h" #endif int bch2_request_key(struct bch_sb *sb, struct bch_key *key) { struct printbuf key_description = PRINTBUF; int ret; prt_printf(&key_description, "bcachefs:"); pr_uuid(&key_description, sb->user_uuid.b); ret = __bch2_request_key(key_description.buf, key); printbuf_exit(&key_description); #ifndef __KERNEL__ if (ret) { char *passphrase = read_passphrase("Enter passphrase: "); struct bch_encrypted_key sb_key; bch2_passphrase_check(sb, passphrase, key, &sb_key); ret = 0; } #endif /* stash with memfd, pass memfd fd to mount */ return ret; } #ifndef __KERNEL__ int bch2_revoke_key(struct bch_sb *sb) { key_serial_t key_id; struct printbuf key_description = PRINTBUF; prt_printf(&key_description, "bcachefs:"); pr_uuid(&key_description, sb->user_uuid.b); key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING); printbuf_exit(&key_description); if (key_id < 0) return errno; keyctl_revoke(key_id); return 0; } #endif int bch2_decrypt_sb_key(struct bch_fs *c, struct bch_sb_field_crypt *crypt, struct bch_key *key) { struct bch_encrypted_key sb_key = crypt->key; struct bch_key user_key; int ret = 0; /* is key encrypted? */ if (!bch2_key_is_encrypted(&sb_key)) goto out; ret = bch2_request_key(c->disk_sb.sb, &user_key); if (ret) { bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); goto err; } /* decrypt real key: */ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key)); if (ret) goto err; if (bch2_key_is_encrypted(&sb_key)) { bch_err(c, "incorrect encryption key"); ret = -EINVAL; goto err; } out: *key = sb_key.key; err: memzero_explicit(&sb_key, sizeof(sb_key)); memzero_explicit(&user_key, sizeof(user_key)); return ret; } static int bch2_alloc_ciphers(struct bch_fs *c) { int ret; if (!c->chacha20) c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); ret = PTR_ERR_OR_ZERO(c->chacha20); if (ret) { bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); return ret; } if (!c->poly1305) c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); ret = PTR_ERR_OR_ZERO(c->poly1305); if (ret) { bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); return ret; } return 0; } int bch2_disable_encryption(struct bch_fs *c) { struct bch_sb_field_crypt *crypt; struct bch_key key; int ret = -EINVAL; mutex_lock(&c->sb_lock); crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; /* is key encrypted? */ ret = 0; if (bch2_key_is_encrypted(&crypt->key)) goto out; ret = bch2_decrypt_sb_key(c, crypt, &key); if (ret) goto out; crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); crypt->key.key = key; SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); bch2_write_super(c); out: mutex_unlock(&c->sb_lock); return ret; } int bch2_enable_encryption(struct bch_fs *c, bool keyed) { struct bch_encrypted_key key; struct bch_key user_key; struct bch_sb_field_crypt *crypt; int ret = -EINVAL; mutex_lock(&c->sb_lock); /* Do we already have an encryption key? */ if (bch2_sb_field_get(c->disk_sb.sb, crypt)) goto err; ret = bch2_alloc_ciphers(c); if (ret) goto err; key.magic = cpu_to_le64(BCH_KEY_MAGIC); get_random_bytes(&key.key, sizeof(key.key)); if (keyed) { ret = bch2_request_key(c->disk_sb.sb, &user_key); if (ret) { bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); goto err; } ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), &key, sizeof(key)); if (ret) goto err; } ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto err; crypt = bch2_sb_field_resize(&c->disk_sb, crypt, sizeof(*crypt) / sizeof(u64)); if (!crypt) { ret = -BCH_ERR_ENOSPC_sb_crypt; goto err; } crypt->key = key; /* write superblock */ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); bch2_write_super(c); err: mutex_unlock(&c->sb_lock); memzero_explicit(&user_key, sizeof(user_key)); memzero_explicit(&key, sizeof(key)); return ret; } void bch2_fs_encryption_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->poly1305)) crypto_free_shash(c->poly1305); if (!IS_ERR_OR_NULL(c->chacha20)) crypto_free_sync_skcipher(c->chacha20); if (!IS_ERR_OR_NULL(c->sha256)) crypto_free_shash(c->sha256); } int bch2_fs_encryption_init(struct bch_fs *c) { struct bch_sb_field_crypt *crypt; struct bch_key key; int ret = 0; c->sha256 = crypto_alloc_shash("sha256", 0, 0); ret = PTR_ERR_OR_ZERO(c->sha256); if (ret) { bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); goto out; } crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) goto out; ret = bch2_alloc_ciphers(c); if (ret) goto out; ret = bch2_decrypt_sb_key(c, crypt, &key); if (ret) goto out; ret = crypto_skcipher_setkey(&c->chacha20->base, (void *) &key.key, sizeof(key.key)); if (ret) goto out; out: memzero_explicit(&key, sizeof(key)); return ret; } |
9 1 1 5 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2008-2009 Pablo Neira Ayuso <pablo@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/ip.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_conntrack.h> #include <linux/netfilter/xt_cluster.h> static inline u32 nf_ct_orig_ipv4_src(const struct nf_conn *ct) { return (__force u32)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; } static inline const u32 *nf_ct_orig_ipv6_src(const struct nf_conn *ct) { return (__force u32 *)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6; } static inline u_int32_t xt_cluster_hash_ipv4(u_int32_t ip, const struct xt_cluster_match_info *info) { return jhash_1word(ip, info->hash_seed); } static inline u_int32_t xt_cluster_hash_ipv6(const void *ip, const struct xt_cluster_match_info *info) { return jhash2(ip, NF_CT_TUPLE_L3SIZE / sizeof(__u32), info->hash_seed); } static inline u_int32_t xt_cluster_hash(const struct nf_conn *ct, const struct xt_cluster_match_info *info) { u_int32_t hash = 0; switch(nf_ct_l3num(ct)) { case AF_INET: hash = xt_cluster_hash_ipv4(nf_ct_orig_ipv4_src(ct), info); break; case AF_INET6: hash = xt_cluster_hash_ipv6(nf_ct_orig_ipv6_src(ct), info); break; default: WARN_ON(1); break; } return reciprocal_scale(hash, info->total_nodes); } static inline bool xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) { bool is_multicast = false; switch(family) { case NFPROTO_IPV4: is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr); break; case NFPROTO_IPV6: is_multicast = ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr); break; default: WARN_ON(1); break; } return is_multicast; } static bool xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct sk_buff *pskb = (struct sk_buff *)skb; const struct xt_cluster_match_info *info = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned long hash; /* This match assumes that all nodes see the same packets. This can be * achieved if the switch that connects the cluster nodes support some * sort of 'port mirroring'. However, if your switch does not support * this, your cluster nodes can reply ARP request using a multicast MAC * address. Thus, your switch will flood the same packets to the * cluster nodes with the same multicast MAC address. Using a multicast * link address is a RFC 1812 (section 3.3.2) violation, but this works * fine in practise. * * Unfortunately, if you use the multicast MAC address, the link layer * sets skbuff's pkt_type to PACKET_MULTICAST, which is not accepted * by TCP and others for packets coming to this node. For that reason, * this match mangles skbuff's pkt_type if it detects a packet * addressed to a unicast address but using PACKET_MULTICAST. Yes, I * know, matches should not alter packets, but we are doing this here * because we would need to add a PKTTYPE target for this sole purpose. */ if (!xt_cluster_is_multicast_addr(skb, xt_family(par)) && skb->pkt_type == PACKET_MULTICAST) { pskb->pkt_type = PACKET_HOST; } ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return false; if (ct->master) hash = xt_cluster_hash(ct->master, info); else hash = xt_cluster_hash(ct, info); return !!((1 << hash) & info->node_mask) ^ !!(info->flags & XT_CLUSTER_F_INV); } static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) { struct xt_cluster_match_info *info = par->matchinfo; int ret; if (info->total_nodes > XT_CLUSTER_NODES_MAX) { pr_info_ratelimited("you have exceeded the maximum number of cluster nodes (%u > %u)\n", info->total_nodes, XT_CLUSTER_NODES_MAX); return -EINVAL; } if (info->node_mask >= (1ULL << info->total_nodes)) { pr_info_ratelimited("node mask cannot exceed total number of nodes\n"); return -EDOM; } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match xt_cluster_match __read_mostly = { .name = "cluster", .family = NFPROTO_UNSPEC, .match = xt_cluster_mt, .checkentry = xt_cluster_mt_checkentry, .matchsize = sizeof(struct xt_cluster_match_info), .destroy = xt_cluster_mt_destroy, .me = THIS_MODULE, }; static int __init xt_cluster_mt_init(void) { return xt_register_match(&xt_cluster_match); } static void __exit xt_cluster_mt_fini(void) { xt_unregister_match(&xt_cluster_match); } MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: hash-based cluster match"); MODULE_ALIAS("ipt_cluster"); MODULE_ALIAS("ip6t_cluster"); module_init(xt_cluster_mt_init); module_exit(xt_cluster_mt_fini); |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Header for use in defining a given L4 protocol for connection tracking. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalized L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_protcol.h */ #ifndef _NF_CONNTRACK_L4PROTO_H #define _NF_CONNTRACK_L4PROTO_H #include <linux/netlink.h> #include <net/netlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netns/generic.h> struct seq_file; struct nf_conntrack_l4proto { /* L4 Protocol number. */ u_int8_t l4proto; /* Resolve clashes on insertion races. */ bool allow_clash; /* protoinfo nlattr size, closes a hole */ u16 nlattr_size; /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); /* Calculate tuple nlattr size */ unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); const struct nla_policy *nla_policy; struct { int (*nlattr_to_obj)(struct nlattr *tb[], struct net *net, void *data); int (*obj_to_nlattr)(struct sk_buff *skb, const void *data); u16 obj_size; u16 nlattr_max; const struct nla_policy *nla_policy; } ctnl_timeout; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ void (*print_conntrack)(struct seq_file *s, struct nf_conn *); #endif }; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state, u8 l4proto, union nf_inet_addr *outer_daddr); int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmp_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udplite_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_gre_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); void nf_conntrack_generic_init_net(struct net *net); void nf_conntrack_tcp_init_net(struct net *net); void nf_conntrack_udp_init_net(struct net *net); void nf_conntrack_gre_init_net(struct net *net); void nf_conntrack_dccp_init_net(struct net *net); void nf_conntrack_sctp_init_net(struct net *net); void nf_conntrack_icmp_init_net(struct net *net); void nf_conntrack_icmpv6_init_net(struct net *net); /* Existing built-in generic protocol */ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO IPPROTO_UDPLITE const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...); __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...); #else static inline __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...) {} static inline __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...) { } #endif /* CONFIG_SYSCTL */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) static inline struct nf_generic_net *nf_generic_pernet(struct net *net) { return &net->ct.nf_ct_proto.generic; } static inline struct nf_tcp_net *nf_tcp_pernet(struct net *net) { return &net->ct.nf_ct_proto.tcp; } static inline struct nf_udp_net *nf_udp_pernet(struct net *net) { return &net->ct.nf_ct_proto.udp; } static inline struct nf_icmp_net *nf_icmp_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmp; } static inline struct nf_icmp_net *nf_icmpv6_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmpv6; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline void nf_ct_set_tcp_be_liberal(struct nf_conn *ct) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) { return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && test_bit(IPS_ASSURED_BIT, &ct->status); } #endif #ifdef CONFIG_NF_CT_PROTO_DCCP static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net) { return &net->ct.nf_ct_proto.dccp; } #endif #ifdef CONFIG_NF_CT_PROTO_SCTP static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { return &net->ct.nf_ct_proto.sctp; } #endif #ifdef CONFIG_NF_CT_PROTO_GRE static inline struct nf_gre_net *nf_gre_pernet(struct net *net) { return &net->ct.nf_ct_proto.gre; } #endif #endif /*_NF_CONNTRACK_PROTOCOL_H*/ |
10 4679 4675 4 4658 4684 2 2 8 8 4681 12 4 4684 1 1 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/extable.h> #include <linux/uaccess.h> #include <linux/sched/debug.h> #include <linux/bitfield.h> #include <xen/xen.h> #include <asm/fpu/api.h> #include <asm/fred.h> #include <asm/sev.h> #include <asm/traps.h> #include <asm/kdebug.h> #include <asm/insn-eval.h> #include <asm/sgx.h> static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr) { int reg_offset = pt_regs_offset(regs, nr); static unsigned long __dummy; if (WARN_ON_ONCE(reg_offset < 0)) return &__dummy; return (unsigned long *)((unsigned long)regs + reg_offset); } static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } static bool ex_handler_default(const struct exception_table_entry *e, struct pt_regs *regs) { if (e->data & EX_FLAG_CLEAR_AX) regs->ax = 0; if (e->data & EX_FLAG_CLEAR_DX) regs->dx = 0; regs->ip = ex_fixup_addr(e); return true; } /* * This is the *very* rare case where we do a "load_unaligned_zeropad()" * and it's a page crosser into a non-existent page. * * This happens when we optimistically load a pathname a word-at-a-time * and the name is less than the full word and the next page is not * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC. * * NOTE! The faulting address is always a 'mov mem,reg' type instruction * of size 'long', and the exception fixup must always point to right * after the instruction. */ static bool ex_handler_zeropad(const struct exception_table_entry *e, struct pt_regs *regs, unsigned long fault_addr) { struct insn insn; const unsigned long mask = sizeof(long) - 1; unsigned long offset, addr, next_ip, len; unsigned long *reg; next_ip = ex_fixup_addr(e); len = next_ip - regs->ip; if (len > MAX_INSN_SIZE) return false; if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN)) return false; if (insn.length != len) return false; if (insn.opcode.bytes[0] != 0x8b) return false; if (insn.opnd_bytes != sizeof(long)) return false; addr = (unsigned long) insn_get_addr_ref(&insn, regs); if (addr == ~0ul) return false; offset = addr & mask; addr = addr & ~mask; if (fault_addr != addr + sizeof(long)) return false; reg = insn_get_modrm_reg_ptr(&insn, regs); if (!reg) return false; *reg = *(unsigned long *)addr >> (offset * 8); return ex_handler_default(e, regs); } static bool ex_handler_fault(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { regs->ax = trapnr; return ex_handler_default(fixup, regs); } static bool ex_handler_sgx(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG; return ex_handler_default(fixup, regs); } /* * Handler for when we fail to restore a task's FPU state. We should never get * here because the FPU state of a task using the FPU (task->thread.fpu.state) * should always be valid. However, past bugs have allowed userspace to set * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). * These caused XRSTOR to fail when switching to the task, leaking the FPU * registers of the task previously executing on the CPU. Mitigate this class * of vulnerability by restoring from the initial state (essentially, zeroing * out all the FPU registers) if we can't restore from the task's FPU state. */ static bool ex_handler_fprestore(const struct exception_table_entry *fixup, struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); fpu_reset_from_exception_fixup(); return true; } /* * On x86-64, we end up being imprecise with 'access_ok()', and allow * non-canonical user addresses to make the range comparisons simpler, * and to not have to worry about LAM being enabled. * * In fact, we allow up to one page of "slop" at the sign boundary, * which means that we can do access_ok() by just checking the sign * of the pointer for the common case of having a small access size. */ static bool gp_fault_address_ok(unsigned long fault_address) { #ifdef CONFIG_X86_64 /* Is it in the "user space" part of the non-canonical space? */ if (valid_user_address(fault_address)) return true; /* .. or just above it? */ fault_address -= PAGE_SIZE; if (valid_user_address(fault_address)) return true; #endif return false; } static bool ex_handler_uaccess(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long fault_address) { WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address), "General protection fault in user access. Non-canonical address?"); return ex_handler_default(fixup, regs); } static bool ex_handler_msr(const struct exception_table_entry *fixup, struct pt_regs *regs, bool wrmsr, bool safe, int reg) { if (__ONCE_LITE_IF(!safe && wrmsr)) { pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, regs->ip, (void *)regs->ip); show_stack_regs(regs); } if (__ONCE_LITE_IF(!safe && !wrmsr)) { pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip); show_stack_regs(regs); } if (!wrmsr) { /* Pretend that the read succeeded and returned 0. */ regs->ax = 0; regs->dx = 0; } if (safe) *pt_regs_nr(regs, reg) = -EIO; return ex_handler_default(fixup, regs); } static bool ex_handler_clear_fs(const struct exception_table_entry *fixup, struct pt_regs *regs) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); return ex_handler_default(fixup, regs); } static bool ex_handler_imm_reg(const struct exception_table_entry *fixup, struct pt_regs *regs, int reg, int imm) { *pt_regs_nr(regs, reg) = (long)imm; return ex_handler_default(fixup, regs); } static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long fault_address, int reg, int imm) { regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg); return ex_handler_uaccess(fixup, regs, trapnr, fault_address); } #ifdef CONFIG_X86_FRED static bool ex_handler_eretu(const struct exception_table_entry *fixup, struct pt_regs *regs, unsigned long error_code) { struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax)); unsigned short ss = uregs->ss; unsigned short cs = uregs->cs; /* * Move the NMI bit from the invalid stack frame, which caused ERETU * to fault, to the fault handler's stack frame, thus to unblock NMI * with the fault handler's ERETS instruction ASAP if NMI is blocked. */ regs->fred_ss.nmi = uregs->fred_ss.nmi; /* * Sync event information to uregs, i.e., the ERETU return frame, but * is it safe to write to the ERETU return frame which is just above * current event stack frame? * * The RSP used by FRED to push a stack frame is not the value in %rsp, * it is calculated from %rsp with the following 2 steps: * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0) // Reserve N*64 bytes * 2) RSP = RSP & ~0x3f // Align to a 64-byte cache line * when an event delivery doesn't trigger a stack level change. * * Here is an example with N*64 (N=1) bytes reserved: * * 64-byte cache line ==> ______________ * |___Reserved___| * |__Event_data__| * |_____SS_______| * |_____RSP______| * |_____FLAGS____| * |_____CS_______| * |_____IP_______| * 64-byte cache line ==> |__Error_code__| <== ERETU return frame * |______________| * |______________| * |______________| * |______________| * |______________| * |______________| * |______________| * 64-byte cache line ==> |______________| <== RSP after step 1) and 2) * |___Reserved___| * |__Event_data__| * |_____SS_______| * |_____RSP______| * |_____FLAGS____| * |_____CS_______| * |_____IP_______| * 64-byte cache line ==> |__Error_code__| <== ERETS return frame * * Thus a new FRED stack frame will always be pushed below a previous * FRED stack frame ((N*64) bytes may be reserved between), and it is * safe to write to a previous FRED stack frame as they never overlap. */ fred_info(uregs)->edata = fred_event_data(regs); uregs->ssx = regs->ssx; uregs->fred_ss.ss = ss; /* The NMI bit was moved away above */ uregs->fred_ss.nmi = 0; uregs->csx = regs->csx; uregs->fred_cs.sl = 0; uregs->fred_cs.wfe = 0; uregs->cs = cs; uregs->orig_ax = error_code; return ex_handler_default(fixup, regs); } #endif int ex_get_fixup_type(unsigned long ip) { const struct exception_table_entry *e = search_exception_tables(ip); return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct exception_table_entry *e; int type, reg, imm; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; extern u32 pnp_bios_is_utter_crap; pnp_bios_is_utter_crap = 1; printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); __asm__ volatile( "movl %0, %%esp\n\t" "jmp *%1\n\t" : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); panic("do_trap: can't hit this"); } #endif e = search_exception_tables(regs->ip); if (!e) return 0; type = FIELD_GET(EX_DATA_TYPE_MASK, e->data); reg = FIELD_GET(EX_DATA_REG_MASK, e->data); imm = FIELD_GET(EX_DATA_IMM_MASK, e->data); switch (type) { case EX_TYPE_DEFAULT: case EX_TYPE_DEFAULT_MCE_SAFE: return ex_handler_default(e, regs); case EX_TYPE_FAULT: case EX_TYPE_FAULT_MCE_SAFE: return ex_handler_fault(e, regs, trapnr); case EX_TYPE_UACCESS: return ex_handler_uaccess(e, regs, trapnr, fault_addr); case EX_TYPE_CLEAR_FS: return ex_handler_clear_fs(e, regs); case EX_TYPE_FPU_RESTORE: return ex_handler_fprestore(e, regs); case EX_TYPE_BPF: return ex_handler_bpf(e, regs); case EX_TYPE_WRMSR: return ex_handler_msr(e, regs, true, false, reg); case EX_TYPE_RDMSR: return ex_handler_msr(e, regs, false, false, reg); case EX_TYPE_WRMSR_SAFE: return ex_handler_msr(e, regs, true, true, reg); case EX_TYPE_RDMSR_SAFE: return ex_handler_msr(e, regs, false, true, reg); case EX_TYPE_WRMSR_IN_MCE: ex_handler_msr_mce(regs, true); break; case EX_TYPE_RDMSR_IN_MCE: ex_handler_msr_mce(regs, false); break; case EX_TYPE_POP_REG: regs->sp += sizeof(long); fallthrough; case EX_TYPE_IMM_REG: return ex_handler_imm_reg(e, regs, reg, imm); case EX_TYPE_FAULT_SGX: return ex_handler_sgx(e, regs, trapnr); case EX_TYPE_UCOPY_LEN: return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); case EX_TYPE_ZEROPAD: return ex_handler_zeropad(e, regs, fault_addr); #ifdef CONFIG_X86_FRED case EX_TYPE_ERETU: return ex_handler_eretu(e, regs, error_code); #endif } BUG(); } extern unsigned int early_recursion_flag; /* Restricted version used during very early boot */ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { /* Ignore early NMIs. */ if (trapnr == X86_TRAP_NMI) return; if (early_recursion_flag > 2) goto halt_loop; /* * Old CPUs leave the high bits of CS on the stack * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. * Xen pv domains are not using the default __KERNEL_CS. */ if (!xen_pv_domain() && regs->cs != __KERNEL_CS) goto fail; /* * The full exception fixup machinery is available as soon as * the early IDT is loaded. This means that it is the * responsibility of extable users to either function correctly * when handlers are invoked early or to simply avoid causing * exceptions before they're ready to handle them. * * This is better than filtering which handlers can be used, * because refusing to call a handler here is guaranteed to * result in a hard-to-debug panic. * * Keep in mind that not all vectors actually get here. Early * page faults, for example, are special. */ if (fixup_exception(regs, trapnr, regs->orig_ax, 0)) return; if (trapnr == X86_TRAP_UD) { if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { /* Skip the ud2. */ regs->ip += LEN_UD2; return; } /* * If this was a BUG and report_bug returns or if this * was just a normal #UD, we want to continue onward and * crash. */ } fail: early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, regs->orig_ax, read_cr2()); show_regs(regs); halt_loop: while (true) halt(); } |
48 36 12 48 48 1 1 35 23 1 1 33 33 63 63 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * HMAC: Keyed-Hashing for Message Authentication (RFC2104). * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> * * The HMAC implementation is derived from USAGI. * Copyright (c) 2002 Kazunori Miyazawa <miyazawa@linux-ipv6.org> / USAGI */ #include <crypto/hmac.h> #include <crypto/internal/hash.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/string.h> struct hmac_ctx { struct crypto_shash *hash; /* Contains 'u8 ipad[statesize];', then 'u8 opad[statesize];' */ u8 pads[]; }; static int hmac_setkey(struct crypto_shash *parent, const u8 *inkey, unsigned int keylen) { int bs = crypto_shash_blocksize(parent); int ds = crypto_shash_digestsize(parent); int ss = crypto_shash_statesize(parent); struct hmac_ctx *tctx = crypto_shash_ctx(parent); struct crypto_shash *hash = tctx->hash; u8 *ipad = &tctx->pads[0]; u8 *opad = &tctx->pads[ss]; SHASH_DESC_ON_STACK(shash, hash); unsigned int i; if (fips_enabled && (keylen < 112 / 8)) return -EINVAL; shash->tfm = hash; if (keylen > bs) { int err; err = crypto_shash_digest(shash, inkey, keylen, ipad); if (err) return err; keylen = ds; } else memcpy(ipad, inkey, keylen); memset(ipad + keylen, 0, bs - keylen); memcpy(opad, ipad, bs); for (i = 0; i < bs; i++) { ipad[i] ^= HMAC_IPAD_VALUE; opad[i] ^= HMAC_OPAD_VALUE; } return crypto_shash_init(shash) ?: crypto_shash_update(shash, ipad, bs) ?: crypto_shash_export(shash, ipad) ?: crypto_shash_init(shash) ?: crypto_shash_update(shash, opad, bs) ?: crypto_shash_export(shash, opad); } static int hmac_export(struct shash_desc *pdesc, void *out) { struct shash_desc *desc = shash_desc_ctx(pdesc); return crypto_shash_export(desc, out); } static int hmac_import(struct shash_desc *pdesc, const void *in) { struct shash_desc *desc = shash_desc_ctx(pdesc); const struct hmac_ctx *tctx = crypto_shash_ctx(pdesc->tfm); desc->tfm = tctx->hash; return crypto_shash_import(desc, in); } static int hmac_init(struct shash_desc *pdesc) { const struct hmac_ctx *tctx = crypto_shash_ctx(pdesc->tfm); return hmac_import(pdesc, &tctx->pads[0]); } static int hmac_update(struct shash_desc *pdesc, const u8 *data, unsigned int nbytes) { struct shash_desc *desc = shash_desc_ctx(pdesc); return crypto_shash_update(desc, data, nbytes); } static int hmac_final(struct shash_desc *pdesc, u8 *out) { struct crypto_shash *parent = pdesc->tfm; int ds = crypto_shash_digestsize(parent); int ss = crypto_shash_statesize(parent); const struct hmac_ctx *tctx = crypto_shash_ctx(parent); const u8 *opad = &tctx->pads[ss]; struct shash_desc *desc = shash_desc_ctx(pdesc); return crypto_shash_final(desc, out) ?: crypto_shash_import(desc, opad) ?: crypto_shash_finup(desc, out, ds, out); } static int hmac_finup(struct shash_desc *pdesc, const u8 *data, unsigned int nbytes, u8 *out) { struct crypto_shash *parent = pdesc->tfm; int ds = crypto_shash_digestsize(parent); int ss = crypto_shash_statesize(parent); const struct hmac_ctx *tctx = crypto_shash_ctx(parent); const u8 *opad = &tctx->pads[ss]; struct shash_desc *desc = shash_desc_ctx(pdesc); return crypto_shash_finup(desc, data, nbytes, out) ?: crypto_shash_import(desc, opad) ?: crypto_shash_finup(desc, out, ds, out); } static int hmac_init_tfm(struct crypto_shash *parent) { struct crypto_shash *hash; struct shash_instance *inst = shash_alg_instance(parent); struct crypto_shash_spawn *spawn = shash_instance_ctx(inst); struct hmac_ctx *tctx = crypto_shash_ctx(parent); hash = crypto_spawn_shash(spawn); if (IS_ERR(hash)) return PTR_ERR(hash); parent->descsize = sizeof(struct shash_desc) + crypto_shash_descsize(hash); tctx->hash = hash; return 0; } static int hmac_clone_tfm(struct crypto_shash *dst, struct crypto_shash *src) { struct hmac_ctx *sctx = crypto_shash_ctx(src); struct hmac_ctx *dctx = crypto_shash_ctx(dst); struct crypto_shash *hash; hash = crypto_clone_shash(sctx->hash); if (IS_ERR(hash)) return PTR_ERR(hash); dctx->hash = hash; return 0; } static void hmac_exit_tfm(struct crypto_shash *parent) { struct hmac_ctx *tctx = crypto_shash_ctx(parent); crypto_free_shash(tctx->hash); } static int hmac_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_shash_spawn *spawn; struct crypto_alg *alg; struct shash_alg *salg; u32 mask; int err; int ds; int ss; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_shash(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; salg = crypto_spawn_shash_alg(spawn); alg = &salg->base; /* The underlying hash algorithm must not require a key */ err = -EINVAL; if (crypto_shash_alg_needs_key(salg)) goto err_free_inst; ds = salg->digestsize; ss = salg->statesize; if (ds > alg->cra_blocksize || ss < alg->cra_blocksize) goto err_free_inst; err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = alg->cra_blocksize; inst->alg.base.cra_ctxsize = sizeof(struct hmac_ctx) + (ss * 2); inst->alg.digestsize = ds; inst->alg.statesize = ss; inst->alg.init = hmac_init; inst->alg.update = hmac_update; inst->alg.final = hmac_final; inst->alg.finup = hmac_finup; inst->alg.export = hmac_export; inst->alg.import = hmac_import; inst->alg.setkey = hmac_setkey; inst->alg.init_tfm = hmac_init_tfm; inst->alg.clone_tfm = hmac_clone_tfm; inst->alg.exit_tfm = hmac_exit_tfm; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template hmac_tmpl = { .name = "hmac", .create = hmac_create, .module = THIS_MODULE, }; static int __init hmac_module_init(void) { return crypto_register_template(&hmac_tmpl); } static void __exit hmac_module_exit(void) { crypto_unregister_template(&hmac_tmpl); } subsys_initcall(hmac_module_init); module_exit(hmac_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("HMAC hash algorithm"); MODULE_ALIAS_CRYPTO("hmac"); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | /* * linux/fs/hfs/part_tbl.c * * Copyright (C) 1996-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * Original code to handle the new style Mac partition table based on * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). */ #include "hfs_fs.h" /* * The new style Mac partition map * * For each partition on the media there is a physical block (512-byte * block) containing one of these structures. These blocks are * contiguous starting at block 1. */ struct new_pmap { __be16 pmSig; /* signature */ __be16 reSigPad; /* padding */ __be32 pmMapBlkCnt; /* partition blocks count */ __be32 pmPyPartStart; /* physical block start of partition */ __be32 pmPartBlkCnt; /* physical block count of partition */ u8 pmPartName[32]; /* (null terminated?) string giving the name of this partition */ u8 pmPartType[32]; /* (null terminated?) string giving the type of this partition */ /* a bunch more stuff we don't need */ } __packed; /* * The old style Mac partition map * * The partition map consists for a 2-byte signature followed by an * array of these structures. The map is terminated with an all-zero * one of these. */ struct old_pmap { __be16 pdSig; /* Signature bytes */ struct old_pmap_entry { __be32 pdStart; __be32 pdSize; __be32 pdFSID; } pdEntry[42]; } __packed; /* * hfs_part_find() * * Parse the partition map looking for the * start and length of the 'part'th HFS partition. */ int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size) { struct buffer_head *bh; __be16 *data; int i, size, res; res = -ENOENT; bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK, data); if (!bh) return -EIO; switch (be16_to_cpu(*data)) { case HFS_OLD_PMAP_MAGIC: { struct old_pmap *pm; struct old_pmap_entry *p; pm = (struct old_pmap *)bh->b_data; p = pm->pdEntry; size = 42; for (i = 0; i < size; p++, i++) { if (p->pdStart && p->pdSize && p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) { *part_start += be32_to_cpu(p->pdStart); *part_size = be32_to_cpu(p->pdSize); res = 0; } } break; } case HFS_NEW_PMAP_MAGIC: { struct new_pmap *pm; pm = (struct new_pmap *)bh->b_data; size = be32_to_cpu(pm->pmMapBlkCnt); for (i = 0; i < size;) { if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && (HFS_SB(sb)->part < 0 || HFS_SB(sb)->part == i)) { *part_start += be32_to_cpu(pm->pmPyPartStart); *part_size = be32_to_cpu(pm->pmPartBlkCnt); res = 0; break; } brelse(bh); bh = sb_bread512(sb, *part_start + HFS_PMAP_BLK + ++i, pm); if (!bh) return -EIO; if (pm->pmSig != cpu_to_be16(HFS_NEW_PMAP_MAGIC)) break; } break; } } brelse(bh); return res; } |
3 3 2 3 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 | // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * X.25 002 Jonathan Naylor New timer architecture. * mar/20/00 Daniela Squassoni Disabling/enabling of facilities * negotiation. * 2000-09-04 Henner Eisen dev_hold() / dev_put() for x25_neigh. */ #define pr_fmt(fmt) "X25: " fmt #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/uaccess.h> #include <linux/init.h> #include <net/x25.h> LIST_HEAD(x25_neigh_list); DEFINE_RWLOCK(x25_neigh_list_lock); static void x25_t20timer_expiry(struct timer_list *); static void x25_transmit_restart_confirmation(struct x25_neigh *nb); static void x25_transmit_restart_request(struct x25_neigh *nb); /* * Linux set/reset timer routines */ static inline void x25_start_t20timer(struct x25_neigh *nb) { mod_timer(&nb->t20timer, jiffies + nb->t20); } static void x25_t20timer_expiry(struct timer_list *t) { struct x25_neigh *nb = from_timer(nb, t, t20timer); x25_transmit_restart_request(nb); x25_start_t20timer(nb); } static inline void x25_stop_t20timer(struct x25_neigh *nb) { del_timer(&nb->t20timer); } /* * This handles all restart and diagnostic frames. */ void x25_link_control(struct sk_buff *skb, struct x25_neigh *nb, unsigned short frametype) { struct sk_buff *skbn; switch (frametype) { case X25_RESTART_REQUEST: switch (nb->state) { case X25_LINK_STATE_0: /* This can happen when the x25 module just gets loaded * and doesn't know layer 2 has already connected */ nb->state = X25_LINK_STATE_3; x25_transmit_restart_confirmation(nb); break; case X25_LINK_STATE_2: x25_stop_t20timer(nb); nb->state = X25_LINK_STATE_3; break; case X25_LINK_STATE_3: /* clear existing virtual calls */ x25_kill_by_neigh(nb); x25_transmit_restart_confirmation(nb); break; } break; case X25_RESTART_CONFIRMATION: switch (nb->state) { case X25_LINK_STATE_2: x25_stop_t20timer(nb); nb->state = X25_LINK_STATE_3; break; case X25_LINK_STATE_3: /* clear existing virtual calls */ x25_kill_by_neigh(nb); x25_transmit_restart_request(nb); nb->state = X25_LINK_STATE_2; x25_start_t20timer(nb); break; } break; case X25_DIAGNOSTIC: if (!pskb_may_pull(skb, X25_STD_MIN_LEN + 4)) break; pr_warn("diagnostic #%d - %02X %02X %02X\n", skb->data[3], skb->data[4], skb->data[5], skb->data[6]); break; default: pr_warn("received unknown %02X with LCI 000\n", frametype); break; } if (nb->state == X25_LINK_STATE_3) while ((skbn = skb_dequeue(&nb->queue)) != NULL) x25_send_frame(skbn, nb); } /* * This routine is called when a Restart Request is needed */ static void x25_transmit_restart_request(struct x25_neigh *nb) { unsigned char *dptr; int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN + 2; struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); if (!skb) return; skb_reserve(skb, X25_MAX_L2_LEN); dptr = skb_put(skb, X25_STD_MIN_LEN + 2); *dptr++ = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ; *dptr++ = 0x00; *dptr++ = X25_RESTART_REQUEST; *dptr++ = 0x00; *dptr++ = 0; skb->sk = NULL; x25_send_frame(skb, nb); } /* * This routine is called when a Restart Confirmation is needed */ static void x25_transmit_restart_confirmation(struct x25_neigh *nb) { unsigned char *dptr; int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN; struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); if (!skb) return; skb_reserve(skb, X25_MAX_L2_LEN); dptr = skb_put(skb, X25_STD_MIN_LEN); *dptr++ = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ; *dptr++ = 0x00; *dptr++ = X25_RESTART_CONFIRMATION; skb->sk = NULL; x25_send_frame(skb, nb); } /* * This routine is called when a Clear Request is needed outside of the context * of a connected socket. */ void x25_transmit_clear_request(struct x25_neigh *nb, unsigned int lci, unsigned char cause) { unsigned char *dptr; int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN + 2; struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); if (!skb) return; skb_reserve(skb, X25_MAX_L2_LEN); dptr = skb_put(skb, X25_STD_MIN_LEN + 2); *dptr++ = ((lci >> 8) & 0x0F) | (nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ); *dptr++ = (lci >> 0) & 0xFF; *dptr++ = X25_CLEAR_REQUEST; *dptr++ = cause; *dptr++ = 0x00; skb->sk = NULL; x25_send_frame(skb, nb); } void x25_transmit_link(struct sk_buff *skb, struct x25_neigh *nb) { switch (nb->state) { case X25_LINK_STATE_0: skb_queue_tail(&nb->queue, skb); nb->state = X25_LINK_STATE_1; x25_establish_link(nb); break; case X25_LINK_STATE_1: case X25_LINK_STATE_2: skb_queue_tail(&nb->queue, skb); break; case X25_LINK_STATE_3: x25_send_frame(skb, nb); break; } } /* * Called when the link layer has become established. */ void x25_link_established(struct x25_neigh *nb) { switch (nb->state) { case X25_LINK_STATE_0: case X25_LINK_STATE_1: x25_transmit_restart_request(nb); nb->state = X25_LINK_STATE_2; x25_start_t20timer(nb); break; } } /* * Called when the link layer has terminated, or an establishment * request has failed. */ void x25_link_terminated(struct x25_neigh *nb) { nb->state = X25_LINK_STATE_0; skb_queue_purge(&nb->queue); x25_stop_t20timer(nb); /* Out of order: clear existing virtual calls (X.25 03/93 4.6.3) */ x25_kill_by_neigh(nb); } /* * Add a new device. */ void x25_link_device_up(struct net_device *dev) { struct x25_neigh *nb = kmalloc(sizeof(*nb), GFP_ATOMIC); if (!nb) return; skb_queue_head_init(&nb->queue); timer_setup(&nb->t20timer, x25_t20timer_expiry, 0); dev_hold(dev); nb->dev = dev; nb->state = X25_LINK_STATE_0; nb->extended = 0; /* * Enables negotiation */ nb->global_facil_mask = X25_MASK_REVERSE | X25_MASK_THROUGHPUT | X25_MASK_PACKET_SIZE | X25_MASK_WINDOW_SIZE; nb->t20 = sysctl_x25_restart_request_timeout; refcount_set(&nb->refcnt, 1); write_lock_bh(&x25_neigh_list_lock); list_add(&nb->node, &x25_neigh_list); write_unlock_bh(&x25_neigh_list_lock); } /** * __x25_remove_neigh - remove neighbour from x25_neigh_list * @nb: - neigh to remove * * Remove neighbour from x25_neigh_list. If it was there. * Caller must hold x25_neigh_list_lock. */ static void __x25_remove_neigh(struct x25_neigh *nb) { if (nb->node.next) { list_del(&nb->node); x25_neigh_put(nb); } } /* * A device has been removed, remove its links. */ void x25_link_device_down(struct net_device *dev) { struct x25_neigh *nb; struct list_head *entry, *tmp; write_lock_bh(&x25_neigh_list_lock); list_for_each_safe(entry, tmp, &x25_neigh_list) { nb = list_entry(entry, struct x25_neigh, node); if (nb->dev == dev) { __x25_remove_neigh(nb); dev_put(dev); } } write_unlock_bh(&x25_neigh_list_lock); } /* * Given a device, return the neighbour address. */ struct x25_neigh *x25_get_neigh(struct net_device *dev) { struct x25_neigh *nb, *use = NULL; read_lock_bh(&x25_neigh_list_lock); list_for_each_entry(nb, &x25_neigh_list, node) { if (nb->dev == dev) { use = nb; break; } } if (use) x25_neigh_hold(use); read_unlock_bh(&x25_neigh_list_lock); return use; } /* * Handle the ioctls that control the subscription functions. */ int x25_subscr_ioctl(unsigned int cmd, void __user *arg) { struct x25_subscrip_struct x25_subscr; struct x25_neigh *nb; struct net_device *dev; int rc = -EINVAL; if (cmd != SIOCX25GSUBSCRIP && cmd != SIOCX25SSUBSCRIP) goto out; rc = -EFAULT; if (copy_from_user(&x25_subscr, arg, sizeof(x25_subscr))) goto out; rc = -EINVAL; if ((dev = x25_dev_get(x25_subscr.device)) == NULL) goto out; if ((nb = x25_get_neigh(dev)) == NULL) goto out_dev_put; dev_put(dev); if (cmd == SIOCX25GSUBSCRIP) { read_lock_bh(&x25_neigh_list_lock); x25_subscr.extended = nb->extended; x25_subscr.global_facil_mask = nb->global_facil_mask; read_unlock_bh(&x25_neigh_list_lock); rc = copy_to_user(arg, &x25_subscr, sizeof(x25_subscr)) ? -EFAULT : 0; } else { rc = -EINVAL; if (!(x25_subscr.extended && x25_subscr.extended != 1)) { rc = 0; write_lock_bh(&x25_neigh_list_lock); nb->extended = x25_subscr.extended; nb->global_facil_mask = x25_subscr.global_facil_mask; write_unlock_bh(&x25_neigh_list_lock); } } x25_neigh_put(nb); out: return rc; out_dev_put: dev_put(dev); goto out; } /* * Release all memory associated with X.25 neighbour structures. */ void __exit x25_link_free(void) { struct x25_neigh *nb; struct list_head *entry, *tmp; write_lock_bh(&x25_neigh_list_lock); list_for_each_safe(entry, tmp, &x25_neigh_list) { struct net_device *dev; nb = list_entry(entry, struct x25_neigh, node); dev = nb->dev; __x25_remove_neigh(nb); dev_put(dev); } write_unlock_bh(&x25_neigh_list_lock); } |
2 2 1 1 1 5 5 1 2 1 4 5 5 6 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | // SPDX-License-Identifier: GPL-2.0-only /* * Binary Increase Congestion control for TCP * Home page: * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC * This is from the implementation of BICTCP in * Lison-Xu, Kahaled Harfoush, and Injong Rhee. * "Binary Increase Congestion Control for Fast, Long Distance * Networks" in InfoComm 2004 * Available from: * http://netsrv.csc.ncsu.edu/export/bitcp.pdf * * Unless BIC is enabled and congestion window is large * this behaves the same as the original Reno. */ #include <linux/mm.h> #include <linux/module.h> #include <net/tcp.h> #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta */ #define BICTCP_B 4 /* * In binary search, * go to point (max+min)/N */ static int fast_convergence = 1; static int max_increment = 16; static int low_window = 14; static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ static int initial_ssthresh; static int smooth_part = 20; module_param(fast_convergence, int, 0644); MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); module_param(max_increment, int, 0644); MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); module_param(low_window, int, 0644); MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(smooth_part, int, 0644); MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ u32 last_max_cwnd; /* last maximum snd_cwnd */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 epoch_start; /* beginning of an epoch */ #define ACK_RATIO_SHIFT 4 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ }; static inline void bictcp_reset(struct bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; ca->epoch_start = 0; ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } static void bictcp_init(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); if (initial_ssthresh) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } /* * Compute congestion window to use. */ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { if (ca->last_cwnd == cwnd && (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; ca->last_cwnd = cwnd; ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) /* record the beginning of an epoch */ ca->epoch_start = tcp_jiffies32; /* start off normal */ if (cwnd <= low_window) { ca->cnt = cwnd; return; } /* binary increase */ if (cwnd < ca->last_max_cwnd) { __u32 dist = (ca->last_max_cwnd - cwnd) / BICTCP_B; if (dist > max_increment) /* linear increase */ ca->cnt = cwnd / max_increment; else if (dist <= 1U) /* binary search increase */ ca->cnt = (cwnd * smooth_part) / BICTCP_B; else /* binary search increase */ ca->cnt = cwnd / dist; } else { /* slow start AMD linear increase */ if (cwnd < ca->last_max_cwnd + BICTCP_B) /* slow start */ ca->cnt = (cwnd * smooth_part) / BICTCP_B; else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) /* slow start */ ca->cnt = (cwnd * (BICTCP_B-1)) / (cwnd - ca->last_max_cwnd); else /* linear increase */ ca->cnt = cwnd / max_increment; } /* if in slow start or link utilization is very low */ if (ca->last_max_cwnd == 0) { if (ca->cnt > 20) /* increase cwnd 5% per RTT */ ca->cnt = 20; } ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; if (ca->cnt == 0) /* cannot be zero */ ca->cnt = 1; } static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } bictcp_update(ca, tcp_snd_cwnd(tp)); tcp_cong_avoid_ai(tp, ca->cnt, acked); } /* * behave like Reno until low_window is reached, * then increase congestion window slowly */ static u32 bictcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ /* Wmax and fast convergence */ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence) ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta)) / (2 * BICTCP_BETA_SCALE); else ca->last_max_cwnd = tcp_snd_cwnd(tp); if (tcp_snd_cwnd(tp) <= low_window) return max(tcp_snd_cwnd(tp) >> 1U, 2U); else return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U); } static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) bictcp_reset(inet_csk_ca(sk)); } /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); ca->delayed_ack += sample->pkts_acked - (ca->delayed_ack >> ACK_RATIO_SHIFT); } } static struct tcp_congestion_ops bictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = tcp_reno_undo_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "bic", }; static int __init bictcp_register(void) { BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&bictcp); } static void __exit bictcp_unregister(void) { tcp_unregister_congestion_control(&bictcp); } module_init(bictcp_register); module_exit(bictcp_unregister); MODULE_AUTHOR("Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("BIC TCP"); |
1 1 1 1 1 2 2 7 7 4 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 | // SPDX-License-Identifier: GPL-2.0 /* * thermal.c - Generic Thermal Management Sysfs support. * * Copyright (C) 2008 Intel Corp * Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com> * Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/kdev_t.h> #include <linux/idr.h> #include <linux/list_sort.h> #include <linux/thermal.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/of.h> #include <linux/suspend.h> #define CREATE_TRACE_POINTS #include "thermal_trace.h" #include "thermal_core.h" #include "thermal_hwmon.h" static DEFINE_IDA(thermal_tz_ida); static DEFINE_IDA(thermal_cdev_ida); static LIST_HEAD(thermal_tz_list); static LIST_HEAD(thermal_cdev_list); static LIST_HEAD(thermal_governor_list); static DEFINE_MUTEX(thermal_list_lock); static DEFINE_MUTEX(thermal_governor_lock); static struct thermal_governor *def_governor; /* * Governor section: set of functions to handle thermal governors * * Functions to help in the life cycle of thermal governors within * the thermal core and by the thermal governor code. */ static struct thermal_governor *__find_governor(const char *name) { struct thermal_governor *pos; if (!name || !name[0]) return def_governor; list_for_each_entry(pos, &thermal_governor_list, governor_list) if (!strncasecmp(name, pos->name, THERMAL_NAME_LENGTH)) return pos; return NULL; } /** * bind_previous_governor() - bind the previous governor of the thermal zone * @tz: a valid pointer to a struct thermal_zone_device * @failed_gov_name: the name of the governor that failed to register * * Register the previous governor of the thermal zone after a new * governor has failed to be bound. */ static void bind_previous_governor(struct thermal_zone_device *tz, const char *failed_gov_name) { if (tz->governor && tz->governor->bind_to_tz) { if (tz->governor->bind_to_tz(tz)) { dev_err(&tz->device, "governor %s failed to bind and the previous one (%s) failed to bind again, thermal zone %s has no governor\n", failed_gov_name, tz->governor->name, tz->type); tz->governor = NULL; } } } /** * thermal_set_governor() - Switch to another governor * @tz: a valid pointer to a struct thermal_zone_device * @new_gov: pointer to the new governor * * Change the governor of thermal zone @tz. * * Return: 0 on success, an error if the new governor's bind_to_tz() failed. */ static int thermal_set_governor(struct thermal_zone_device *tz, struct thermal_governor *new_gov) { int ret = 0; if (tz->governor && tz->governor->unbind_from_tz) tz->governor->unbind_from_tz(tz); if (new_gov && new_gov->bind_to_tz) { ret = new_gov->bind_to_tz(tz); if (ret) { bind_previous_governor(tz, new_gov->name); return ret; } } tz->governor = new_gov; return ret; } int thermal_register_governor(struct thermal_governor *governor) { int err; const char *name; struct thermal_zone_device *pos; if (!governor) return -EINVAL; mutex_lock(&thermal_governor_lock); err = -EBUSY; if (!__find_governor(governor->name)) { bool match_default; err = 0; list_add(&governor->governor_list, &thermal_governor_list); match_default = !strncmp(governor->name, DEFAULT_THERMAL_GOVERNOR, THERMAL_NAME_LENGTH); if (!def_governor && match_default) def_governor = governor; } mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) { /* * only thermal zones with specified tz->tzp->governor_name * may run with tz->govenor unset */ if (pos->governor) continue; name = pos->tzp->governor_name; if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH)) { int ret; ret = thermal_set_governor(pos, governor); if (ret) dev_err(&pos->device, "Failed to set governor %s for thermal zone %s: %d\n", governor->name, pos->type, ret); } } mutex_unlock(&thermal_list_lock); mutex_unlock(&thermal_governor_lock); return err; } void thermal_unregister_governor(struct thermal_governor *governor) { struct thermal_zone_device *pos; if (!governor) return; mutex_lock(&thermal_governor_lock); if (!__find_governor(governor->name)) goto exit; mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) { if (!strncasecmp(pos->governor->name, governor->name, THERMAL_NAME_LENGTH)) thermal_set_governor(pos, NULL); } mutex_unlock(&thermal_list_lock); list_del(&governor->governor_list); exit: mutex_unlock(&thermal_governor_lock); } int thermal_zone_device_set_policy(struct thermal_zone_device *tz, char *policy) { struct thermal_governor *gov; int ret = -EINVAL; mutex_lock(&thermal_governor_lock); mutex_lock(&tz->lock); gov = __find_governor(strim(policy)); if (!gov) goto exit; ret = thermal_set_governor(tz, gov); exit: mutex_unlock(&tz->lock); mutex_unlock(&thermal_governor_lock); thermal_notify_tz_gov_change(tz, policy); return ret; } int thermal_build_list_of_policies(char *buf) { struct thermal_governor *pos; ssize_t count = 0; mutex_lock(&thermal_governor_lock); list_for_each_entry(pos, &thermal_governor_list, governor_list) { count += sysfs_emit_at(buf, count, "%s ", pos->name); } count += sysfs_emit_at(buf, count, "\n"); mutex_unlock(&thermal_governor_lock); return count; } static void __init thermal_unregister_governors(void) { struct thermal_governor **governor; for_each_governor_table(governor) thermal_unregister_governor(*governor); } static int __init thermal_register_governors(void) { int ret = 0; struct thermal_governor **governor; for_each_governor_table(governor) { ret = thermal_register_governor(*governor); if (ret) { pr_err("Failed to register governor: '%s'", (*governor)->name); break; } pr_info("Registered thermal governor '%s'", (*governor)->name); } if (ret) { struct thermal_governor **gov; for_each_governor_table(gov) { if (gov == governor) break; thermal_unregister_governor(*gov); } } return ret; } /* * Zone update section: main control loop applied to each zone while monitoring * in polling mode. The monitoring is done using a workqueue. * Same update may be done on a zone by calling thermal_zone_device_update(). * * An update means: * - Non-critical trips will invoke the governor responsible for that zone; * - Hot trips will produce a notification to userspace; * - Critical trip point will cause a system shutdown. */ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz, unsigned long delay) { if (delay) mod_delayed_work(system_freezable_power_efficient_wq, &tz->poll_queue, delay); else cancel_delayed_work(&tz->poll_queue); } static void monitor_thermal_zone(struct thermal_zone_device *tz) { if (tz->mode != THERMAL_DEVICE_ENABLED) thermal_zone_device_set_polling(tz, 0); else if (tz->passive > 0) thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies); else if (tz->polling_delay_jiffies) thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies); } static struct thermal_governor *thermal_get_tz_governor(struct thermal_zone_device *tz) { if (tz->governor) return tz->governor; return def_governor; } void thermal_governor_update_tz(struct thermal_zone_device *tz, enum thermal_notify_event reason) { if (!tz->governor || !tz->governor->update_tz) return; tz->governor->update_tz(tz, reason); } static void thermal_zone_device_halt(struct thermal_zone_device *tz, bool shutdown) { /* * poweroff_delay_ms must be a carefully profiled positive value. * Its a must for forced_emergency_poweroff_work to be scheduled. */ int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS; const char *msg = "Temperature too high"; dev_emerg(&tz->device, "%s: critical temperature reached\n", tz->type); if (shutdown) hw_protection_shutdown(msg, poweroff_delay_ms); else hw_protection_reboot(msg, poweroff_delay_ms); } void thermal_zone_device_critical(struct thermal_zone_device *tz) { thermal_zone_device_halt(tz, true); } EXPORT_SYMBOL(thermal_zone_device_critical); void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz) { thermal_zone_device_halt(tz, false); } static void handle_critical_trips(struct thermal_zone_device *tz, const struct thermal_trip *trip) { trace_thermal_zone_trip(tz, thermal_zone_trip_id(tz, trip), trip->type); if (trip->type == THERMAL_TRIP_CRITICAL) tz->ops.critical(tz); else if (tz->ops.hot) tz->ops.hot(tz); } static void handle_thermal_trip(struct thermal_zone_device *tz, struct thermal_trip_desc *td, struct list_head *way_up_list, struct list_head *way_down_list) { const struct thermal_trip *trip = &td->trip; int old_threshold; if (trip->temperature == THERMAL_TEMP_INVALID) return; /* * If the trip temperature or hysteresis has been updated recently, * the threshold needs to be computed again using the new values. * However, its initial value still reflects the old ones and that * is what needs to be compared with the previous zone temperature * to decide which action to take. */ old_threshold = td->threshold; td->threshold = trip->temperature; if (tz->last_temperature >= old_threshold && tz->last_temperature != THERMAL_TEMP_INVALID) { /* * Mitigation is under way, so it needs to stop if the zone * temperature falls below the low temperature of the trip. * In that case, the trip temperature becomes the new threshold. */ if (tz->temperature < trip->temperature - trip->hysteresis) { list_add(&td->notify_list_node, way_down_list); td->notify_temp = trip->temperature - trip->hysteresis; if (trip->type == THERMAL_TRIP_PASSIVE) { tz->passive--; WARN_ON(tz->passive < 0); } } else { td->threshold -= trip->hysteresis; } } else if (tz->temperature >= trip->temperature) { /* * There is no mitigation under way, so it needs to be started * if the zone temperature exceeds the trip one. The new * threshold is then set to the low temperature of the trip. */ list_add_tail(&td->notify_list_node, way_up_list); td->notify_temp = trip->temperature; td->threshold -= trip->hysteresis; if (trip->type == THERMAL_TRIP_PASSIVE) tz->passive++; else if (trip->type == THERMAL_TRIP_CRITICAL || trip->type == THERMAL_TRIP_HOT) handle_critical_trips(tz, trip); } } static void update_temperature(struct thermal_zone_device *tz) { int temp, ret; ret = __thermal_zone_get_temp(tz, &temp); if (ret) { if (ret != -EAGAIN) dev_warn(&tz->device, "failed to read out thermal zone (%d)\n", ret); return; } tz->last_temperature = tz->temperature; tz->temperature = temp; trace_thermal_temperature(tz); thermal_genl_sampling_temp(tz->id, temp); } static void thermal_zone_device_check(struct work_struct *work) { struct thermal_zone_device *tz = container_of(work, struct thermal_zone_device, poll_queue.work); thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); } static void thermal_zone_device_init(struct thermal_zone_device *tz) { struct thermal_instance *pos; INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_check); tz->temperature = THERMAL_TEMP_INVALID; tz->passive = 0; tz->prev_low_trip = -INT_MAX; tz->prev_high_trip = INT_MAX; list_for_each_entry(pos, &tz->thermal_instances, tz_node) pos->initialized = false; } static void thermal_governor_trip_crossed(struct thermal_governor *governor, struct thermal_zone_device *tz, const struct thermal_trip *trip, bool crossed_up) { if (governor->trip_crossed) governor->trip_crossed(tz, trip, crossed_up); } static int thermal_trip_notify_cmp(void *ascending, const struct list_head *a, const struct list_head *b) { struct thermal_trip_desc *tda = container_of(a, struct thermal_trip_desc, notify_list_node); struct thermal_trip_desc *tdb = container_of(b, struct thermal_trip_desc, notify_list_node); int ret = tdb->notify_temp - tda->notify_temp; return ascending ? ret : -ret; } void __thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { struct thermal_governor *governor = thermal_get_tz_governor(tz); struct thermal_trip_desc *td; LIST_HEAD(way_down_list); LIST_HEAD(way_up_list); if (tz->suspended) return; if (!thermal_zone_device_is_enabled(tz)) return; update_temperature(tz); if (tz->temperature == THERMAL_TEMP_INVALID) return; __thermal_zone_set_trips(tz); tz->notify_event = event; for_each_trip_desc(tz, td) handle_thermal_trip(tz, td, &way_up_list, &way_down_list); list_sort(&way_up_list, &way_up_list, thermal_trip_notify_cmp); list_for_each_entry(td, &way_up_list, notify_list_node) { thermal_notify_tz_trip_up(tz, &td->trip); thermal_debug_tz_trip_up(tz, &td->trip); thermal_governor_trip_crossed(governor, tz, &td->trip, true); } list_sort(NULL, &way_down_list, thermal_trip_notify_cmp); list_for_each_entry(td, &way_down_list, notify_list_node) { thermal_notify_tz_trip_down(tz, &td->trip); thermal_debug_tz_trip_down(tz, &td->trip); thermal_governor_trip_crossed(governor, tz, &td->trip, false); } if (governor->manage) governor->manage(tz); thermal_debug_update_trip_stats(tz); monitor_thermal_zone(tz); } static int thermal_zone_device_set_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) { int ret = 0; mutex_lock(&tz->lock); /* do nothing if mode isn't changing */ if (mode == tz->mode) { mutex_unlock(&tz->lock); return ret; } if (tz->ops.change_mode) ret = tz->ops.change_mode(tz, mode); if (!ret) tz->mode = mode; __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); mutex_unlock(&tz->lock); if (mode == THERMAL_DEVICE_ENABLED) thermal_notify_tz_enable(tz); else thermal_notify_tz_disable(tz); return ret; } int thermal_zone_device_enable(struct thermal_zone_device *tz) { return thermal_zone_device_set_mode(tz, THERMAL_DEVICE_ENABLED); } EXPORT_SYMBOL_GPL(thermal_zone_device_enable); int thermal_zone_device_disable(struct thermal_zone_device *tz) { return thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED); } EXPORT_SYMBOL_GPL(thermal_zone_device_disable); int thermal_zone_device_is_enabled(struct thermal_zone_device *tz) { lockdep_assert_held(&tz->lock); return tz->mode == THERMAL_DEVICE_ENABLED; } static bool thermal_zone_is_present(struct thermal_zone_device *tz) { return !list_empty(&tz->node); } void thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { mutex_lock(&tz->lock); if (thermal_zone_is_present(tz)) __thermal_zone_device_update(tz, event); mutex_unlock(&tz->lock); } EXPORT_SYMBOL_GPL(thermal_zone_device_update); int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *), void *data) { struct thermal_governor *gov; int ret = 0; mutex_lock(&thermal_governor_lock); list_for_each_entry(gov, &thermal_governor_list, governor_list) { ret = cb(gov, data); if (ret) break; } mutex_unlock(&thermal_governor_lock); return ret; } int for_each_thermal_cooling_device(int (*cb)(struct thermal_cooling_device *, void *), void *data) { struct thermal_cooling_device *cdev; int ret = 0; mutex_lock(&thermal_list_lock); list_for_each_entry(cdev, &thermal_cdev_list, node) { ret = cb(cdev, data); if (ret) break; } mutex_unlock(&thermal_list_lock); return ret; } int for_each_thermal_zone(int (*cb)(struct thermal_zone_device *, void *), void *data) { struct thermal_zone_device *tz; int ret = 0; mutex_lock(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { ret = cb(tz, data); if (ret) break; } mutex_unlock(&thermal_list_lock); return ret; } struct thermal_zone_device *thermal_zone_get_by_id(int id) { struct thermal_zone_device *tz, *match = NULL; mutex_lock(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { if (tz->id == id) { match = tz; break; } } mutex_unlock(&thermal_list_lock); return match; } /* * Device management section: cooling devices, zones devices, and binding * * Set of functions provided by the thermal core for: * - cooling devices lifecycle: registration, unregistration, * binding, and unbinding. * - thermal zone devices lifecycle: registration, unregistration, * binding, and unbinding. */ /** * thermal_bind_cdev_to_trip - bind a cooling device to a thermal zone * @tz: pointer to struct thermal_zone_device * @trip: trip point the cooling devices is associated with in this zone. * @cdev: pointer to struct thermal_cooling_device * @upper: the Maximum cooling state for this trip point. * THERMAL_NO_LIMIT means no upper limit, * and the cooling device can be in max_state. * @lower: the Minimum cooling state can be used for this trip point. * THERMAL_NO_LIMIT means no lower limit, * and the cooling device can be in cooling state 0. * @weight: The weight of the cooling device to be bound to the * thermal zone. Use THERMAL_WEIGHT_DEFAULT for the * default value * * This interface function bind a thermal cooling device to the certain trip * point of a thermal zone device. * This function is usually called in the thermal zone device .bind callback. * * Return: 0 on success, the proper error value otherwise. */ int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz, const struct thermal_trip *trip, struct thermal_cooling_device *cdev, unsigned long upper, unsigned long lower, unsigned int weight) { struct thermal_instance *dev; struct thermal_instance *pos; struct thermal_zone_device *pos1; struct thermal_cooling_device *pos2; bool upper_no_limit; int result; list_for_each_entry(pos1, &thermal_tz_list, node) { if (pos1 == tz) break; } list_for_each_entry(pos2, &thermal_cdev_list, node) { if (pos2 == cdev) break; } if (tz != pos1 || cdev != pos2) return -EINVAL; /* lower default 0, upper default max_state */ lower = lower == THERMAL_NO_LIMIT ? 0 : lower; if (upper == THERMAL_NO_LIMIT) { upper = cdev->max_state; upper_no_limit = true; } else { upper_no_limit = false; } if (lower > upper || upper > cdev->max_state) return -EINVAL; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; dev->tz = tz; dev->cdev = cdev; dev->trip = trip; dev->upper = upper; dev->upper_no_limit = upper_no_limit; dev->lower = lower; dev->target = THERMAL_NO_TARGET; dev->weight = weight; result = ida_alloc(&tz->ida, GFP_KERNEL); if (result < 0) goto free_mem; dev->id = result; sprintf(dev->name, "cdev%d", dev->id); result = sysfs_create_link(&tz->device.kobj, &cdev->device.kobj, dev->name); if (result) goto release_ida; snprintf(dev->attr_name, sizeof(dev->attr_name), "cdev%d_trip_point", dev->id); sysfs_attr_init(&dev->attr.attr); dev->attr.attr.name = dev->attr_name; dev->attr.attr.mode = 0444; dev->attr.show = trip_point_show; result = device_create_file(&tz->device, &dev->attr); if (result) goto remove_symbol_link; snprintf(dev->weight_attr_name, sizeof(dev->weight_attr_name), "cdev%d_weight", dev->id); sysfs_attr_init(&dev->weight_attr.attr); dev->weight_attr.attr.name = dev->weight_attr_name; dev->weight_attr.attr.mode = S_IWUSR | S_IRUGO; dev->weight_attr.show = weight_show; dev->weight_attr.store = weight_store; result = device_create_file(&tz->device, &dev->weight_attr); if (result) goto remove_trip_file; mutex_lock(&tz->lock); mutex_lock(&cdev->lock); list_for_each_entry(pos, &tz->thermal_instances, tz_node) if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) { result = -EEXIST; break; } if (!result) { list_add_tail(&dev->tz_node, &tz->thermal_instances); list_add_tail(&dev->cdev_node, &cdev->thermal_instances); atomic_set(&tz->need_update, 1); thermal_governor_update_tz(tz, THERMAL_TZ_BIND_CDEV); } mutex_unlock(&cdev->lock); mutex_unlock(&tz->lock); if (!result) return 0; device_remove_file(&tz->device, &dev->weight_attr); remove_trip_file: device_remove_file(&tz->device, &dev->attr); remove_symbol_link: sysfs_remove_link(&tz->device.kobj, dev->name); release_ida: ida_free(&tz->ida, dev->id); free_mem: kfree(dev); return result; } EXPORT_SYMBOL_GPL(thermal_bind_cdev_to_trip); int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, int trip_index, struct thermal_cooling_device *cdev, unsigned long upper, unsigned long lower, unsigned int weight) { if (trip_index < 0 || trip_index >= tz->num_trips) return -EINVAL; return thermal_bind_cdev_to_trip(tz, &tz->trips[trip_index].trip, cdev, upper, lower, weight); } EXPORT_SYMBOL_GPL(thermal_zone_bind_cooling_device); /** * thermal_unbind_cdev_from_trip - unbind a cooling device from a thermal zone. * @tz: pointer to a struct thermal_zone_device. * @trip: trip point the cooling devices is associated with in this zone. * @cdev: pointer to a struct thermal_cooling_device. * * This interface function unbind a thermal cooling device from the certain * trip point of a thermal zone device. * This function is usually called in the thermal zone device .unbind callback. * * Return: 0 on success, the proper error value otherwise. */ int thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz, const struct thermal_trip *trip, struct thermal_cooling_device *cdev) { struct thermal_instance *pos, *next; mutex_lock(&tz->lock); mutex_lock(&cdev->lock); list_for_each_entry_safe(pos, next, &tz->thermal_instances, tz_node) { if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) { list_del(&pos->tz_node); list_del(&pos->cdev_node); thermal_governor_update_tz(tz, THERMAL_TZ_UNBIND_CDEV); mutex_unlock(&cdev->lock); mutex_unlock(&tz->lock); goto unbind; } } mutex_unlock(&cdev->lock); mutex_unlock(&tz->lock); return -ENODEV; unbind: device_remove_file(&tz->device, &pos->weight_attr); device_remove_file(&tz->device, &pos->attr); sysfs_remove_link(&tz->device.kobj, pos->name); ida_free(&tz->ida, pos->id); kfree(pos); return 0; } EXPORT_SYMBOL_GPL(thermal_unbind_cdev_from_trip); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz, int trip_index, struct thermal_cooling_device *cdev) { if (trip_index < 0 || trip_index >= tz->num_trips) return -EINVAL; return thermal_unbind_cdev_from_trip(tz, &tz->trips[trip_index].trip, cdev); } EXPORT_SYMBOL_GPL(thermal_zone_unbind_cooling_device); static void thermal_release(struct device *dev) { struct thermal_zone_device *tz; struct thermal_cooling_device *cdev; if (!strncmp(dev_name(dev), "thermal_zone", sizeof("thermal_zone") - 1)) { tz = to_thermal_zone(dev); thermal_zone_destroy_device_groups(tz); mutex_destroy(&tz->lock); complete(&tz->removal); } else if (!strncmp(dev_name(dev), "cooling_device", sizeof("cooling_device") - 1)) { cdev = to_cooling_device(dev); thermal_cooling_device_destroy_sysfs(cdev); kfree_const(cdev->type); ida_free(&thermal_cdev_ida, cdev->id); kfree(cdev); } } static struct class *thermal_class; static inline void print_bind_err_msg(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev, int ret) { dev_err(&tz->device, "binding zone %s with cdev %s failed:%d\n", tz->type, cdev->type, ret); } static void bind_cdev(struct thermal_cooling_device *cdev) { int ret; struct thermal_zone_device *pos = NULL; list_for_each_entry(pos, &thermal_tz_list, node) { if (pos->ops.bind) { ret = pos->ops.bind(pos, cdev); if (ret) print_bind_err_msg(pos, cdev, ret); } } } /** * __thermal_cooling_device_register() - register a new thermal cooling device * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * It also gives the opportunity to link the cooling device to a device tree * node, so that it can be bound to a thermal zone created out of device tree. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ static struct thermal_cooling_device * __thermal_cooling_device_register(struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device *cdev; struct thermal_zone_device *pos = NULL; unsigned long current_state; int id, ret; if (!ops || !ops->get_max_state || !ops->get_cur_state || !ops->set_cur_state) return ERR_PTR(-EINVAL); if (!thermal_class) return ERR_PTR(-ENODEV); cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) return ERR_PTR(-ENOMEM); ret = ida_alloc(&thermal_cdev_ida, GFP_KERNEL); if (ret < 0) goto out_kfree_cdev; cdev->id = ret; id = ret; cdev->type = kstrdup_const(type ? type : "", GFP_KERNEL); if (!cdev->type) { ret = -ENOMEM; goto out_ida_remove; } mutex_init(&cdev->lock); INIT_LIST_HEAD(&cdev->thermal_instances); cdev->np = np; cdev->ops = ops; cdev->updated = false; cdev->device.class = thermal_class; cdev->devdata = devdata; ret = cdev->ops->get_max_state(cdev, &cdev->max_state); if (ret) goto out_cdev_type; ret = cdev->ops->get_cur_state(cdev, ¤t_state); if (ret) goto out_cdev_type; thermal_cooling_device_setup_sysfs(cdev); ret = dev_set_name(&cdev->device, "cooling_device%d", cdev->id); if (ret) goto out_cooling_dev; ret = device_register(&cdev->device); if (ret) { /* thermal_release() handles rest of the cleanup */ put_device(&cdev->device); return ERR_PTR(ret); } thermal_debug_cdev_add(cdev, current_state); /* Add 'this' new cdev to the global cdev list */ mutex_lock(&thermal_list_lock); list_add(&cdev->node, &thermal_cdev_list); /* Update binding information for 'this' new cdev */ bind_cdev(cdev); list_for_each_entry(pos, &thermal_tz_list, node) if (atomic_cmpxchg(&pos->need_update, 1, 0)) thermal_zone_device_update(pos, THERMAL_EVENT_UNSPECIFIED); mutex_unlock(&thermal_list_lock); return cdev; out_cooling_dev: thermal_cooling_device_destroy_sysfs(cdev); out_cdev_type: kfree_const(cdev->type); out_ida_remove: ida_free(&thermal_cdev_ida, id); out_kfree_cdev: kfree(cdev); return ERR_PTR(ret); } /** * thermal_cooling_device_register() - register a new thermal cooling device * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return __thermal_cooling_device_register(NULL, type, devdata, ops); } EXPORT_SYMBOL_GPL(thermal_cooling_device_register); /** * thermal_of_cooling_device_register() - register an OF thermal cooling device * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This function will register a cooling device with device tree node reference. * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * thermal_of_cooling_device_register(struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return __thermal_cooling_device_register(np, type, devdata, ops); } EXPORT_SYMBOL_GPL(thermal_of_cooling_device_register); static void thermal_cooling_device_release(struct device *dev, void *res) { thermal_cooling_device_unregister( *(struct thermal_cooling_device **)res); } /** * devm_thermal_of_cooling_device_register() - register an OF thermal cooling * device * @dev: a valid struct device pointer of a sensor device. * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This function will register a cooling device with device tree node reference. * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, struct device_node *np, char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device **ptr, *tcd; ptr = devres_alloc(thermal_cooling_device_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); tcd = __thermal_cooling_device_register(np, type, devdata, ops); if (IS_ERR(tcd)) { devres_free(ptr); return tcd; } *ptr = tcd; devres_add(dev, ptr); return tcd; } EXPORT_SYMBOL_GPL(devm_thermal_of_cooling_device_register); static bool thermal_cooling_device_present(struct thermal_cooling_device *cdev) { struct thermal_cooling_device *pos = NULL; list_for_each_entry(pos, &thermal_cdev_list, node) { if (pos == cdev) return true; } return false; } /** * thermal_cooling_device_update - Update a cooling device object * @cdev: Target cooling device. * * Update @cdev to reflect a change of the underlying hardware or platform. * * Must be called when the maximum cooling state of @cdev becomes invalid and so * its .get_max_state() callback needs to be run to produce the new maximum * cooling state value. */ void thermal_cooling_device_update(struct thermal_cooling_device *cdev) { struct thermal_instance *ti; unsigned long state; if (IS_ERR_OR_NULL(cdev)) return; /* * Hold thermal_list_lock throughout the update to prevent the device * from going away while being updated. */ mutex_lock(&thermal_list_lock); if (!thermal_cooling_device_present(cdev)) goto unlock_list; /* * Update under the cdev lock to prevent the state from being set beyond * the new limit concurrently. */ mutex_lock(&cdev->lock); if (cdev->ops->get_max_state(cdev, &cdev->max_state)) goto unlock; thermal_cooling_device_stats_reinit(cdev); list_for_each_entry(ti, &cdev->thermal_instances, cdev_node) { if (ti->upper == cdev->max_state) continue; if (ti->upper < cdev->max_state) { if (ti->upper_no_limit) ti->upper = cdev->max_state; continue; } ti->upper = cdev->max_state; if (ti->lower > ti->upper) ti->lower = ti->upper; if (ti->target == THERMAL_NO_TARGET) continue; if (ti->target > ti->upper) ti->target = ti->upper; } if (cdev->ops->get_cur_state(cdev, &state) || state > cdev->max_state) goto unlock; thermal_cooling_device_stats_update(cdev, state); unlock: mutex_unlock(&cdev->lock); unlock_list: mutex_unlock(&thermal_list_lock); } EXPORT_SYMBOL_GPL(thermal_cooling_device_update); /** * thermal_cooling_device_unregister - removes a thermal cooling device * @cdev: the thermal cooling device to remove. * * thermal_cooling_device_unregister() must be called when a registered * thermal cooling device is no longer needed. */ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev) { struct thermal_zone_device *tz; if (!cdev) return; thermal_debug_cdev_remove(cdev); mutex_lock(&thermal_list_lock); if (!thermal_cooling_device_present(cdev)) { mutex_unlock(&thermal_list_lock); return; } list_del(&cdev->node); /* Unbind all thermal zones associated with 'this' cdev */ list_for_each_entry(tz, &thermal_tz_list, node) { if (tz->ops.unbind) tz->ops.unbind(tz, cdev); } mutex_unlock(&thermal_list_lock); device_unregister(&cdev->device); } EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister); static void bind_tz(struct thermal_zone_device *tz) { int ret; struct thermal_cooling_device *pos = NULL; if (!tz->ops.bind) return; mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_cdev_list, node) { ret = tz->ops.bind(tz, pos); if (ret) print_bind_err_msg(tz, pos, ret); } mutex_unlock(&thermal_list_lock); } static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms) { *delay_jiffies = msecs_to_jiffies(delay_ms); if (delay_ms > 1000) *delay_jiffies = round_jiffies(*delay_jiffies); } int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp) { const struct thermal_trip_desc *td; int ret = -EINVAL; if (tz->ops.get_crit_temp) return tz->ops.get_crit_temp(tz, temp); mutex_lock(&tz->lock); for_each_trip_desc(tz, td) { const struct thermal_trip *trip = &td->trip; if (trip->type == THERMAL_TRIP_CRITICAL) { *temp = trip->temperature; ret = 0; break; } } mutex_unlock(&tz->lock); return ret; } EXPORT_SYMBOL_GPL(thermal_zone_get_crit_temp); /** * thermal_zone_device_register_with_trips() - register a new thermal zone device * @type: the thermal zone device type * @trips: a pointer to an array of thermal trips * @num_trips: the number of trip points the thermal zone support * @devdata: private device data * @ops: standard thermal zone device callbacks * @tzp: thermal zone platform parameters * @passive_delay: number of milliseconds to wait between polls when * performing passive cooling * @polling_delay: number of milliseconds to wait between polls when checking * whether trip points have been crossed (0 for interrupt * driven systems) * * This interface function adds a new thermal zone device (sensor) to * /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the * thermal cooling devices registered at the same time. * thermal_zone_device_unregister() must be called when the device is no * longer needed. The passive cooling depends on the .get_trend() return value. * * Return: a pointer to the created struct thermal_zone_device or an * in case of error, an ERR_PTR. Caller must check return value with * IS_ERR*() helpers. */ struct thermal_zone_device * thermal_zone_device_register_with_trips(const char *type, const struct thermal_trip *trips, int num_trips, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp, int passive_delay, int polling_delay) { const struct thermal_trip *trip = trips; struct thermal_zone_device *tz; struct thermal_trip_desc *td; int id; int result; struct thermal_governor *governor; if (!type || strlen(type) == 0) { pr_err("No thermal zone type defined\n"); return ERR_PTR(-EINVAL); } if (strlen(type) >= THERMAL_NAME_LENGTH) { pr_err("Thermal zone name (%s) too long, should be under %d chars\n", type, THERMAL_NAME_LENGTH); return ERR_PTR(-EINVAL); } if (num_trips < 0) { pr_err("Incorrect number of thermal trips\n"); return ERR_PTR(-EINVAL); } if (!ops || !ops->get_temp) { pr_err("Thermal zone device ops not defined\n"); return ERR_PTR(-EINVAL); } if (num_trips > 0 && !trips) return ERR_PTR(-EINVAL); if (!thermal_class) return ERR_PTR(-ENODEV); tz = kzalloc(struct_size(tz, trips, num_trips), GFP_KERNEL); if (!tz) return ERR_PTR(-ENOMEM); if (tzp) { tz->tzp = kmemdup(tzp, sizeof(*tzp), GFP_KERNEL); if (!tz->tzp) { result = -ENOMEM; goto free_tz; } } INIT_LIST_HEAD(&tz->thermal_instances); INIT_LIST_HEAD(&tz->node); ida_init(&tz->ida); mutex_init(&tz->lock); init_completion(&tz->removal); id = ida_alloc(&thermal_tz_ida, GFP_KERNEL); if (id < 0) { result = id; goto free_tzp; } tz->id = id; strscpy(tz->type, type, sizeof(tz->type)); tz->ops = *ops; if (!tz->ops.critical) tz->ops.critical = thermal_zone_device_critical; tz->device.class = thermal_class; tz->devdata = devdata; tz->num_trips = num_trips; for_each_trip_desc(tz, td) td->trip = *trip++; thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay); thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay); /* sys I/F */ /* Add nodes that are always present via .groups */ result = thermal_zone_create_device_groups(tz); if (result) goto remove_id; /* A new thermal zone needs to be updated anyway. */ atomic_set(&tz->need_update, 1); result = dev_set_name(&tz->device, "thermal_zone%d", tz->id); if (result) { thermal_zone_destroy_device_groups(tz); goto remove_id; } result = device_register(&tz->device); if (result) goto release_device; /* Update 'this' zone's governor information */ mutex_lock(&thermal_governor_lock); if (tz->tzp) governor = __find_governor(tz->tzp->governor_name); else governor = def_governor; result = thermal_set_governor(tz, governor); if (result) { mutex_unlock(&thermal_governor_lock); goto unregister; } mutex_unlock(&thermal_governor_lock); if (!tz->tzp || !tz->tzp->no_hwmon) { result = thermal_add_hwmon_sysfs(tz); if (result) goto unregister; } mutex_lock(&thermal_list_lock); mutex_lock(&tz->lock); list_add_tail(&tz->node, &thermal_tz_list); mutex_unlock(&tz->lock); mutex_unlock(&thermal_list_lock); /* Bind cooling devices for this zone */ bind_tz(tz); thermal_zone_device_init(tz); /* Update the new thermal zone and mark it as already updated. */ if (atomic_cmpxchg(&tz->need_update, 1, 0)) thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); thermal_notify_tz_create(tz); thermal_debug_tz_add(tz); return tz; unregister: device_del(&tz->device); release_device: put_device(&tz->device); remove_id: ida_free(&thermal_tz_ida, id); free_tzp: kfree(tz->tzp); free_tz: kfree(tz); return ERR_PTR(result); } EXPORT_SYMBOL_GPL(thermal_zone_device_register_with_trips); struct thermal_zone_device *thermal_tripless_zone_device_register( const char *type, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp) { return thermal_zone_device_register_with_trips(type, NULL, 0, devdata, ops, tzp, 0, 0); } EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register); void *thermal_zone_device_priv(struct thermal_zone_device *tzd) { return tzd->devdata; } EXPORT_SYMBOL_GPL(thermal_zone_device_priv); const char *thermal_zone_device_type(struct thermal_zone_device *tzd) { return tzd->type; } EXPORT_SYMBOL_GPL(thermal_zone_device_type); int thermal_zone_device_id(struct thermal_zone_device *tzd) { return tzd->id; } EXPORT_SYMBOL_GPL(thermal_zone_device_id); struct device *thermal_zone_device(struct thermal_zone_device *tzd) { return &tzd->device; } EXPORT_SYMBOL_GPL(thermal_zone_device); /** * thermal_zone_device_unregister - removes the registered thermal zone device * @tz: the thermal zone device to remove */ void thermal_zone_device_unregister(struct thermal_zone_device *tz) { struct thermal_cooling_device *cdev; struct thermal_zone_device *pos = NULL; if (!tz) return; thermal_debug_tz_remove(tz); mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) if (pos == tz) break; if (pos != tz) { /* thermal zone device not found */ mutex_unlock(&thermal_list_lock); return; } mutex_lock(&tz->lock); list_del(&tz->node); mutex_unlock(&tz->lock); /* Unbind all cdevs associated with 'this' thermal zone */ list_for_each_entry(cdev, &thermal_cdev_list, node) if (tz->ops.unbind) tz->ops.unbind(tz, cdev); mutex_unlock(&thermal_list_lock); cancel_delayed_work_sync(&tz->poll_queue); thermal_set_governor(tz, NULL); thermal_remove_hwmon_sysfs(tz); ida_free(&thermal_tz_ida, tz->id); ida_destroy(&tz->ida); device_del(&tz->device); kfree(tz->tzp); put_device(&tz->device); thermal_notify_tz_delete(tz); wait_for_completion(&tz->removal); kfree(tz); } EXPORT_SYMBOL_GPL(thermal_zone_device_unregister); /** * thermal_zone_get_zone_by_name() - search for a zone and returns its ref * @name: thermal zone name to fetch the temperature * * When only one zone is found with the passed name, returns a reference to it. * * Return: On success returns a reference to an unique thermal zone with * matching name equals to @name, an ERR_PTR otherwise (-EINVAL for invalid * paramenters, -ENODEV for not found and -EEXIST for multiple matches). */ struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name) { struct thermal_zone_device *pos = NULL, *ref = ERR_PTR(-EINVAL); unsigned int found = 0; if (!name) goto exit; mutex_lock(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) if (!strncasecmp(name, pos->type, THERMAL_NAME_LENGTH)) { found++; ref = pos; } mutex_unlock(&thermal_list_lock); /* nothing has been found, thus an error code for it */ if (found == 0) ref = ERR_PTR(-ENODEV); else if (found > 1) /* Success only when an unique zone is found */ ref = ERR_PTR(-EEXIST); exit: return ref; } EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name); static void thermal_zone_device_resume(struct work_struct *work) { struct thermal_zone_device *tz; tz = container_of(work, struct thermal_zone_device, poll_queue.work); mutex_lock(&tz->lock); tz->suspended = false; thermal_zone_device_init(tz); __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); mutex_unlock(&tz->lock); } static int thermal_pm_notify(struct notifier_block *nb, unsigned long mode, void *_unused) { struct thermal_zone_device *tz; switch (mode) { case PM_HIBERNATION_PREPARE: case PM_RESTORE_PREPARE: case PM_SUSPEND_PREPARE: mutex_lock(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { mutex_lock(&tz->lock); tz->suspended = true; mutex_unlock(&tz->lock); } mutex_unlock(&thermal_list_lock); break; case PM_POST_HIBERNATION: case PM_POST_RESTORE: case PM_POST_SUSPEND: mutex_lock(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { mutex_lock(&tz->lock); cancel_delayed_work(&tz->poll_queue); /* * Replace the work function with the resume one, which * will restore the original work function and schedule * the polling work if needed. */ INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_resume); /* Queue up the work without a delay. */ mod_delayed_work(system_freezable_power_efficient_wq, &tz->poll_queue, 0); mutex_unlock(&tz->lock); } mutex_unlock(&thermal_list_lock); break; default: break; } return 0; } static struct notifier_block thermal_pm_nb = { .notifier_call = thermal_pm_notify, }; static int __init thermal_init(void) { int result; thermal_debug_init(); result = thermal_netlink_init(); if (result) goto error; result = thermal_register_governors(); if (result) goto unregister_netlink; thermal_class = kzalloc(sizeof(*thermal_class), GFP_KERNEL); if (!thermal_class) { result = -ENOMEM; goto unregister_governors; } thermal_class->name = "thermal"; thermal_class->dev_release = thermal_release; result = class_register(thermal_class); if (result) { kfree(thermal_class); thermal_class = NULL; goto unregister_governors; } result = register_pm_notifier(&thermal_pm_nb); if (result) pr_warn("Thermal: Can not register suspend notifier, return %d\n", result); return 0; unregister_governors: thermal_unregister_governors(); unregister_netlink: thermal_netlink_exit(); error: mutex_destroy(&thermal_list_lock); mutex_destroy(&thermal_governor_lock); return result; } postcore_initcall(thermal_init); |
4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. */ #include "rxe.h" #include "rxe_hw_counters.h" static const struct rdma_stat_desc rxe_counter_descs[] = { [RXE_CNT_SENT_PKTS].name = "sent_pkts", [RXE_CNT_RCVD_PKTS].name = "rcvd_pkts", [RXE_CNT_DUP_REQ].name = "duplicate_request", [RXE_CNT_OUT_OF_SEQ_REQ].name = "out_of_seq_request", [RXE_CNT_RCV_RNR].name = "rcvd_rnr_err", [RXE_CNT_SND_RNR].name = "send_rnr_err", [RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err", [RXE_CNT_COMPLETER_SCHED].name = "ack_deferred", [RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err", [RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err", [RXE_CNT_COMP_RETRY].name = "completer_retry_err", [RXE_CNT_SEND_ERR].name = "send_err", [RXE_CNT_LINK_DOWNED].name = "link_downed", [RXE_CNT_RDMA_SEND].name = "rdma_sends", [RXE_CNT_RDMA_RECV].name = "rdma_recvs", }; int rxe_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index) { struct rxe_dev *dev = to_rdev(ibdev); unsigned int cnt; if (!port || !stats) return -EINVAL; for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_descs); cnt++) stats->value[cnt] = atomic64_read(&dev->stats_counters[cnt]); return ARRAY_SIZE(rxe_counter_descs); } struct rdma_hw_stats *rxe_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) { BUILD_BUG_ON(ARRAY_SIZE(rxe_counter_descs) != RXE_NUM_OF_COUNTERS); return rdma_alloc_hw_stats_struct(rxe_counter_descs, ARRAY_SIZE(rxe_counter_descs), RDMA_HW_STATS_DEFAULT_LIFESPAN); } |
6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match connection tracking information. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/skbuff.h> #include <net/netfilter/nf_conntrack.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_state.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module"); MODULE_ALIAS("ipt_state"); MODULE_ALIAS("ip6t_state"); static bool state_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_state_info *sinfo = par->matchinfo; enum ip_conntrack_info ctinfo; unsigned int statebit; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (ct) statebit = XT_STATE_BIT(ctinfo); else if (ctinfo == IP_CT_UNTRACKED) statebit = XT_STATE_UNTRACKED; else statebit = XT_STATE_INVALID; return (sinfo->statemask & statebit); } static int state_mt_check(const struct xt_mtchk_param *par) { int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } static void state_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match state_mt_reg __read_mostly = { .name = "state", .family = NFPROTO_UNSPEC, .checkentry = state_mt_check, .match = state_mt, .destroy = state_mt_destroy, .matchsize = sizeof(struct xt_state_info), .me = THIS_MODULE, }; static int __init state_mt_init(void) { return xt_register_match(&state_mt_reg); } static void __exit state_mt_exit(void) { xt_unregister_match(&state_mt_reg); } module_init(state_mt_init); module_exit(state_mt_exit); |
1 1 1 41 13 34 35 35 24 24 14 12 9 3 12 1 12 12 37 32 12 12 41 41 41 41 37 17 17 2 35 35 36 19 15 4 19 22 6 7 20 5 11 11 22 22 22 12 19 21 22 13 13 12 11 15 15 15 1 1 36 36 36 32 6 34 36 36 23 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Red Hat. All rights reserved. */ #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/math64.h> #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> #include "ctree.h" #include "fs.h" #include "messages.h" #include "misc.h" #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" #include "extent_io.h" #include "space-info.h" #include "block-group.h" #include "discard.h" #include "subpage.h" #include "inode-item.h" #include "accessors.h" #include "file-item.h" #include "file.h" #include "super.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K #define FORCE_EXTENT_THRESHOLD SZ_1M static struct kmem_cache *btrfs_free_space_cachep; static struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_trim_range { u64 start; u64 bytes; struct list_head list; }; static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat); static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes, bool for_alloc); static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info); static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stats); static void btrfs_crc32c_final(u32 crc, u8 *result) { put_unaligned_le32(~crc, result); } static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; struct rb_node *node; while ((node = rb_last(&ctl->free_space_offset)) != NULL) { info = rb_entry(node, struct btrfs_free_space, offset_index); if (!info->bitmap) { unlink_free_space(ctl, info, true); kmem_cache_free(btrfs_free_space_cachep, info); } else { free_bitmap(ctl, info); } cond_resched_lock(&ctl->tree_lock); } } static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, u64 offset) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_key location; struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct inode *inode = NULL; unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ERR_PTR(ret); if (ret > 0) { btrfs_release_path(path); return ERR_PTR(-ENOENT); } leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); btrfs_free_space_key(leaf, header, &disk_key); btrfs_disk_key_to_cpu(&location, &disk_key); btrfs_release_path(path); /* * We are often under a trans handle at this point, so we need to make * sure NOFS is set to keep us from deadlocking. */ nofs_flag = memalloc_nofs_save(); inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) return inode; mapping_set_gfp_mask(inode->i_mapping, mapping_gfp_constraint(inode->i_mapping, ~(__GFP_FS | __GFP_HIGHMEM))); return inode; } struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct inode *inode = NULL; u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); if (block_group->inode) inode = igrab(block_group->inode); spin_unlock(&block_group->lock); if (inode) return inode; inode = __lookup_free_space_inode(fs_info->tree_root, path, block_group->start); if (IS_ERR(inode)) return inode; spin_lock(&block_group->lock); if (!((BTRFS_I(inode)->flags & flags) == flags)) { btrfs_info(fs_info, "Old style space inode found, converting."); BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; } if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) block_group->inode = igrab(inode); spin_unlock(&block_group->lock); return inode; } static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 ino, u64 offset) { struct btrfs_key key; struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; /* We inline CRCs for the free disk space cache */ const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); btrfs_item_key(leaf, &disk_key, path->slots[0]); memzero_extent_buffer(leaf, (unsigned long)inode_item, sizeof(*inode_item)); btrfs_set_inode_generation(leaf, inode_item, trans->transid); btrfs_set_inode_size(leaf, inode_item, 0); btrfs_set_inode_nbytes(leaf, inode_item, 0); btrfs_set_inode_uid(leaf, inode_item, 0); btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, inode_item, flags); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { btrfs_release_path(path); return ret; } leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; } int create_free_space_inode(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { int ret; u64 ino; ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; return __create_free_space_inode(trans->fs_info->tree_root, trans, path, ino, block_group->start); } /* * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode * handles lookup, otherwise it takes ownership and iputs the inode. * Don't reuse an inode pointer after passing it into this function. */ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group) { struct btrfs_path *path; struct btrfs_key key; int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (!inode) inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) { if (PTR_ERR(inode) != -ENOENT) ret = PTR_ERR(inode); goto out; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(BTRFS_I(inode)); goto out; } clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { block_group->inode = NULL; spin_unlock(&block_group->lock); iput(inode); } else { spin_unlock(&block_group->lock); } /* One for the lookup ref */ btrfs_add_delayed_iput(BTRFS_I(inode)); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; key.offset = block_group->start; ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, -1, 1); if (ret) { if (ret > 0) ret = 0; goto out; } ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); out: btrfs_free_path(path); return ret; } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct inode *vfs_inode) { struct btrfs_truncate_control control = { .inode = BTRFS_I(vfs_inode), .new_size = 0, .ino = btrfs_ino(BTRFS_I(vfs_inode)), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, }; struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_root *root = inode->root; struct extent_state *cached_state = NULL; int ret = 0; bool locked = false; if (block_group) { struct btrfs_path *path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; } locked = true; mutex_lock(&trans->transaction->cache_write_mutex); if (!list_empty(&block_group->io_list)) { list_del_init(&block_group->io_list); btrfs_wait_cache_io(trans, block_group, path); btrfs_put_block_group(block_group); } /* * now that we've truncated the cache away, its no longer * setup or written */ spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); btrfs_free_path(path); } btrfs_i_size_write(inode, 0); truncate_pagecache(vfs_inode, 0); lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* * We skip the throttling logic for free space cache inodes, so we don't * need to check for -EAGAIN. */ ret = btrfs_truncate_inode_items(trans, root, &control); inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; ret = btrfs_update_inode(trans, inode); fail: if (locked) mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) btrfs_abort_transaction(trans, ret); return ret; } static void readahead_cache(struct inode *inode) { struct file_ra_state ra; unsigned long last_index; file_ra_state_init(&ra, inode->i_mapping); last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); } static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, int write) { int num_pages; num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); /* Make sure we can fit our crcs and generation into the first page */ if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) return -ENOSPC; memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); if (!io_ctl->pages) return -ENOMEM; io_ctl->num_pages = num_pages; io_ctl->fs_info = inode_to_fs_info(inode); io_ctl->inode = inode; return 0; } ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { kfree(io_ctl->pages); io_ctl->pages = NULL; } static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) { if (io_ctl->cur) { io_ctl->cur = NULL; io_ctl->orig = NULL; } } static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) { ASSERT(io_ctl->index < io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; io_ctl->cur = page_address(io_ctl->page); io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_SIZE; if (clear) clear_page(io_ctl->cur); } static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) { int i; io_ctl_unmap_page(io_ctl); for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) { btrfs_folio_clear_checked(io_ctl->fs_info, page_folio(io_ctl->pages[i]), page_offset(io_ctl->pages[i]), PAGE_SIZE); unlock_page(io_ctl->pages[i]); put_page(io_ctl->pages[i]); } } } static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { struct page *page; struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; for (i = 0; i < io_ctl->num_pages; i++) { int ret; page = find_or_create_page(inode->i_mapping, i, mask); if (!page) { io_ctl_drop_pages(io_ctl); return -ENOMEM; } ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); put_page(page); io_ctl_drop_pages(io_ctl); return ret; } io_ctl->pages[i] = page; if (uptodate && !PageUptodate(page)) { btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, "free space cache page truncated"); io_ctl_drop_pages(io_ctl); return -EIO; } if (!PageUptodate(page)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); io_ctl_drop_pages(io_ctl); return -EIO; } } } for (i = 0; i < io_ctl->num_pages; i++) clear_page_dirty_for_io(io_ctl->pages[i]); return 0; } static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { io_ctl_map_page(io_ctl, 1); /* * Skip the csum areas. If we don't check crcs then we just have a * 64bit chunk at the front of the first page. */ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { u64 cache_gen; /* * Skip the crc area. If we don't check crcs then we just have a 64bit * chunk at the front of the first page. */ io_ctl->cur += sizeof(u32) * io_ctl->num_pages; io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); cache_gen = get_unaligned_le64(io_ctl->cur); if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } io_ctl->cur += sizeof(u64); return 0; } static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp; u32 crc = ~(u32)0; unsigned offset = 0; if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); tmp += index; *tmp = crc; } static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp, val; u32 crc = ~(u32)0; unsigned offset = 0; if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; tmp = page_address(io_ctl->pages[0]); tmp += index; val = *tmp; io_ctl_map_page(io_ctl, 0); crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, "csum mismatch on free space cache"); io_ctl_unmap_page(io_ctl); return -EIO; } return 0; } static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, void *bitmap) { struct btrfs_free_space_entry *entry; if (!io_ctl->cur) return -ENOSPC; entry = io_ctl->cur; put_unaligned_le64(offset, &entry->offset); put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) return 0; io_ctl_set_crc(io_ctl, io_ctl->index - 1); /* No more pages to map */ if (io_ctl->index >= io_ctl->num_pages) return 0; /* map the next page */ io_ctl_map_page(io_ctl, 1); return 0; } static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) { if (!io_ctl->cur) return -ENOSPC; /* * If we aren't at the start of the current page, unmap this one and * map the next one if there is any left. */ if (io_ctl->cur != io_ctl->orig) { io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index >= io_ctl->num_pages) return -ENOSPC; io_ctl_map_page(io_ctl, 0); } copy_page(io_ctl->cur, bitmap); io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index < io_ctl->num_pages) io_ctl_map_page(io_ctl, 0); return 0; } static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) { /* * If we're not on the boundary we know we've modified the page and we * need to crc the page. */ if (io_ctl->cur != io_ctl->orig) io_ctl_set_crc(io_ctl, io_ctl->index - 1); else io_ctl_unmap_page(io_ctl); while (io_ctl->index < io_ctl->num_pages) { io_ctl_map_page(io_ctl, 1); io_ctl_set_crc(io_ctl, io_ctl->index - 1); } } static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; int ret; if (!io_ctl->cur) { ret = io_ctl_check_crc(io_ctl, io_ctl->index); if (ret) return ret; } e = io_ctl->cur; entry->offset = get_unaligned_le64(&e->offset); entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) return 0; io_ctl_unmap_page(io_ctl); return 0; } static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry) { int ret; ret = io_ctl_check_crc(io_ctl, io_ctl->index); if (ret) return ret; copy_page(entry->bitmap, io_ctl->cur); io_ctl_unmap_page(io_ctl); return 0; } static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { struct btrfs_block_group *block_group = ctl->block_group; u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->length; u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); max_bitmaps = max_t(u64, max_bitmaps, 1); if (ctl->total_bitmaps > max_bitmaps) btrfs_err(block_group->fs_info, "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", block_group->start, block_group->length, ctl->total_bitmaps, ctl->unit, max_bitmaps, bytes_per_bg); ASSERT(ctl->total_bitmaps <= max_bitmaps); /* * We are trying to keep the total amount of memory used per 1GiB of * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of * bitmaps, we may end up using more memory than this. */ if (size < SZ_1G) max_bytes = MAX_CACHE_BYTES_PER_GIG; else max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); bitmap_bytes = ctl->total_bitmaps * ctl->unit; /* * we want the extent entry threshold to always be at most 1/2 the max * bytes we can have, or whatever is less than that. */ extent_bytes = max_bytes - bitmap_bytes; extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); ctl->extents_thresh = div_u64(extent_bytes, sizeof(struct btrfs_free_space)); } static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct btrfs_io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; LIST_HEAD(bitmaps); u64 num_entries; u64 num_bitmaps; u64 generation; u8 type; int ret = 0; /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return 0; else if (ret > 0) { btrfs_release_path(path); return 0; } ret = -1; leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); num_entries = btrfs_free_space_entries(leaf, header); num_bitmaps = btrfs_free_space_bitmaps(leaf, header); generation = btrfs_free_space_generation(leaf, header); btrfs_release_path(path); if (!BTRFS_I(inode)->generation) { btrfs_info(fs_info, "the free space cache file (%llu) is invalid, skip it", offset); return 0; } if (BTRFS_I(inode)->generation != generation) { btrfs_err(fs_info, "free space inode generation (%llu) did not match free space cache generation (%llu)", BTRFS_I(inode)->generation, generation); return 0; } if (!num_entries) return 0; ret = io_ctl_init(&io_ctl, inode, 0); if (ret) return ret; readahead_cache(inode); ret = io_ctl_prepare_pages(&io_ctl, true); if (ret) goto out; ret = io_ctl_check_crc(&io_ctl, 0); if (ret) goto free_cache; ret = io_ctl_check_generation(&io_ctl, generation); if (ret) goto free_cache; while (num_entries) { e = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!e) { ret = -ENOMEM; goto free_cache; } ret = io_ctl_read_entry(&io_ctl, e, &type); if (ret) { kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } if (!e->bytes) { ret = -1; kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } if (type == BTRFS_FREE_SPACE_EXTENT) { spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); spin_unlock(&ctl->tree_lock); if (ret) { btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } } else { ASSERT(num_bitmaps); num_bitmaps--; e->bitmap = kmem_cache_zalloc( btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { ret = -ENOMEM; kmem_cache_free( btrfs_free_space_cachep, e); goto free_cache; } spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); if (ret) { spin_unlock(&ctl->tree_lock); btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } ctl->total_bitmaps++; recalculate_thresholds(ctl); spin_unlock(&ctl->tree_lock); list_add_tail(&e->list, &bitmaps); } num_entries--; } io_ctl_unmap_page(&io_ctl); /* * We add the bitmaps at the end of the entries in order that * the bitmap entries are added to the cache. */ list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); ret = io_ctl_read_bitmap(&io_ctl, e); if (ret) goto free_cache; } io_ctl_drop_pages(&io_ctl); ret = 1; out: io_ctl_free(&io_ctl); return ret; free_cache: io_ctl_drop_pages(&io_ctl); spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache(ctl); spin_unlock(&ctl->tree_lock); goto out; } static int copy_free_space_cache(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; struct rb_node *n; int ret = 0; while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (!info->bitmap) { const u64 offset = info->offset; const u64 bytes = info->bytes; unlink_free_space(ctl, info, true); spin_unlock(&ctl->tree_lock); kmem_cache_free(btrfs_free_space_cachep, info); ret = btrfs_add_free_space(block_group, offset, bytes); spin_lock(&ctl->tree_lock); } else { u64 offset = info->offset; u64 bytes = ctl->unit; ret = search_bitmap(ctl, info, &offset, &bytes, false); if (ret == 0) { bitmap_clear_bits(ctl, info, offset, bytes, true); spin_unlock(&ctl->tree_lock); ret = btrfs_add_free_space(block_group, offset, bytes); spin_lock(&ctl->tree_lock); } else { free_bitmap(ctl, info); ret = 0; } } cond_resched_lock(&ctl->tree_lock); } return ret; } static struct lock_class_key btrfs_free_space_inode_key; int load_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space_ctl tmp_ctl = {}; struct inode *inode; struct btrfs_path *path; int ret = 0; bool matched; u64 used = block_group->used; /* * Because we could potentially discard our loaded free space, we want * to load everything into a temporary structure first, and then if it's * valid copy it all into the actual free space ctl. */ btrfs_init_free_space_ctl(block_group, &tmp_ctl); /* * If this block group has been marked to be cleared for one reason or * another then we can't trust the on disk cache, so just return. */ spin_lock(&block_group->lock); if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { spin_unlock(&block_group->lock); return 0; } spin_unlock(&block_group->lock); path = btrfs_alloc_path(); if (!path) return 0; path->search_commit_root = 1; path->skip_locking = 1; /* * We must pass a path with search_commit_root set to btrfs_iget in * order to avoid a deadlock when allocating extents for the tree root. * * When we are COWing an extent buffer from the tree root, when looking * for a free extent, at extent-tree.c:find_free_extent(), we can find * block group without its free space cache loaded. When we find one * we must load its space cache which requires reading its free space * cache's inode item from the root tree. If this inode item is located * in the same leaf that we started COWing before, then we end up in * deadlock on the extent buffer (trying to read lock it when we * previously write locked it). * * It's safe to read the inode item using the commit root because * block groups, once loaded, stay in memory forever (until they are * removed) as well as their space caches once loaded. New block groups * once created get their ->cached field set to BTRFS_CACHE_FINISHED so * we will never try to read their inode item while the fs is mounted. */ inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) { btrfs_free_path(path); return 0; } /* We may have converted the inode and made the cache invalid. */ spin_lock(&block_group->lock); if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { spin_unlock(&block_group->lock); btrfs_free_path(path); goto out; } spin_unlock(&block_group->lock); /* * Reinitialize the class of struct inode's mapping->invalidate_lock for * free space inodes to prevent false positives related to locks for normal * inodes. */ lockdep_set_class(&(&inode->i_data)->invalidate_lock, &btrfs_free_space_inode_key); ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, path, block_group->start); btrfs_free_path(path); if (ret <= 0) goto out; matched = (tmp_ctl.free_space == (block_group->length - used - block_group->bytes_super)); if (matched) { spin_lock(&tmp_ctl.tree_lock); ret = copy_free_space_cache(block_group, &tmp_ctl); spin_unlock(&tmp_ctl.tree_lock); /* * ret == 1 means we successfully loaded the free space cache, * so we need to re-set it here. */ if (ret == 0) ret = 1; } else { /* * We need to call the _locked variant so we don't try to update * the discard counters. */ spin_lock(&tmp_ctl.tree_lock); __btrfs_remove_free_space_cache(&tmp_ctl); spin_unlock(&tmp_ctl.tree_lock); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->start); ret = -1; } out: if (ret < 0) { /* This cache is bogus, make sure it gets cleared */ spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); ret = 0; btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now", block_group->start); } spin_lock(&ctl->tree_lock); btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); iput(inode); return ret; } static noinline_for_stack int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, int *entries, int *bitmaps, struct list_head *bitmap_list) { int ret; struct btrfs_free_cluster *cluster = NULL; struct btrfs_free_cluster *cluster_locked = NULL; struct rb_node *node = rb_first(&ctl->free_space_offset); struct btrfs_trim_range *trim_entry; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { cluster = list_entry(block_group->cluster_list.next, struct btrfs_free_cluster, block_group_list); } if (!node && cluster) { cluster_locked = cluster; spin_lock(&cluster_locked->lock); node = rb_first(&cluster->root); cluster = NULL; } /* Write out the extent entries */ while (node) { struct btrfs_free_space *e; e = rb_entry(node, struct btrfs_free_space, offset_index); *entries += 1; ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, e->bitmap); if (ret) goto fail; if (e->bitmap) { list_add_tail(&e->list, bitmap_list); *bitmaps += 1; } node = rb_next(node); if (!node && cluster) { node = rb_first(&cluster->root); cluster_locked = cluster; spin_lock(&cluster_locked->lock); cluster = NULL; } } if (cluster_locked) { spin_unlock(&cluster_locked->lock); cluster_locked = NULL; } /* * Make sure we don't miss any range that was removed from our rbtree * because trimming is running. Otherwise after a umount+mount (or crash * after committing the transaction) we would leak free space and get * an inconsistent free space cache report from fsck. */ list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { ret = io_ctl_add_entry(io_ctl, trim_entry->start, trim_entry->bytes, NULL); if (ret) goto fail; *entries += 1; } return 0; fail: if (cluster_locked) spin_unlock(&cluster_locked->lock); return -ENOSPC; } static noinline_for_stack int update_cache_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 offset, int entries, int bitmaps) { struct btrfs_key key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; if (ret > 0) { struct btrfs_key found_key; ASSERT(path->slots[0]); path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); btrfs_release_path(path); goto fail; } } BTRFS_I(inode)->generation = trans->transid; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; fail: return -1; } static noinline_for_stack int write_pinned_extent_entries( struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, int *entries) { u64 start, extent_start, extent_end, len; struct extent_io_tree *unpin = NULL; int ret; if (!block_group) return 0; /* * We want to add any pinned extents to our free space cache * so we don't leak the space * * We shouldn't have switched the pinned extents yet so this is the * right one */ unpin = &trans->transaction->pinned_extents; start = block_group->start; while (start < block_group->start + block_group->length) { if (!find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ if (extent_start >= block_group->start + block_group->length) return 0; extent_start = max(extent_start, start); extent_end = min(block_group->start + block_group->length, extent_end + 1); len = extent_end - extent_start; *entries += 1; ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); if (ret) return -ENOSPC; start = extent_end; } return 0; } static noinline_for_stack int write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { struct btrfs_free_space *entry, *next; int ret; /* Write out the bitmaps */ list_for_each_entry_safe(entry, next, bitmap_list, list) { ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) return -ENOSPC; list_del_init(&entry->list); } return 0; } static int flush_dirty_cache(struct inode *inode) { int ret; ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); return ret; } static void noinline_for_stack cleanup_bitmap_list(struct list_head *bitmap_list) { struct btrfs_free_space *entry, *next; list_for_each_entry_safe(entry, next, bitmap_list, list) list_del_init(&entry->list); } static void noinline_for_stack cleanup_write_cache_enospc(struct inode *inode, struct btrfs_io_ctl *io_ctl, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_path *path, u64 offset) { int ret; struct inode *inode = io_ctl->inode; if (!inode) return 0; /* Flush the dirty pages in the cache file. */ ret = flush_dirty_cache(inode); if (ret) goto out; /* Update the cache item to tell everyone this cache file is valid. */ ret = update_cache_item(trans, root, inode, path, offset, io_ctl->entries, io_ctl->bitmaps); out: if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; if (block_group) btrfs_debug(root->fs_info, "failed to write free space cache for block group %llu error %d", block_group->start, ret); } btrfs_update_inode(trans, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ spin_lock(&trans->transaction->dirty_bgs_lock); /* the disk_cache_state is protected by the block group lock */ spin_lock(&block_group->lock); /* * only mark this as written if we didn't get put back on * the dirty list while waiting for IO. Otherwise our * cache state won't be right, and we won't get written again */ if (!ret && list_empty(&block_group->dirty_list)) block_group->disk_cache_state = BTRFS_DC_WRITTEN; else if (ret) block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); spin_unlock(&trans->transaction->dirty_bgs_lock); io_ctl->inode = NULL; iput(inode); } return ret; } int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, block_group, &block_group->io_ctl, path, block_group->start); } /* * Write out cached info to an inode. * * @inode: freespace inode we are writing out * @ctl: free space cache we are going to write out * @block_group: block_group for this cache if it belongs to a block_group * @io_ctl: holds context for the io * @trans: the trans handle * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ static int __btrfs_write_out_cache(struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans) { struct extent_state *cached_state = NULL; LIST_HEAD(bitmap_list); int entries = 0; int bitmaps = 0; int ret; int must_iput = 0; if (!i_size_read(inode)) return -EIO; WARN_ON(io_ctl->pages); ret = io_ctl_init(io_ctl, inode, 1); if (ret) return ret; if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { down_write(&block_group->data_rwsem); spin_lock(&block_group->lock); if (block_group->delalloc_bytes) { block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); up_write(&block_group->data_rwsem); BTRFS_I(inode)->generation = 0; ret = 0; must_iput = 1; goto out; } spin_unlock(&block_group->lock); } /* Lock all pages first so we can lock the extent safely. */ ret = io_ctl_prepare_pages(io_ctl, false); if (ret) goto out_unlock; lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); io_ctl_set_generation(io_ctl, trans->transid); mutex_lock(&ctl->cache_writeout_mutex); /* Write out the extent entries in the free space cache */ spin_lock(&ctl->tree_lock); ret = write_cache_extent_entries(io_ctl, ctl, block_group, &entries, &bitmaps, &bitmap_list); if (ret) goto out_nospc_locked; /* * Some spaces that are freed in the current transaction are pinned, * they will be added into free space cache after the transaction is * committed, we shouldn't lose them. * * If this changes while we are working we'll get added back to * the dirty list and redo it. No locking needed */ ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); if (ret) goto out_nospc_locked; /* * At last, we write out all the bitmaps and keep cache_writeout_mutex * locked while doing it because a concurrent trim can be manipulating * or freeing the bitmap. */ ret = write_bitmap_entries(io_ctl, &bitmap_list); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); if (ret) goto out_nospc; /* Zero out the rest of the pages just to make sure */ io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, io_ctl->num_pages, 0, i_size_read(inode), &cached_state, false); if (ret) goto out_nospc; if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); /* * Release the pages and unlock the extent, we will flush * them out later */ io_ctl_drop_pages(io_ctl); io_ctl_free(io_ctl); unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); /* * at this point the pages are under IO and we're happy, * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; io_ctl->bitmaps = bitmaps; ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); if (ret) goto out; return 0; out_nospc_locked: cleanup_bitmap_list(&bitmap_list); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); out_nospc: cleanup_write_cache_enospc(inode, io_ctl, &cached_state); out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); out: io_ctl->inode = NULL; io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; } int btrfs_write_out_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; spin_lock(&block_group->lock); if (block_group->disk_cache_state < BTRFS_DC_SETUP) { spin_unlock(&block_group->lock); return 0; } spin_unlock(&block_group->lock); inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) return 0; ret = __btrfs_write_out_cache(inode, ctl, block_group, &block_group->io_ctl, trans); if (ret) { btrfs_debug(fs_info, "failed to write free space cache for block group %llu error %d", block_group->start, ret); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); block_group->io_ctl.inode = NULL; iput(inode); } /* * if ret == 0 the caller is expected to call btrfs_wait_cache_io * to wait for IO and put the inode */ return ret; } static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, u64 offset) { ASSERT(offset >= bitmap_start); offset -= bitmap_start; return (unsigned long)(div_u64(offset, unit)); } static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) { return (unsigned long)(div_u64(bytes, unit)); } static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; u64 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; return bitmap_start; } static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, struct btrfs_free_cluster *cluster, struct btrfs_free_space *new_entry) { struct rb_root *root; struct rb_node **p; struct rb_node *parent = NULL; lockdep_assert_held(&ctl->tree_lock); if (cluster) { lockdep_assert_held(&cluster->lock); root = &cluster->root; } else { root = &ctl->free_space_offset; } p = &root->rb_node; while (*p) { struct btrfs_free_space *info; parent = *p; info = rb_entry(parent, struct btrfs_free_space, offset_index); if (new_entry->offset < info->offset) { p = &(*p)->rb_left; } else if (new_entry->offset > info->offset) { p = &(*p)->rb_right; } else { /* * we could have a bitmap entry and an extent entry * share the same offset. If this is the case, we want * the extent entry to always be found first if we do a * linear search through the tree, since we want to have * the quickest allocation time, and allocating from an * extent is faster than allocating from a bitmap. So * if we're inserting a bitmap and we find an entry at * this offset, we want to go right, or after this entry * logically. If we are inserting an extent and we've * found a bitmap, we want to go left, or before * logically. */ if (new_entry->bitmap) { if (info->bitmap) { WARN_ON_ONCE(1); return -EEXIST; } p = &(*p)->rb_right; } else { if (!info->bitmap) { WARN_ON_ONCE(1); return -EEXIST; } p = &(*p)->rb_left; } } } rb_link_node(&new_entry->offset_index, parent, p); rb_insert_color(&new_entry->offset_index, root); return 0; } /* * This is a little subtle. We *only* have ->max_extent_size set if we actually * searched through the bitmap and figured out the largest ->max_extent_size, * otherwise it's 0. In the case that it's 0 we don't want to tell the * allocator the wrong thing, we want to use the actual real max_extent_size * we've found already if it's larger, or we want to use ->bytes. * * This matters because find_free_space() will skip entries who's ->bytes is * less than the required bytes. So if we didn't search down this bitmap, we * may pick some previous entry that has a smaller ->max_extent_size than we * have. For example, assume we have two entries, one that has * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will * call into find_free_space(), and return with max_extent_size == 4K, because * that first bitmap entry had ->max_extent_size set, but the second one did * not. If instead we returned 8K we'd come in searching for 8K, and find the * 8K contiguous range. * * Consider the other case, we have 2 8K chunks in that second entry and still * don't have ->max_extent_size set. We'll return 16K, and the next time the * allocator comes in it'll fully search our second bitmap, and this time it'll * get an uptodate value of 8K as the maximum chunk size. Then we'll get the * right allocation the next loop through. */ static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) { if (entry->bitmap && entry->max_extent_size) return entry->max_extent_size; return entry->bytes; } /* * We want the largest entry to be leftmost, so this is inverted from what you'd * normally expect. */ static bool entry_less(struct rb_node *node, const struct rb_node *parent) { const struct btrfs_free_space *entry, *exist; entry = rb_entry(node, struct btrfs_free_space, bytes_index); exist = rb_entry(parent, struct btrfs_free_space, bytes_index); return get_max_extent_size(exist) < get_max_extent_size(entry); } /* * searches the tree for the given offset. * * fuzzy - If this is set, then we are trying to make an allocation, and we just * want a section that has at least bytes size and comes at or after the given * offset. */ static struct btrfs_free_space * tree_search_offset(struct btrfs_free_space_ctl *ctl, u64 offset, int bitmap_only, int fuzzy) { struct rb_node *n = ctl->free_space_offset.rb_node; struct btrfs_free_space *entry = NULL, *prev = NULL; lockdep_assert_held(&ctl->tree_lock); /* find entry that is closest to the 'offset' */ while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); prev = entry; if (offset < entry->offset) n = n->rb_left; else if (offset > entry->offset) n = n->rb_right; else break; entry = NULL; } if (bitmap_only) { if (!entry) return NULL; if (entry->bitmap) return entry; /* * bitmap entry and extent entry may share same offset, * in that case, bitmap entry comes after extent entry. */ n = rb_next(n); if (!n) return NULL; entry = rb_entry(n, struct btrfs_free_space, offset_index); if (entry->offset != offset) return NULL; WARN_ON(!entry->bitmap); return entry; } else if (entry) { if (entry->bitmap) { /* * if previous extent entry covers the offset, * we should return it instead of the bitmap entry */ n = rb_prev(&entry->offset_index); if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); if (!prev->bitmap && prev->offset + prev->bytes > offset) entry = prev; } } return entry; } if (!prev) return NULL; /* find last entry before the 'offset' */ entry = prev; if (entry->offset > offset) { n = rb_prev(&entry->offset_index); if (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); ASSERT(entry->offset <= offset); } else { if (fuzzy) return entry; else return NULL; } } if (entry->bitmap) { n = rb_prev(&entry->offset_index); if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); if (!prev->bitmap && prev->offset + prev->bytes > offset) return prev; } if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) return entry; } else if (entry->offset + entry->bytes > offset) return entry; if (!fuzzy) return NULL; while (1) { n = rb_next(&entry->offset_index); if (!n) return NULL; entry = rb_entry(n, struct btrfs_free_space, offset_index); if (entry->bitmap) { if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) break; } else { if (entry->offset + entry->bytes > offset) break; } } return entry; } static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { lockdep_assert_held(&ctl->tree_lock); rb_erase(&info->offset_index, &ctl->free_space_offset); rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; } if (update_stat) ctl->free_space -= info->bytes; } static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { int ret = 0; lockdep_assert_held(&ctl->tree_lock); ASSERT(info->bytes || info->bitmap); ret = tree_insert_offset(ctl, NULL, info); if (ret) return ret; rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; } ctl->free_space += info->bytes; ctl->free_extents++; return ret; } static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { ASSERT(info->bitmap); /* * If our entry is empty it's because we're on a cluster and we don't * want to re-link it into our ctl bytes index. */ if (RB_EMPTY_NODE(&info->bytes_index)) return; lockdep_assert_held(&ctl->tree_lock); rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); } static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stat) { unsigned long start, count, end; int extent_delta = -1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); end = start + count; ASSERT(end <= BITS_PER_BITMAP); bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; relink_bitmap_entry(ctl, info); if (start && test_bit(start - 1, info->bitmap)) extent_delta++; if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) extent_delta++; info->bitmap_extents += extent_delta; if (!btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; } if (update_stat) ctl->free_space -= bytes; } static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) { unsigned long start, count, end; int extent_delta = 1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); end = start + count; ASSERT(end <= BITS_PER_BITMAP); bitmap_set(info->bitmap, start, count); /* * We set some bytes, we have no idea what the max extent size is * anymore. */ info->max_extent_size = 0; info->bytes += bytes; ctl->free_space += bytes; relink_bitmap_entry(ctl, info); if (start && test_bit(start - 1, info->bitmap)) extent_delta--; if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) extent_delta--; info->bitmap_extents += extent_delta; if (!btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; } } /* * If we can not find suitable extent, we will use bytes to record * the size of the max extent. */ static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes, bool for_alloc) { unsigned long found_bits = 0; unsigned long max_bits = 0; unsigned long bits, i; unsigned long next_zero; unsigned long extent_bits; /* * Skip searching the bitmap if we don't have a contiguous section that * is large enough for this allocation. */ if (for_alloc && bitmap_info->max_extent_size && bitmap_info->max_extent_size < *bytes) { *bytes = bitmap_info->max_extent_size; return -1; } i = offset_to_bit(bitmap_info->offset, ctl->unit, max_t(u64, *offset, bitmap_info->offset)); bits = bytes_to_bits(*bytes, ctl->unit); for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { if (for_alloc && bits == 1) { found_bits = 1; break; } next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); extent_bits = next_zero - i; if (extent_bits >= bits) { found_bits = extent_bits; break; } else if (extent_bits > max_bits) { max_bits = extent_bits; } i = next_zero; } if (found_bits) { *offset = (u64)(i * ctl->unit) + bitmap_info->offset; *bytes = (u64)(found_bits) * ctl->unit; return 0; } *bytes = (u64)(max_bits) * ctl->unit; bitmap_info->max_extent_size = *bytes; relink_bitmap_entry(ctl, bitmap_info); return -1; } /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, unsigned long align, u64 *max_extent_size, bool use_bytes_index) { struct btrfs_free_space *entry; struct rb_node *node; u64 tmp; u64 align_off; int ret; if (!ctl->free_space_offset.rb_node) goto out; again: if (use_bytes_index) { node = rb_first_cached(&ctl->free_space_bytes); } else { entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); if (!entry) goto out; node = &entry->offset_index; } for (; node; node = rb_next(node)) { if (use_bytes_index) entry = rb_entry(node, struct btrfs_free_space, bytes_index); else entry = rb_entry(node, struct btrfs_free_space, offset_index); /* * If we are using the bytes index then all subsequent entries * in this tree are going to be < bytes, so simply set the max * extent size and exit the loop. * * If we're using the offset index then we need to keep going * through the rest of the tree. */ if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); if (use_bytes_index) break; continue; } /* make sure the space returned is big enough * to match our requested alignment */ if (*bytes >= align) { tmp = entry->offset - ctl->start + align - 1; tmp = div64_u64(tmp, align); tmp = tmp * align + ctl->start; align_off = tmp - entry->offset; } else { align_off = 0; tmp = entry->offset; } /* * We don't break here if we're using the bytes index because we * may have another entry that has the correct alignment that is * the right size, so we don't want to miss that possibility. * At worst this adds another loop through the logic, but if we * broke here we could prematurely ENOSPC. */ if (entry->bytes < *bytes + align_off) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); continue; } if (entry->bitmap) { struct rb_node *old_next = rb_next(node); u64 size = *bytes; ret = search_bitmap(ctl, entry, &tmp, &size, true); if (!ret) { *offset = tmp; *bytes = size; return entry; } else { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); } /* * The bitmap may have gotten re-arranged in the space * index here because the max_extent_size may have been * updated. Start from the beginning again if this * happened. */ if (use_bytes_index && old_next != rb_next(node)) goto again; continue; } *offset = tmp; *bytes = entry->bytes - align_off; return entry; } out: return NULL; } static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset) { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; info->bitmap_extents = 0; INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; recalculate_thresholds(ctl); } static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info) { /* * Normally when this is called, the bitmap is completely empty. However, * if we are blowing up the free space cache for one reason or another * via __btrfs_remove_free_space_cache(), then it may not be freed and * we may leave stats on the table. */ if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { ctl->discardable_extents[BTRFS_STAT_CURR] -= bitmap_info->bitmap_extents; ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; } unlink_free_space(ctl, bitmap_info, true); kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; recalculate_thresholds(ctl); } static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes) { u64 end; u64 search_start, search_bytes; int ret; again: end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; /* * We need to search for bits in this bitmap. We could only cover some * of the extent in this bitmap thanks to how we add space, so we need * to search for as much as it as we can and clear that amount, and then * go searching for the next bit. */ search_start = *offset; search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, false); if (ret < 0 || search_start != *offset) return -EINVAL; /* We may have found more bits than what we need */ search_bytes = min(search_bytes, *bytes); /* Cannot clear past the end of the bitmap */ search_bytes = min(search_bytes, end - search_start + 1); bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); *offset += search_bytes; *bytes -= search_bytes; if (*bytes) { struct rb_node *next = rb_next(&bitmap_info->offset_index); if (!bitmap_info->bytes) free_bitmap(ctl, bitmap_info); /* * no entry after this bitmap, but we still have bytes to * remove, so something has gone wrong. */ if (!next) return -EINVAL; bitmap_info = rb_entry(next, struct btrfs_free_space, offset_index); /* * if the next entry isn't a bitmap we need to return to let the * extent stuff do its work. */ if (!bitmap_info->bitmap) return -EAGAIN; /* * Ok the next item is a bitmap, but it may not actually hold * the information for the rest of this free space stuff, so * look for it, and if we don't find it return so we can try * everything over again. */ search_start = *offset; search_bytes = ctl->unit; ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, false); if (ret < 0 || search_start != *offset) return -EAGAIN; goto again; } else if (!bitmap_info->bytes) free_bitmap(ctl, bitmap_info); return 0; } static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { u64 bytes_to_set = 0; u64 end; /* * This is a tradeoff to make bitmap trim state minimal. We mark the * whole bitmap untrimmed if at any point we add untrimmed regions. */ if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { if (btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR] += info->bitmap_extents; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; } info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; } end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); bytes_to_set = min(end - offset, bytes); btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); return bytes_to_set; } static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_block_group *block_group = ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; #ifdef CONFIG_BTRFS_DEBUG if (btrfs_should_fragment_free_space(block_group)) forced = true; #endif /* This is a way to reclaim large regions from the bitmaps. */ if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) return false; /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ if (!forced && ctl->free_extents < ctl->extents_thresh) { /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want * to reserve them to larger extents, however if we have plenty * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ if (info->bytes <= fs_info->sectorsize * 8) { if (ctl->free_extents * 3 <= ctl->extents_thresh) return false; } else { return false; } } /* * The original block groups from mkfs can be really small, like 8 * megabytes, so don't bother with a bitmap for those entries. However * some block groups can be smaller than what a bitmap would cover but * are still large enough that they could overflow the 32k memory limit, * so allow those block groups to still be allowed to have a bitmap * entry. */ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) return false; return true; } static const struct btrfs_free_space_op free_space_op = { .use_bitmap = use_bitmap, }; static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_free_space *bitmap_info; struct btrfs_block_group *block_group = NULL; int added = 0; u64 bytes, offset, bytes_added; enum btrfs_trim_state trim_state; int ret; bytes = info->bytes; offset = info->offset; trim_state = info->trim_state; if (!ctl->op->use_bitmap(ctl, info)) return 0; if (ctl->op == &free_space_op) block_group = ctl->block_group; again: /* * Since we link bitmaps right into the cluster we need to see if we * have a cluster here, and if so and it has our bitmap we need to add * the free space to that bitmap. */ if (block_group && !list_empty(&block_group->cluster_list)) { struct btrfs_free_cluster *cluster; struct rb_node *node; struct btrfs_free_space *entry; cluster = list_entry(block_group->cluster_list.next, struct btrfs_free_cluster, block_group_list); spin_lock(&cluster->lock); node = rb_first(&cluster->root); if (!node) { spin_unlock(&cluster->lock); goto no_cluster_bitmap; } entry = rb_entry(node, struct btrfs_free_space, offset_index); if (!entry->bitmap) { spin_unlock(&cluster->lock); goto no_cluster_bitmap; } if (entry->offset == offset_to_bitmap(ctl, offset)) { bytes_added = add_bytes_to_bitmap(ctl, entry, offset, bytes, trim_state); bytes -= bytes_added; offset += bytes_added; } spin_unlock(&cluster->lock); if (!bytes) { ret = 1; goto out; } } no_cluster_bitmap: bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { ASSERT(added == 0); goto new_bitmap; } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, trim_state); bytes -= bytes_added; offset += bytes_added; added = 0; if (!bytes) { ret = 1; goto out; } else goto again; new_bitmap: if (info && info->bitmap) { add_new_bitmap(ctl, info, offset); added = 1; info = NULL; goto again; } else { spin_unlock(&ctl->tree_lock); /* no pre-allocated info, allocate a new one */ if (!info) { info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) { spin_lock(&ctl->tree_lock); ret = -ENOMEM; goto out; } } /* allocate the bitmap */ info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); info->trim_state = BTRFS_TRIM_STATE_TRIMMED; spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; goto out; } goto again; } out: if (info) { if (info->bitmap) kmem_cache_free(btrfs_free_space_bitmap_cachep, info->bitmap); kmem_cache_free(btrfs_free_space_cachep, info); } return ret; } /* * Free space merging rules: * 1) Merge trimmed areas together * 2) Let untrimmed areas coalesce with trimmed areas * 3) Always pull neighboring regions from bitmaps * * The above rules are for when we merge free space based on btrfs_trim_state. * Rules 2 and 3 are subtle because they are suboptimal, but are done for the * same reason: to promote larger extent regions which makes life easier for * find_free_extent(). Rule 2 enables coalescing based on the common path * being returning free space from btrfs_finish_extent_commit(). So when free * space is trimmed, it will prevent aggregating trimmed new region and * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents * and provide find_free_extent() with the largest extents possible hoping for * the reuse path. */ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *right_info; bool merged = false; u64 offset = info->offset; u64 bytes = info->bytes; const bool is_trimmed = btrfs_free_space_trimmed(info); struct rb_node *right_prev = NULL; /* * first we want to see if there is free space adjacent to the range we * are adding, if there is remove that struct and add a new one to * cover the entire range */ right_info = tree_search_offset(ctl, offset + bytes, 0, 0); if (right_info) right_prev = rb_prev(&right_info->offset_index); if (right_prev) left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); /* See try_merge_free_space() comment. */ if (right_info && !right_info->bitmap && (!is_trimmed || btrfs_free_space_trimmed(right_info))) { unlink_free_space(ctl, right_info, update_stat); info->bytes += right_info->bytes; kmem_cache_free(btrfs_free_space_cachep, right_info); merged = true; } /* See try_merge_free_space() comment. */ if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset && (!is_trimmed || btrfs_free_space_trimmed(left_info))) { unlink_free_space(ctl, left_info, update_stat); info->offset = left_info->offset; info->bytes += left_info->bytes; kmem_cache_free(btrfs_free_space_cachep, left_info); merged = true; } return merged; } static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { struct btrfs_free_space *bitmap; unsigned long i; unsigned long j; const u64 end = info->offset + info->bytes; const u64 bitmap_offset = offset_to_bitmap(ctl, end); u64 bytes; bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); if (!bitmap) return false; i = offset_to_bit(bitmap->offset, ctl->unit, end); j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); if (j == i) return false; bytes = (j - i) * ctl->unit; info->bytes += bytes; /* See try_merge_free_space() comment. */ if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); return true; } static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { struct btrfs_free_space *bitmap; u64 bitmap_offset; unsigned long i; unsigned long j; unsigned long prev_j; u64 bytes; bitmap_offset = offset_to_bitmap(ctl, info->offset); /* If we're on a boundary, try the previous logical bitmap. */ if (bitmap_offset == info->offset) { if (info->offset == 0) return false; bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); } bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); if (!bitmap) return false; i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; j = 0; prev_j = (unsigned long)-1; for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { if (j > i) break; prev_j = j; } if (prev_j == i) return false; if (prev_j == (unsigned long)-1) bytes = (i + 1) * ctl->unit; else bytes = (i - prev_j) * ctl->unit; info->offset -= bytes; info->bytes += bytes; /* See try_merge_free_space() comment. */ if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); return true; } /* * We prefer always to allocate from extent entries, both for clustered and * non-clustered allocation requests. So when attempting to add a new extent * entry, try to see if there's adjacent free space in bitmap entries, and if * there is, migrate that space from the bitmaps to the extent. * Like this we get better chances of satisfying space allocation requests * because we attempt to satisfy them based on a single cache entry, and never * on 2 or more entries - even if the entries represent a contiguous free space * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry * ends). */ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { /* * Only work with disconnected entries, as we can change their offset, * and must be extent entries. */ ASSERT(!info->bitmap); ASSERT(RB_EMPTY_NODE(&info->offset_index)); if (ctl->total_bitmaps > 0) { bool stole_end; bool stole_front = false; stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); if (ctl->total_bitmaps > 0) stole_front = steal_from_bitmap_to_front(ctl, info, update_stat); if (stole_end || stole_front) try_merge_free_space(ctl, info, update_stat); } } static int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; int ret = 0; u64 filter_bytes = bytes; ASSERT(!btrfs_is_zoned(fs_info)); info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; info->offset = offset; info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); if (try_merge_free_space(ctl, info, true)) goto link; /* * There was no extent directly to the left or right of this new * extent then we know we're going to have to allocate a new extent, so * before we do that see if we need to drop this into a bitmap */ ret = insert_into_bitmap(ctl, info); if (ret < 0) { goto out; } else if (ret) { ret = 0; goto out; } link: /* * Only steal free space from adjacent bitmaps if we're sure we're not * going to add the new free space to existing bitmap entries - because * that would mean unnecessary work that would be reverted. Therefore * attempt to steal space from bitmaps if we're adding an extent entry. */ steal_from_bitmap(ctl, info, true); filter_bytes = max(filter_bytes, info->bytes); ret = link_free_space(ctl, info); if (ret) kmem_cache_free(btrfs_free_space_cachep, info); out: btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (ret) { btrfs_crit(fs_info, "unable to add free space :%d", ret); ASSERT(ret != -EEXIST); } if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { btrfs_discard_check_filter(block_group, filter_bytes); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); } return ret; } static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 bytenr, u64 size, bool used) { struct btrfs_space_info *sinfo = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; int bg_reclaim_threshold = 0; bool initial = (size == block_group->length); u64 reclaimable_unusable; WARN_ON(!initial && offset + size > block_group->zone_capacity); if (!initial) bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); spin_lock(&ctl->tree_lock); if (!used) to_free = size; else if (initial) to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) to_free = size; else if (offset + size <= block_group->alloc_offset) to_free = 0; else to_free = offset + size - block_group->alloc_offset; to_unusable = size - to_free; ctl->free_space += to_free; /* * If the block group is read-only, we should account freed space into * bytes_readonly. */ if (!block_group->ro) block_group->zone_unusable += to_unusable; spin_unlock(&ctl->tree_lock); if (!used) { spin_lock(&block_group->lock); block_group->alloc_offset -= size; spin_unlock(&block_group->lock); } reclaimable_unusable = block_group->zone_unusable - (block_group->length - block_group->zone_capacity); /* All the region is now unusable. Mark it as unused and reclaim */ if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && reclaimable_unusable >= mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } return 0; } int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size, true); if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size, false); return btrfs_add_free_space(block_group, bytenr, size); } /* * This is a subtle distinction because when adding free space back in general, * we want it to be added as untrimmed for async. But in the case where we add * it on loading of a block group, we want to consider it trimmed. */ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size, true); if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; int ret; bool re_search = false; if (btrfs_is_zoned(block_group->fs_info)) { /* * This can happen with conventional zones when replaying log. * Since the allocation info of tree-log nodes are not recorded * to the extent-tree, calculate_alloc_pointer() failed to * advance the allocation pointer after last allocated tree log * node blocks. * * This function is called from * btrfs_pin_extent_for_log_replay() when replaying the log. * Advance the pointer not to overwrite the tree-log nodes. */ if (block_group->start + block_group->alloc_offset < offset + bytes) { block_group->alloc_offset = offset + bytes - block_group->start; } return 0; } spin_lock(&ctl->tree_lock); again: ret = 0; if (!bytes) goto out_lock; info = tree_search_offset(ctl, offset, 0, 0); if (!info) { /* * oops didn't find an extent that matched the space we wanted * to remove, look for a bitmap instead */ info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { /* * If we found a partial bit of our free space in a * bitmap but then couldn't find the other part this may * be a problem, so WARN about it. */ WARN_ON(re_search); goto out_lock; } } re_search = false; if (!info->bitmap) { unlink_free_space(ctl, info, true); if (offset == info->offset) { u64 to_free = min(bytes, info->bytes); info->bytes -= to_free; info->offset += to_free; if (info->bytes) { ret = link_free_space(ctl, info); WARN_ON(ret); } else { kmem_cache_free(btrfs_free_space_cachep, info); } offset += to_free; bytes -= to_free; goto again; } else { u64 old_end = info->bytes + info->offset; info->bytes = offset - info->offset; ret = link_free_space(ctl, info); WARN_ON(ret); if (ret) goto out_lock; /* Not enough bytes in this entry to satisfy us */ if (old_end < offset + bytes) { bytes -= old_end - offset; offset = old_end; goto again; } else if (old_end == offset + bytes) { /* all done */ goto out_lock; } spin_unlock(&ctl->tree_lock); ret = __btrfs_add_free_space(block_group, offset + bytes, old_end - (offset + bytes), info->trim_state); WARN_ON(ret); goto out; } } ret = remove_from_bitmap(ctl, info, &offset, &bytes); if (ret == -EAGAIN) { re_search = true; goto again; } out_lock: btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); out: return ret; } void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; struct rb_node *n; int count = 0; /* * Zoned btrfs does not use free space tree and cluster. Just print * out the free space after the allocation offset. */ if (btrfs_is_zoned(fs_info)) { btrfs_info(fs_info, "free space %llu active %d", block_group->zone_capacity - block_group->alloc_offset, test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)); return; } spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) count++; btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); btrfs_info(fs_info, "%d free space entries at or bigger than %llu bytes", count, bytes); } void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl) { struct btrfs_fs_info *fs_info = block_group->fs_info; spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; ctl->start = block_group->start; ctl->block_group = block_group; ctl->op = &free_space_op; ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); /* * we only want to have 32k of ram per block group for keeping * track of free space, and if we pass 1/2 of that we want to * start converting things over to using bitmaps */ ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); } /* * for a given cluster, put all of its extents back into the free * space cache. If the block group passed doesn't match the block group * pointed to by the cluster, someone else raced in and freed the * cluster already. In that case, we just return without changing anything */ static void __btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct rb_node *node; lockdep_assert_held(&ctl->tree_lock); spin_lock(&cluster->lock); if (cluster->block_group != block_group) { spin_unlock(&cluster->lock); return; } cluster->block_group = NULL; cluster->window_start = 0; list_del_init(&cluster->block_group_list); node = rb_first(&cluster->root); while (node) { struct btrfs_free_space *entry; entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); RB_CLEAR_NODE(&entry->offset_index); if (!entry->bitmap) { /* Merging treats extents as if they were new */ if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; } try_merge_free_space(ctl, entry, false); steal_from_bitmap(ctl, entry, false); /* As we insert directly, update these statistics */ if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; } } tree_insert_offset(ctl, NULL, entry); rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, entry_less); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); btrfs_put_block_group(block_group); } void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_cluster *cluster; struct list_head *head; spin_lock(&ctl->tree_lock); while ((head = block_group->cluster_list.next) != &block_group->cluster_list) { cluster = list_entry(head, struct btrfs_free_cluster, block_group_list); WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache(ctl); btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); } /* * Walk @block_group's free space rb_tree to determine if everything is trimmed. */ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; struct rb_node *node; bool ret = true; spin_lock(&ctl->tree_lock); node = rb_first(&ctl->free_space_offset); while (node) { info = rb_entry(node, struct btrfs_free_space, offset_index); if (!btrfs_free_space_trimmed(info)) { ret = false; break; } node = rb_next(node); } spin_unlock(&ctl->tree_lock); return ret; } u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_discard_ctl *discard_ctl = &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; u64 bytes_search = bytes + empty_size; u64 ret = 0; u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, block_group->full_stripe_len, max_extent_size, use_bytes_index); if (!entry) goto out; ret = offset; if (entry->bitmap) { bitmap_clear_bits(ctl, entry, offset, bytes, true); if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); if (!entry->bytes) free_bitmap(ctl, entry); } else { unlink_free_space(ctl, entry, true); align_gap_len = offset - entry->offset; align_gap = entry->offset; align_gap_trim_state = entry->trim_state; if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); entry->offset = offset + bytes; WARN_ON(entry->bytes < bytes + align_gap_len); entry->bytes -= bytes + align_gap_len; if (!entry->bytes) kmem_cache_free(btrfs_free_space_cachep, entry); else link_free_space(ctl, entry); } out: btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (align_gap_len) __btrfs_add_free_space(block_group, align_gap, align_gap_len, align_gap_trim_state); return ret; } /* * given a cluster, put all of its extents back into the free space * cache. If a block group is passed, this function will only free * a cluster that belongs to the passed block group. * * Otherwise, it'll get a reference on the block group pointed to by the * cluster and remove the cluster from it. */ void btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl; /* first, get a safe pointer to the block group */ spin_lock(&cluster->lock); if (!block_group) { block_group = cluster->block_group; if (!block_group) { spin_unlock(&cluster->lock); return; } } else if (cluster->block_group != block_group) { /* someone else has already freed it don't redo their work */ spin_unlock(&cluster->lock); return; } btrfs_get_block_group(block_group); spin_unlock(&cluster->lock); ctl = block_group->free_space_ctl; /* now return any extents the cluster had on it */ spin_lock(&ctl->tree_lock); __btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&ctl->tree_lock); btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); /* finally drop our ref */ btrfs_put_block_group(block_group); } static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct btrfs_free_space *entry, u64 bytes, u64 min_start, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int err; u64 search_start = cluster->window_start; u64 search_bytes = bytes; u64 ret = 0; search_start = min_start; search_bytes = bytes; err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); return 0; } ret = search_start; bitmap_clear_bits(ctl, entry, ret, bytes, false); return ret; } /* * given a cluster, try to allocate 'bytes' from it, returns 0 * if it couldn't find anything suitably large, or a logical disk offset * if things worked out */ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 bytes, u64 min_start, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_discard_ctl *discard_ctl = &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; struct rb_node *node; u64 ret = 0; ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; if (cluster->block_group != block_group) goto out; node = rb_first(&cluster->root); if (!node) goto out; entry = rb_entry(node, struct btrfs_free_space, offset_index); while (1) { if (entry->bytes < bytes) *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); if (entry->bytes < bytes || (!entry->bitmap && entry->offset < min_start)) { node = rb_next(&entry->offset_index); if (!node) break; entry = rb_entry(node, struct btrfs_free_space, offset_index); continue; } if (entry->bitmap) { ret = btrfs_alloc_from_bitmap(block_group, cluster, entry, bytes, cluster->window_start, max_extent_size); if (ret == 0) { node = rb_next(&entry->offset_index); if (!node) break; entry = rb_entry(node, struct btrfs_free_space, offset_index); continue; } cluster->window_start += bytes; } else { ret = entry->offset; entry->offset += bytes; entry->bytes -= bytes; } break; } out: spin_unlock(&cluster->lock); if (!ret) return 0; spin_lock(&ctl->tree_lock); if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); ctl->free_space -= bytes; if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; spin_lock(&cluster->lock); if (entry->bytes == 0) { rb_erase(&entry->offset_index, &cluster->root); ctl->free_extents--; if (entry->bitmap) { kmem_cache_free(btrfs_free_space_bitmap_cachep, entry->bitmap); ctl->total_bitmaps--; recalculate_thresholds(ctl); } else if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; } kmem_cache_free(btrfs_free_space_cachep, entry); } spin_unlock(&cluster->lock); spin_unlock(&ctl->tree_lock); return ret; } static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; unsigned long want_bits; unsigned long min_bits; unsigned long found_bits; unsigned long max_bits = 0; unsigned long start = 0; unsigned long total_found = 0; int ret; lockdep_assert_held(&ctl->tree_lock); i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); want_bits = bytes_to_bits(bytes, ctl->unit); min_bits = bytes_to_bits(min_bytes, ctl->unit); /* * Don't bother looking for a cluster in this bitmap if it's heavily * fragmented. */ if (entry->max_extent_size && entry->max_extent_size < cont1_bytes) return -ENOSPC; again: found_bits = 0; for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); if (next_zero - i >= min_bits) { found_bits = next_zero - i; if (found_bits > max_bits) max_bits = found_bits; break; } if (next_zero - i > max_bits) max_bits = next_zero - i; i = next_zero; } if (!found_bits) { entry->max_extent_size = (u64)max_bits * ctl->unit; return -ENOSPC; } if (!total_found) { start = i; cluster->max_size = 0; } total_found += found_bits; if (cluster->max_size < found_bits * ctl->unit) cluster->max_size = found_bits * ctl->unit; if (total_found < want_bits || cluster->max_size < cont1_bytes) { i = next_zero + 1; goto again; } cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); /* * We need to know if we're currently on the normal space index when we * manipulate the bitmap so that we know we need to remove and re-insert * it into the space_index tree. Clear the bytes_index node here so the * bitmap manipulation helpers know not to mess with the space_index * until this bitmap entry is added back into the normal cache. */ RB_CLEAR_NODE(&entry->bytes_index); ret = tree_insert_offset(ctl, cluster, entry); ASSERT(!ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, total_found * ctl->unit, 1); return 0; } /* * This searches the block group for just extents to fill the cluster with. * Try to find a cluster with at least bytes total bytes, at least one * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_free; u64 max_extent; u64 total_size = 0; lockdep_assert_held(&ctl->tree_lock); entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) return -ENOSPC; /* * We don't want bitmaps, so just move along until we find a normal * extent entry. */ while (entry->bitmap || entry->bytes < min_bytes) { if (entry->bitmap && list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) return -ENOSPC; entry = rb_entry(node, struct btrfs_free_space, offset_index); } window_free = entry->bytes; max_extent = entry->bytes; first = entry; last = entry; for (node = rb_next(&entry->offset_index); node; node = rb_next(&entry->offset_index)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { if (list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); continue; } if (entry->bytes < min_bytes) continue; last = entry; window_free += entry->bytes; if (entry->bytes > max_extent) max_extent = entry->bytes; } if (window_free < bytes || max_extent < cont1_bytes) return -ENOSPC; cluster->window_start = first->offset; node = &first->offset_index; /* * now we've found our entries, pull them out of the free space * cache and put them into the cluster rbtree */ do { int ret; entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); if (entry->bitmap || entry->bytes < min_bytes) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(ctl, cluster, entry); total_size += entry->bytes; ASSERT(!ret); /* -EEXIST; Logic error */ } while (node && entry != last); cluster->max_size = max_extent; trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); return 0; } /* * This specifically looks for bitmaps that may work in the cluster, we assume * that we have already failed to find extents that will work. */ static noinline int setup_cluster_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry = NULL; int ret = -ENOSPC; u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; /* * The bitmap that covers offset won't be in the list unless offset * is just its start offset. */ if (!list_empty(bitmaps)) entry = list_first_entry(bitmaps, struct btrfs_free_space, list); if (!entry || entry->offset != bitmap_offset) { entry = tree_search_offset(ctl, bitmap_offset, 1, 0); if (entry && list_empty(&entry->list)) list_add(&entry->list, bitmaps); } list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, bytes, cont1_bytes, min_bytes); if (!ret) return 0; } /* * The bitmaps list has all the bitmaps that record free space * starting after offset, so no more search is required. */ return -ENOSPC; } /* * here we try to find a cluster of blocks in a block group. The goal * is to find at least bytes+empty_size. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise * it returns -enospc */ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; u64 cont1_bytes; int ret; /* * Choose the minimum extent size we'll require for this * cluster. For SSD_SPREAD, don't allow any fragmentation. * For metadata, allow allocates with smaller extents. For * data, keep it dense. */ if (btrfs_test_opt(fs_info, SSD_SPREAD)) { cont1_bytes = bytes + empty_size; min_bytes = cont1_bytes; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; min_bytes = fs_info->sectorsize; } else { cont1_bytes = max(bytes, (bytes + empty_size) >> 2); min_bytes = fs_info->sectorsize; } spin_lock(&ctl->tree_lock); /* * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ if (ctl->free_space < bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } spin_lock(&cluster->lock); /* someone already found a cluster, hooray */ if (cluster->block_group) { ret = 0; goto out; } trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, min_bytes); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) list_del_init(&entry->list); if (!ret) { btrfs_get_block_group(block_group); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; } else { trace_btrfs_failed_cluster_setup(block_group); } out: spin_unlock(&cluster->lock); spin_unlock(&ctl->tree_lock); return ret; } /* * simple code to zero out a cluster */ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) { spin_lock_init(&cluster->lock); spin_lock_init(&cluster->refill_lock); cluster->root = RB_ROOT; cluster->max_size = 0; cluster->fragmented = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } static int do_trimming(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 bytes, u64 reserved_start, u64 reserved_bytes, enum btrfs_trim_state reserved_trim_state, struct btrfs_trim_range *trim_entry) { struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; int update = 0; const u64 end = start + bytes; const u64 reserved_end = reserved_start + reserved_bytes; enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; u64 trimmed = 0; spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (!block_group->ro) { block_group->reserved += reserved_bytes; space_info->bytes_reserved += reserved_bytes; update = 1; } spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); if (!ret) { *total_trimmed += trimmed; trim_state = BTRFS_TRIM_STATE_TRIMMED; } mutex_lock(&ctl->cache_writeout_mutex); if (reserved_start < start) __btrfs_add_free_space(block_group, reserved_start, start - reserved_start, reserved_trim_state); if (end < reserved_end) __btrfs_add_free_space(block_group, end, reserved_end - end, reserved_trim_state); __btrfs_add_free_space(block_group, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); if (update) { spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (block_group->ro) space_info->bytes_readonly += reserved_bytes; block_group->reserved -= reserved_bytes; space_info->bytes_reserved -= reserved_bytes; spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); } return ret; } /* * If @async is set, then we will trim 1 region and return. */ static int trim_no_bitmap(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 end, u64 minlen, bool async) { struct btrfs_discard_ctl *discard_ctl = &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; struct rb_node *node; int ret = 0; u64 extent_start; u64 extent_bytes; enum btrfs_trim_state extent_trim_state; u64 bytes; const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (start < end) { struct btrfs_trim_range trim_entry; mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) goto out_unlock; entry = tree_search_offset(ctl, start, 0, 1); if (!entry) goto out_unlock; /* Skip bitmaps and if async, already trimmed entries */ while (entry->bitmap || (async && btrfs_free_space_trimmed(entry))) { node = rb_next(&entry->offset_index); if (!node) goto out_unlock; entry = rb_entry(node, struct btrfs_free_space, offset_index); } if (entry->offset >= end) goto out_unlock; extent_start = entry->offset; extent_bytes = entry->bytes; extent_trim_state = entry->trim_state; if (async) { start = entry->offset; bytes = entry->bytes; if (bytes < minlen) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto next; } unlink_free_space(ctl, entry, true); /* * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim * X when we come back around. So trim it now. */ if (max_discard_size && bytes >= (max_discard_size + BTRFS_ASYNC_DISCARD_MIN_FILTER)) { bytes = max_discard_size; extent_bytes = max_discard_size; entry->offset += max_discard_size; entry->bytes -= max_discard_size; link_free_space(ctl, entry); } else { kmem_cache_free(btrfs_free_space_cachep, entry); } } else { start = max(start, extent_start); bytes = min(extent_start + extent_bytes, end) - start; if (bytes < minlen) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto next; } unlink_free_space(ctl, entry, true); kmem_cache_free(btrfs_free_space_cachep, entry); } spin_unlock(&ctl->tree_lock); trim_entry.start = extent_start; trim_entry.bytes = extent_bytes; list_add_tail(&trim_entry.list, &ctl->trimming_ranges); mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, extent_start, extent_bytes, extent_trim_state, &trim_entry); if (ret) { block_group->discard_cursor = start + bytes; break; } next: start += bytes; block_group->discard_cursor = start; if (async && *total_trimmed) break; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; break; } cond_resched(); } return ret; out_unlock: block_group->discard_cursor = btrfs_block_group_end(block_group); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); return ret; } /* * If we break out of trimming a bitmap prematurely, we should reset the * trimming bit. In a rather contrieved case, it's possible to race here so * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. * * start = start of bitmap * end = near end of bitmap * * Thread 1: Thread 2: * trim_bitmaps(start) * trim_bitmaps(end) * end_trimming_bitmap() * reset_trimming_bitmap() */ static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { struct btrfs_free_space *entry; spin_lock(&ctl->tree_lock); entry = tree_search_offset(ctl, offset, 1, 0); if (entry) { if (btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR] += entry->bitmap_extents; ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; } entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; } spin_unlock(&ctl->tree_lock); } static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *entry) { if (btrfs_free_space_trimming_bitmap(entry)) { entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; ctl->discardable_extents[BTRFS_STAT_CURR] -= entry->bitmap_extents; ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; } } /* * If @async is set, then we will trim 1 region and return. */ static int trim_bitmaps(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 end, u64 minlen, u64 maxlen, bool async) { struct btrfs_discard_ctl *discard_ctl = &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; int ret = 0; int ret2; u64 bytes; u64 offset = offset_to_bitmap(ctl, start); const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (offset < end) { bool next_bitmap = false; struct btrfs_trim_range trim_entry; mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { block_group->discard_cursor = btrfs_block_group_end(block_group); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, offset, 1, 0); /* * Bitmaps are marked trimmed lossily now to prevent constant * discarding of the same bitmap (the reason why we are bound * by the filters). So, retrim the block group bitmaps when we * are preparing to punt to the unused_bgs list. This uses * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED * which is the only discard index which sets minlen to 0. */ if (!entry || (async && minlen && start == offset && btrfs_free_space_trimmed(entry))) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } /* * Async discard bitmap trimming begins at by setting the start * to be key.objectid and the offset_to_bitmap() aligns to the * start of the bitmap. This lets us know we are fully * scanning the bitmap rather than only some portion of it. */ if (start == offset) entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; bytes = minlen; ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { /* * We lossily consider a bitmap trimmed if we only skip * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. */ if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) end_trimming_bitmap(ctl, entry); else entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } /* * We already trimmed a region, but are using the locking above * to reset the trim_state. */ if (async && *total_trimmed) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto out; } bytes = min(bytes, end - start); if (bytes < minlen || (async && maxlen && bytes > maxlen)) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto next; } /* * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. * If X < @minlen, we won't trim X when we come back around. * So trim it now. We differ here from trimming extents as we * don't keep individual state per bit. */ if (async && max_discard_size && bytes > (max_discard_size + minlen)) bytes = max_discard_size; bitmap_clear_bits(ctl, entry, start, bytes, true); if (entry->bytes == 0) free_bitmap(ctl, entry); spin_unlock(&ctl->tree_lock); trim_entry.start = start; trim_entry.bytes = bytes; list_add_tail(&trim_entry.list, &ctl->trimming_ranges); mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, start, bytes, 0, &trim_entry); if (ret) { reset_trimming_bitmap(ctl, offset); block_group->discard_cursor = btrfs_block_group_end(block_group); break; } next: if (next_bitmap) { offset += BITS_PER_BITMAP * ctl->unit; start = offset; } else { start += bytes; } block_group->discard_cursor = start; if (fatal_signal_pending(current)) { if (start != offset) reset_trimming_bitmap(ctl, offset); ret = -ERESTARTSYS; break; } cond_resched(); } if (offset >= end) block_group->discard_cursor = end; out: return ret; } int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; u64 rem = 0; ASSERT(!btrfs_is_zoned(block_group->fs_info)); *trimmed = 0; spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); if (ret) goto out; ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); /* If we ended in the middle of a bitmap, reset the trimming flag */ if (rem) reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); out: btrfs_unfreeze_block_group(block_group); return ret; } int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, bool async) { int ret; *trimmed = 0; spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); btrfs_unfreeze_block_group(block_group); return ret; } int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, u64 maxlen, bool async) { int ret; *trimmed = 0; spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, async); btrfs_unfreeze_block_group(block_group); return ret; } bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) { return btrfs_super_cache_generation(fs_info->super_copy); } static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans) { struct btrfs_block_group *block_group; struct rb_node *node; int ret = 0; btrfs_info(fs_info, "cleaning free space cache v1"); node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = btrfs_remove_free_space_inode(trans, NULL, block_group); if (ret) goto out; node = rb_next(node); } out: return ret; } int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) { struct btrfs_trans_handle *trans; int ret; /* * update_super_roots will appropriately set or unset * super_copy->cache_generation based on SPACE_CACHE and * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a * transaction commit whether we are enabling space cache v1 and don't * have any other work to do, or are disabling it and removing free * space inodes. */ trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); if (!active) { set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); ret = cleanup_free_space_cache_v1(fs_info, trans); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; } } ret = btrfs_commit_transaction(trans); out: clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); return ret; } int __init btrfs_free_space_init(void) { btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); if (!btrfs_free_space_cachep) return -ENOMEM; btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", PAGE_SIZE, PAGE_SIZE, 0, NULL); if (!btrfs_free_space_bitmap_cachep) { kmem_cache_destroy(btrfs_free_space_cachep); return -ENOMEM; } return 0; } void __cold btrfs_free_space_exit(void) { kmem_cache_destroy(btrfs_free_space_cachep); kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it * doesn't do any of the merging that add_free_space does, this acts a lot like * how the free space cache loading stuff works, so you can get really weird * configurations. */ int test_add_free_space_entry(struct btrfs_block_group *cache, u64 offset, u64 bytes, bool bitmap) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; struct btrfs_free_space *info = NULL, *bitmap_info; void *map = NULL; enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; u64 bytes_added; int ret; again: if (!info) { info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; } if (!bitmap) { spin_lock(&ctl->tree_lock); info->offset = offset; info->bytes = bytes; info->max_extent_size = 0; ret = link_free_space(ctl, info); spin_unlock(&ctl->tree_lock); if (ret) kmem_cache_free(btrfs_free_space_cachep, info); return ret; } if (!map) { map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!map) { kmem_cache_free(btrfs_free_space_cachep, info); return -ENOMEM; } } spin_lock(&ctl->tree_lock); bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { info->bitmap = map; map = NULL; add_new_bitmap(ctl, info, offset); bitmap_info = info; info = NULL; } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, trim_state); bytes -= bytes_added; offset += bytes_added; spin_unlock(&ctl->tree_lock); if (bytes) goto again; if (info) kmem_cache_free(btrfs_free_space_cachep, info); if (map) kmem_cache_free(btrfs_free_space_bitmap_cachep, map); return 0; } /* * Checks to see if the given range is in the free space cache. This is really * just used to check the absence of space, so if there is free space in the * range at all we will return 1. */ int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; struct btrfs_free_space *info; int ret = 0; spin_lock(&ctl->tree_lock); info = tree_search_offset(ctl, offset, 0, 0); if (!info) { info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) goto out; } have_info: if (info->bitmap) { u64 bit_off, bit_bytes; struct rb_node *n; struct btrfs_free_space *tmp; bit_off = offset; bit_bytes = ctl->unit; ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); if (!ret) { if (bit_off == offset) { ret = 1; goto out; } else if (bit_off > offset && offset + bytes > bit_off) { ret = 1; goto out; } } n = rb_prev(&info->offset_index); while (n) { tmp = rb_entry(n, struct btrfs_free_space, offset_index); if (tmp->offset + tmp->bytes < offset) break; if (offset + bytes < tmp->offset) { n = rb_prev(&tmp->offset_index); continue; } info = tmp; goto have_info; } n = rb_next(&info->offset_index); while (n) { tmp = rb_entry(n, struct btrfs_free_space, offset_index); if (offset + bytes < tmp->offset) break; if (tmp->offset + tmp->bytes < offset) { n = rb_next(&tmp->offset_index); continue; } info = tmp; goto have_info; } ret = 0; goto out; } if (info->offset == offset) { ret = 1; goto out; } if (offset > info->offset && offset < info->offset + info->bytes) ret = 1; out: spin_unlock(&ctl->tree_lock); return ret; } #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ |
17 17 17 17 1289 111 2 1 8 1 8 8 20 19 2 2 419 1 420 420 22 2 1 19 20 15 2 1 12 13 56 1 56 56 1223 1 1220 9 1 1 7 8 7 8 8 8 23 5 18 19 17 17 1 9 17 17 16 17 17 17 17 17 16 17 17 17 1 16 17 17 8 9 9 9 9 9 8 8 9 9 9 9 9 9 9 9 9 9 9 9 9 17 2 2 2 2 15 1 1 1 1 6 11 10 11 11 11 11 11 11 11 8 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 8 8 17 4 4 4 14 14 14 17 17 9 17 9 8 17 17 17 17 17 17 17 17 17 17 17 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Video capture interface for Linux version 2 * * A generic video device interface for the LINUX operating system * using a set of device structures/vectors for low level operations. * * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk> (version 1) * Mauro Carvalho Chehab <mchehab@kernel.org> (version 2) * * Fixes: 20000516 Claudio Matsuoka <claudio@conectiva.com> * - Added procfs support */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/debugfs.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <media/v4l2-common.h> #include <media/v4l2-device.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-event.h> #define VIDEO_NUM_DEVICES 256 #define VIDEO_NAME "video4linux" #define dprintk(fmt, arg...) do { \ printk(KERN_DEBUG pr_fmt("%s: " fmt), \ __func__, ##arg); \ } while (0) /* * sysfs stuff */ static ssize_t index_show(struct device *cd, struct device_attribute *attr, char *buf) { struct video_device *vdev = to_video_device(cd); return sprintf(buf, "%i\n", vdev->index); } static DEVICE_ATTR_RO(index); static ssize_t dev_debug_show(struct device *cd, struct device_attribute *attr, char *buf) { struct video_device *vdev = to_video_device(cd); return sprintf(buf, "%i\n", vdev->dev_debug); } static ssize_t dev_debug_store(struct device *cd, struct device_attribute *attr, const char *buf, size_t len) { struct video_device *vdev = to_video_device(cd); int res = 0; u16 value; res = kstrtou16(buf, 0, &value); if (res) return res; vdev->dev_debug = value; return len; } static DEVICE_ATTR_RW(dev_debug); static ssize_t name_show(struct device *cd, struct device_attribute *attr, char *buf) { struct video_device *vdev = to_video_device(cd); return sprintf(buf, "%.*s\n", (int)sizeof(vdev->name), vdev->name); } static DEVICE_ATTR_RO(name); static struct attribute *video_device_attrs[] = { &dev_attr_name.attr, &dev_attr_dev_debug.attr, &dev_attr_index.attr, NULL, }; ATTRIBUTE_GROUPS(video_device); /* * Active devices */ static struct video_device *video_devices[VIDEO_NUM_DEVICES]; static DEFINE_MUTEX(videodev_lock); static DECLARE_BITMAP(devnode_nums[VFL_TYPE_MAX], VIDEO_NUM_DEVICES); /* Device node utility functions */ /* Note: these utility functions all assume that vfl_type is in the range [0, VFL_TYPE_MAX-1]. */ #ifdef CONFIG_VIDEO_FIXED_MINOR_RANGES /* Return the bitmap corresponding to vfl_type. */ static inline unsigned long *devnode_bits(enum vfl_devnode_type vfl_type) { /* Any types not assigned to fixed minor ranges must be mapped to one single bitmap for the purposes of finding a free node number since all those unassigned types use the same minor range. */ int idx = (vfl_type > VFL_TYPE_RADIO) ? VFL_TYPE_MAX - 1 : vfl_type; return devnode_nums[idx]; } #else /* Return the bitmap corresponding to vfl_type. */ static inline unsigned long *devnode_bits(enum vfl_devnode_type vfl_type) { return devnode_nums[vfl_type]; } #endif /* Mark device node number vdev->num as used */ static inline void devnode_set(struct video_device *vdev) { set_bit(vdev->num, devnode_bits(vdev->vfl_type)); } /* Mark device node number vdev->num as unused */ static inline void devnode_clear(struct video_device *vdev) { clear_bit(vdev->num, devnode_bits(vdev->vfl_type)); } /* Try to find a free device node number in the range [from, to> */ static inline int devnode_find(struct video_device *vdev, int from, int to) { return find_next_zero_bit(devnode_bits(vdev->vfl_type), to, from); } struct video_device *video_device_alloc(void) { return kzalloc(sizeof(struct video_device), GFP_KERNEL); } EXPORT_SYMBOL(video_device_alloc); void video_device_release(struct video_device *vdev) { kfree(vdev); } EXPORT_SYMBOL(video_device_release); void video_device_release_empty(struct video_device *vdev) { /* Do nothing */ /* Only valid when the video_device struct is a static. */ } EXPORT_SYMBOL(video_device_release_empty); static inline void video_get(struct video_device *vdev) { get_device(&vdev->dev); } static inline void video_put(struct video_device *vdev) { put_device(&vdev->dev); } /* Called when the last user of the video device exits. */ static void v4l2_device_release(struct device *cd) { struct video_device *vdev = to_video_device(cd); struct v4l2_device *v4l2_dev = vdev->v4l2_dev; mutex_lock(&videodev_lock); if (WARN_ON(video_devices[vdev->minor] != vdev)) { /* should not happen */ mutex_unlock(&videodev_lock); return; } /* Free up this device for reuse */ video_devices[vdev->minor] = NULL; /* Delete the cdev on this minor as well */ cdev_del(vdev->cdev); /* Just in case some driver tries to access this from the release() callback. */ vdev->cdev = NULL; /* Mark device node number as free */ devnode_clear(vdev); mutex_unlock(&videodev_lock); #if defined(CONFIG_MEDIA_CONTROLLER) if (v4l2_dev->mdev && vdev->vfl_dir != VFL_DIR_M2M) { /* Remove interfaces and interface links */ media_devnode_remove(vdev->intf_devnode); if (vdev->entity.function != MEDIA_ENT_F_UNKNOWN) media_device_unregister_entity(&vdev->entity); } #endif /* Do not call v4l2_device_put if there is no release callback set. * Drivers that have no v4l2_device release callback might free the * v4l2_dev instance in the video_device release callback below, so we * must perform this check here. * * TODO: In the long run all drivers that use v4l2_device should use the * v4l2_device release callback. This check will then be unnecessary. */ if (v4l2_dev->release == NULL) v4l2_dev = NULL; /* Release video_device and perform other cleanups as needed. */ vdev->release(vdev); /* Decrease v4l2_device refcount */ if (v4l2_dev) v4l2_device_put(v4l2_dev); } static struct class video_class = { .name = VIDEO_NAME, .dev_groups = video_device_groups, }; struct video_device *video_devdata(struct file *file) { return video_devices[iminor(file_inode(file))]; } EXPORT_SYMBOL(video_devdata); /* Priority handling */ static inline bool prio_is_valid(enum v4l2_priority prio) { return prio == V4L2_PRIORITY_BACKGROUND || prio == V4L2_PRIORITY_INTERACTIVE || prio == V4L2_PRIORITY_RECORD; } void v4l2_prio_init(struct v4l2_prio_state *global) { memset(global, 0, sizeof(*global)); } EXPORT_SYMBOL(v4l2_prio_init); int v4l2_prio_change(struct v4l2_prio_state *global, enum v4l2_priority *local, enum v4l2_priority new) { if (!prio_is_valid(new)) return -EINVAL; if (*local == new) return 0; atomic_inc(&global->prios[new]); if (prio_is_valid(*local)) atomic_dec(&global->prios[*local]); *local = new; return 0; } EXPORT_SYMBOL(v4l2_prio_change); void v4l2_prio_open(struct v4l2_prio_state *global, enum v4l2_priority *local) { v4l2_prio_change(global, local, V4L2_PRIORITY_DEFAULT); } EXPORT_SYMBOL(v4l2_prio_open); void v4l2_prio_close(struct v4l2_prio_state *global, enum v4l2_priority local) { if (prio_is_valid(local)) atomic_dec(&global->prios[local]); } EXPORT_SYMBOL(v4l2_prio_close); enum v4l2_priority v4l2_prio_max(struct v4l2_prio_state *global) { if (atomic_read(&global->prios[V4L2_PRIORITY_RECORD]) > 0) return V4L2_PRIORITY_RECORD; if (atomic_read(&global->prios[V4L2_PRIORITY_INTERACTIVE]) > 0) return V4L2_PRIORITY_INTERACTIVE; if (atomic_read(&global->prios[V4L2_PRIORITY_BACKGROUND]) > 0) return V4L2_PRIORITY_BACKGROUND; return V4L2_PRIORITY_UNSET; } EXPORT_SYMBOL(v4l2_prio_max); int v4l2_prio_check(struct v4l2_prio_state *global, enum v4l2_priority local) { return (local < v4l2_prio_max(global)) ? -EBUSY : 0; } EXPORT_SYMBOL(v4l2_prio_check); static ssize_t v4l2_read(struct file *filp, char __user *buf, size_t sz, loff_t *off) { struct video_device *vdev = video_devdata(filp); int ret = -ENODEV; if (!vdev->fops->read) return -EINVAL; if (video_is_registered(vdev)) ret = vdev->fops->read(filp, buf, sz, off); if ((vdev->dev_debug & V4L2_DEV_DEBUG_FOP) && (vdev->dev_debug & V4L2_DEV_DEBUG_STREAMING)) dprintk("%s: read: %zd (%d)\n", video_device_node_name(vdev), sz, ret); return ret; } static ssize_t v4l2_write(struct file *filp, const char __user *buf, size_t sz, loff_t *off) { struct video_device *vdev = video_devdata(filp); int ret = -ENODEV; if (!vdev->fops->write) return -EINVAL; if (video_is_registered(vdev)) ret = vdev->fops->write(filp, buf, sz, off); if ((vdev->dev_debug & V4L2_DEV_DEBUG_FOP) && (vdev->dev_debug & V4L2_DEV_DEBUG_STREAMING)) dprintk("%s: write: %zd (%d)\n", video_device_node_name(vdev), sz, ret); return ret; } static __poll_t v4l2_poll(struct file *filp, struct poll_table_struct *poll) { struct video_device *vdev = video_devdata(filp); __poll_t res = EPOLLERR | EPOLLHUP | EPOLLPRI; if (video_is_registered(vdev)) { if (!vdev->fops->poll) res = DEFAULT_POLLMASK; else res = vdev->fops->poll(filp, poll); } if (vdev->dev_debug & V4L2_DEV_DEBUG_POLL) dprintk("%s: poll: %08x %08x\n", video_device_node_name(vdev), res, poll_requested_events(poll)); return res; } static long v4l2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct video_device *vdev = video_devdata(filp); int ret = -ENODEV; if (vdev->fops->unlocked_ioctl) { if (video_is_registered(vdev)) ret = vdev->fops->unlocked_ioctl(filp, cmd, arg); } else ret = -ENOTTY; return ret; } #ifdef CONFIG_MMU #define v4l2_get_unmapped_area NULL #else static unsigned long v4l2_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct video_device *vdev = video_devdata(filp); int ret; if (!vdev->fops->get_unmapped_area) return -ENOSYS; if (!video_is_registered(vdev)) return -ENODEV; ret = vdev->fops->get_unmapped_area(filp, addr, len, pgoff, flags); if (vdev->dev_debug & V4L2_DEV_DEBUG_FOP) dprintk("%s: get_unmapped_area (%d)\n", video_device_node_name(vdev), ret); return ret; } #endif static int v4l2_mmap(struct file *filp, struct vm_area_struct *vm) { struct video_device *vdev = video_devdata(filp); int ret = -ENODEV; if (!vdev->fops->mmap) return -ENODEV; if (video_is_registered(vdev)) ret = vdev->fops->mmap(filp, vm); if (vdev->dev_debug & V4L2_DEV_DEBUG_FOP) dprintk("%s: mmap (%d)\n", video_device_node_name(vdev), ret); return ret; } /* Override for the open function */ static int v4l2_open(struct inode *inode, struct file *filp) { struct video_device *vdev; int ret = 0; /* Check if the video device is available */ mutex_lock(&videodev_lock); vdev = video_devdata(filp); /* return ENODEV if the video device has already been removed. */ if (vdev == NULL || !video_is_registered(vdev)) { mutex_unlock(&videodev_lock); return -ENODEV; } /* and increase the device refcount */ video_get(vdev); mutex_unlock(&videodev_lock); if (vdev->fops->open) { if (video_is_registered(vdev)) ret = vdev->fops->open(filp); else ret = -ENODEV; } if (vdev->dev_debug & V4L2_DEV_DEBUG_FOP) dprintk("%s: open (%d)\n", video_device_node_name(vdev), ret); /* decrease the refcount in case of an error */ if (ret) video_put(vdev); return ret; } /* Override for the release function */ static int v4l2_release(struct inode *inode, struct file *filp) { struct video_device *vdev = video_devdata(filp); int ret = 0; /* * We need to serialize the release() with queueing new requests. * The release() may trigger the cancellation of a streaming * operation, and that should not be mixed with queueing a new * request at the same time. */ if (vdev->fops->release) { if (v4l2_device_supports_requests(vdev->v4l2_dev)) { mutex_lock(&vdev->v4l2_dev->mdev->req_queue_mutex); ret = vdev->fops->release(filp); mutex_unlock(&vdev->v4l2_dev->mdev->req_queue_mutex); } else { ret = vdev->fops->release(filp); } } if (vdev->dev_debug & V4L2_DEV_DEBUG_FOP) dprintk("%s: release\n", video_device_node_name(vdev)); /* decrease the refcount unconditionally since the release() return value is ignored. */ video_put(vdev); return ret; } static const struct file_operations v4l2_fops = { .owner = THIS_MODULE, .read = v4l2_read, .write = v4l2_write, .open = v4l2_open, .get_unmapped_area = v4l2_get_unmapped_area, .mmap = v4l2_mmap, .unlocked_ioctl = v4l2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = v4l2_compat_ioctl32, #endif .release = v4l2_release, .poll = v4l2_poll, .llseek = no_llseek, }; /** * get_index - assign stream index number based on v4l2_dev * @vdev: video_device to assign index number to, vdev->v4l2_dev should be assigned * * Note that when this is called the new device has not yet been registered * in the video_device array, but it was able to obtain a minor number. * * This means that we can always obtain a free stream index number since * the worst case scenario is that there are VIDEO_NUM_DEVICES - 1 slots in * use of the video_device array. * * Returns a free index number. */ static int get_index(struct video_device *vdev) { /* This can be static since this function is called with the global videodev_lock held. */ static DECLARE_BITMAP(used, VIDEO_NUM_DEVICES); int i; bitmap_zero(used, VIDEO_NUM_DEVICES); for (i = 0; i < VIDEO_NUM_DEVICES; i++) { if (video_devices[i] != NULL && video_devices[i]->v4l2_dev == vdev->v4l2_dev) { __set_bit(video_devices[i]->index, used); } } return find_first_zero_bit(used, VIDEO_NUM_DEVICES); } #define SET_VALID_IOCTL(ops, cmd, op) \ do { if ((ops)->op) __set_bit(_IOC_NR(cmd), valid_ioctls); } while (0) /* This determines which ioctls are actually implemented in the driver. It's a one-time thing which simplifies video_ioctl2 as it can just do a bit test. Note that drivers can override this by setting bits to 1 in vdev->valid_ioctls. If an ioctl is marked as 1 when this function is called, then that ioctl will actually be marked as unimplemented. It does that by first setting up the local valid_ioctls bitmap, and at the end do a: vdev->valid_ioctls = valid_ioctls & ~(vdev->valid_ioctls) */ static void determine_valid_ioctls(struct video_device *vdev) { const u32 vid_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_CAPTURE_MPLANE | V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_VIDEO_OUTPUT_MPLANE | V4L2_CAP_VIDEO_M2M | V4L2_CAP_VIDEO_M2M_MPLANE; const u32 meta_caps = V4L2_CAP_META_CAPTURE | V4L2_CAP_META_OUTPUT; DECLARE_BITMAP(valid_ioctls, BASE_VIDIOC_PRIVATE); const struct v4l2_ioctl_ops *ops = vdev->ioctl_ops; bool is_vid = vdev->vfl_type == VFL_TYPE_VIDEO && (vdev->device_caps & vid_caps); bool is_vbi = vdev->vfl_type == VFL_TYPE_VBI; bool is_radio = vdev->vfl_type == VFL_TYPE_RADIO; bool is_sdr = vdev->vfl_type == VFL_TYPE_SDR; bool is_tch = vdev->vfl_type == VFL_TYPE_TOUCH; bool is_meta = vdev->vfl_type == VFL_TYPE_VIDEO && (vdev->device_caps & meta_caps); bool is_rx = vdev->vfl_dir != VFL_DIR_TX; bool is_tx = vdev->vfl_dir != VFL_DIR_RX; bool is_io_mc = vdev->device_caps & V4L2_CAP_IO_MC; bool has_streaming = vdev->device_caps & V4L2_CAP_STREAMING; bitmap_zero(valid_ioctls, BASE_VIDIOC_PRIVATE); /* vfl_type and vfl_dir independent ioctls */ SET_VALID_IOCTL(ops, VIDIOC_QUERYCAP, vidioc_querycap); __set_bit(_IOC_NR(VIDIOC_G_PRIORITY), valid_ioctls); __set_bit(_IOC_NR(VIDIOC_S_PRIORITY), valid_ioctls); /* Note: the control handler can also be passed through the filehandle, and that can't be tested here. If the bit for these control ioctls is set, then the ioctl is valid. But if it is 0, then it can still be valid if the filehandle passed the control handler. */ if (vdev->ctrl_handler || ops->vidioc_queryctrl) __set_bit(_IOC_NR(VIDIOC_QUERYCTRL), valid_ioctls); if (vdev->ctrl_handler || ops->vidioc_query_ext_ctrl) __set_bit(_IOC_NR(VIDIOC_QUERY_EXT_CTRL), valid_ioctls); if (vdev->ctrl_handler || ops->vidioc_g_ctrl || ops->vidioc_g_ext_ctrls) __set_bit(_IOC_NR(VIDIOC_G_CTRL), valid_ioctls); if (vdev->ctrl_handler || ops->vidioc_s_ctrl || ops->vidioc_s_ext_ctrls) __set_bit(_IOC_NR(VIDIOC_S_CTRL), valid_ioctls); if (vdev->ctrl_handler || ops->vidioc_g_ext_ctrls) __set_bit(_IOC_NR(VIDIOC_G_EXT_CTRLS), valid_ioctls); if (vdev->ctrl_handler || ops->vidioc_s_ext_ctrls) __set_bit(_IOC_NR(VIDIOC_S_EXT_CTRLS), valid_ioctls); if (vdev->ctrl_handler || ops->vidioc_try_ext_ctrls) __set_bit(_IOC_NR(VIDIOC_TRY_EXT_CTRLS), valid_ioctls); if (vdev->ctrl_handler || ops->vidioc_querymenu) __set_bit(_IOC_NR(VIDIOC_QUERYMENU), valid_ioctls); if (!is_tch) { SET_VALID_IOCTL(ops, VIDIOC_G_FREQUENCY, vidioc_g_frequency); SET_VALID_IOCTL(ops, VIDIOC_S_FREQUENCY, vidioc_s_frequency); } SET_VALID_IOCTL(ops, VIDIOC_LOG_STATUS, vidioc_log_status); #ifdef CONFIG_VIDEO_ADV_DEBUG __set_bit(_IOC_NR(VIDIOC_DBG_G_CHIP_INFO), valid_ioctls); __set_bit(_IOC_NR(VIDIOC_DBG_G_REGISTER), valid_ioctls); __set_bit(_IOC_NR(VIDIOC_DBG_S_REGISTER), valid_ioctls); #endif /* yes, really vidioc_subscribe_event */ SET_VALID_IOCTL(ops, VIDIOC_DQEVENT, vidioc_subscribe_event); SET_VALID_IOCTL(ops, VIDIOC_SUBSCRIBE_EVENT, vidioc_subscribe_event); SET_VALID_IOCTL(ops, VIDIOC_UNSUBSCRIBE_EVENT, vidioc_unsubscribe_event); if (ops->vidioc_enum_freq_bands || ops->vidioc_g_tuner || ops->vidioc_g_modulator) __set_bit(_IOC_NR(VIDIOC_ENUM_FREQ_BANDS), valid_ioctls); if (is_vid) { /* video specific ioctls */ if ((is_rx && (ops->vidioc_enum_fmt_vid_cap || ops->vidioc_enum_fmt_vid_overlay)) || (is_tx && ops->vidioc_enum_fmt_vid_out)) __set_bit(_IOC_NR(VIDIOC_ENUM_FMT), valid_ioctls); if ((is_rx && (ops->vidioc_g_fmt_vid_cap || ops->vidioc_g_fmt_vid_cap_mplane || ops->vidioc_g_fmt_vid_overlay)) || (is_tx && (ops->vidioc_g_fmt_vid_out || ops->vidioc_g_fmt_vid_out_mplane || ops->vidioc_g_fmt_vid_out_overlay))) __set_bit(_IOC_NR(VIDIOC_G_FMT), valid_ioctls); if ((is_rx && (ops->vidioc_s_fmt_vid_cap || ops->vidioc_s_fmt_vid_cap_mplane || ops->vidioc_s_fmt_vid_overlay)) || (is_tx && (ops->vidioc_s_fmt_vid_out || ops->vidioc_s_fmt_vid_out_mplane || ops->vidioc_s_fmt_vid_out_overlay))) __set_bit(_IOC_NR(VIDIOC_S_FMT), valid_ioctls); if ((is_rx && (ops->vidioc_try_fmt_vid_cap || ops->vidioc_try_fmt_vid_cap_mplane || ops->vidioc_try_fmt_vid_overlay)) || (is_tx && (ops->vidioc_try_fmt_vid_out || ops->vidioc_try_fmt_vid_out_mplane || ops->vidioc_try_fmt_vid_out_overlay))) __set_bit(_IOC_NR(VIDIOC_TRY_FMT), valid_ioctls); SET_VALID_IOCTL(ops, VIDIOC_OVERLAY, vidioc_overlay); SET_VALID_IOCTL(ops, VIDIOC_G_FBUF, vidioc_g_fbuf); SET_VALID_IOCTL(ops, VIDIOC_S_FBUF, vidioc_s_fbuf); SET_VALID_IOCTL(ops, VIDIOC_G_JPEGCOMP, vidioc_g_jpegcomp); SET_VALID_IOCTL(ops, VIDIOC_S_JPEGCOMP, vidioc_s_jpegcomp); SET_VALID_IOCTL(ops, VIDIOC_G_ENC_INDEX, vidioc_g_enc_index); SET_VALID_IOCTL(ops, VIDIOC_ENCODER_CMD, vidioc_encoder_cmd); SET_VALID_IOCTL(ops, VIDIOC_TRY_ENCODER_CMD, vidioc_try_encoder_cmd); SET_VALID_IOCTL(ops, VIDIOC_DECODER_CMD, vidioc_decoder_cmd); SET_VALID_IOCTL(ops, VIDIOC_TRY_DECODER_CMD, vidioc_try_decoder_cmd); SET_VALID_IOCTL(ops, VIDIOC_ENUM_FRAMESIZES, vidioc_enum_framesizes); SET_VALID_IOCTL(ops, VIDIOC_ENUM_FRAMEINTERVALS, vidioc_enum_frameintervals); if (ops->vidioc_g_selection && !test_bit(_IOC_NR(VIDIOC_G_SELECTION), vdev->valid_ioctls)) { __set_bit(_IOC_NR(VIDIOC_G_CROP), valid_ioctls); __set_bit(_IOC_NR(VIDIOC_CROPCAP), valid_ioctls); } if (ops->vidioc_s_selection && !test_bit(_IOC_NR(VIDIOC_S_SELECTION), vdev->valid_ioctls)) __set_bit(_IOC_NR(VIDIOC_S_CROP), valid_ioctls); SET_VALID_IOCTL(ops, VIDIOC_G_SELECTION, vidioc_g_selection); SET_VALID_IOCTL(ops, VIDIOC_S_SELECTION, vidioc_s_selection); } if (is_meta && is_rx) { /* metadata capture specific ioctls */ SET_VALID_IOCTL(ops, VIDIOC_ENUM_FMT, vidioc_enum_fmt_meta_cap); SET_VALID_IOCTL(ops, VIDIOC_G_FMT, vidioc_g_fmt_meta_cap); SET_VALID_IOCTL(ops, VIDIOC_S_FMT, vidioc_s_fmt_meta_cap); SET_VALID_IOCTL(ops, VIDIOC_TRY_FMT, vidioc_try_fmt_meta_cap); } else if (is_meta && is_tx) { /* metadata output specific ioctls */ SET_VALID_IOCTL(ops, VIDIOC_ENUM_FMT, vidioc_enum_fmt_meta_out); SET_VALID_IOCTL(ops, VIDIOC_G_FMT, vidioc_g_fmt_meta_out); SET_VALID_IOCTL(ops, VIDIOC_S_FMT, vidioc_s_fmt_meta_out); SET_VALID_IOCTL(ops, VIDIOC_TRY_FMT, vidioc_try_fmt_meta_out); } if (is_vbi) { /* vbi specific ioctls */ if ((is_rx && (ops->vidioc_g_fmt_vbi_cap || ops->vidioc_g_fmt_sliced_vbi_cap)) || (is_tx && (ops->vidioc_g_fmt_vbi_out || ops->vidioc_g_fmt_sliced_vbi_out))) __set_bit(_IOC_NR(VIDIOC_G_FMT), valid_ioctls); if ((is_rx && (ops->vidioc_s_fmt_vbi_cap || ops->vidioc_s_fmt_sliced_vbi_cap)) || (is_tx && (ops->vidioc_s_fmt_vbi_out || ops->vidioc_s_fmt_sliced_vbi_out))) __set_bit(_IOC_NR(VIDIOC_S_FMT), valid_ioctls); if ((is_rx && (ops->vidioc_try_fmt_vbi_cap || ops->vidioc_try_fmt_sliced_vbi_cap)) || (is_tx && (ops->vidioc_try_fmt_vbi_out || ops->vidioc_try_fmt_sliced_vbi_out))) __set_bit(_IOC_NR(VIDIOC_TRY_FMT), valid_ioctls); SET_VALID_IOCTL(ops, VIDIOC_G_SLICED_VBI_CAP, vidioc_g_sliced_vbi_cap); } else if (is_tch) { /* touch specific ioctls */ SET_VALID_IOCTL(ops, VIDIOC_ENUM_FMT, vidioc_enum_fmt_vid_cap); SET_VALID_IOCTL(ops, VIDIOC_G_FMT, vidioc_g_fmt_vid_cap); SET_VALID_IOCTL(ops, VIDIOC_S_FMT, vidioc_s_fmt_vid_cap); SET_VALID_IOCTL(ops, VIDIOC_TRY_FMT, vidioc_try_fmt_vid_cap); SET_VALID_IOCTL(ops, VIDIOC_ENUM_FRAMESIZES, vidioc_enum_framesizes); SET_VALID_IOCTL(ops, VIDIOC_ENUM_FRAMEINTERVALS, vidioc_enum_frameintervals); SET_VALID_IOCTL(ops, VIDIOC_ENUMINPUT, vidioc_enum_input); SET_VALID_IOCTL(ops, VIDIOC_G_INPUT, vidioc_g_input); SET_VALID_IOCTL(ops, VIDIOC_S_INPUT, vidioc_s_input); SET_VALID_IOCTL(ops, VIDIOC_G_PARM, vidioc_g_parm); SET_VALID_IOCTL(ops, VIDIOC_S_PARM, vidioc_s_parm); } else if (is_sdr && is_rx) { /* SDR receiver specific ioctls */ SET_VALID_IOCTL(ops, VIDIOC_ENUM_FMT, vidioc_enum_fmt_sdr_cap); SET_VALID_IOCTL(ops, VIDIOC_G_FMT, vidioc_g_fmt_sdr_cap); SET_VALID_IOCTL(ops, VIDIOC_S_FMT, vidioc_s_fmt_sdr_cap); SET_VALID_IOCTL(ops, VIDIOC_TRY_FMT, vidioc_try_fmt_sdr_cap); } else if (is_sdr && is_tx) { /* SDR transmitter specific ioctls */ SET_VALID_IOCTL(ops, VIDIOC_ENUM_FMT, vidioc_enum_fmt_sdr_out); SET_VALID_IOCTL(ops, VIDIOC_G_FMT, vidioc_g_fmt_sdr_out); SET_VALID_IOCTL(ops, VIDIOC_S_FMT, vidioc_s_fmt_sdr_out); SET_VALID_IOCTL(ops, VIDIOC_TRY_FMT, vidioc_try_fmt_sdr_out); } if (has_streaming) { /* ioctls valid for streaming I/O */ SET_VALID_IOCTL(ops, VIDIOC_REQBUFS, vidioc_reqbufs); SET_VALID_IOCTL(ops, VIDIOC_QUERYBUF, vidioc_querybuf); SET_VALID_IOCTL(ops, VIDIOC_QBUF, vidioc_qbuf); SET_VALID_IOCTL(ops, VIDIOC_EXPBUF, vidioc_expbuf); SET_VALID_IOCTL(ops, VIDIOC_DQBUF, vidioc_dqbuf); SET_VALID_IOCTL(ops, VIDIOC_CREATE_BUFS, vidioc_create_bufs); SET_VALID_IOCTL(ops, VIDIOC_PREPARE_BUF, vidioc_prepare_buf); SET_VALID_IOCTL(ops, VIDIOC_STREAMON, vidioc_streamon); SET_VALID_IOCTL(ops, VIDIOC_STREAMOFF, vidioc_streamoff); } if (is_vid || is_vbi || is_meta) { /* ioctls valid for video, vbi and metadata */ if (ops->vidioc_s_std) __set_bit(_IOC_NR(VIDIOC_ENUMSTD), valid_ioctls); SET_VALID_IOCTL(ops, VIDIOC_S_STD, vidioc_s_std); SET_VALID_IOCTL(ops, VIDIOC_G_STD, vidioc_g_std); if (is_rx) { SET_VALID_IOCTL(ops, VIDIOC_QUERYSTD, vidioc_querystd); if (is_io_mc) { __set_bit(_IOC_NR(VIDIOC_ENUMINPUT), valid_ioctls); __set_bit(_IOC_NR(VIDIOC_G_INPUT), valid_ioctls); __set_bit(_IOC_NR(VIDIOC_S_INPUT), valid_ioctls); } else { SET_VALID_IOCTL(ops, VIDIOC_ENUMINPUT, vidioc_enum_input); SET_VALID_IOCTL(ops, VIDIOC_G_INPUT, vidioc_g_input); SET_VALID_IOCTL(ops, VIDIOC_S_INPUT, vidioc_s_input); } SET_VALID_IOCTL(ops, VIDIOC_ENUMAUDIO, vidioc_enumaudio); SET_VALID_IOCTL(ops, VIDIOC_G_AUDIO, vidioc_g_audio); SET_VALID_IOCTL(ops, VIDIOC_S_AUDIO, vidioc_s_audio); SET_VALID_IOCTL(ops, VIDIOC_QUERY_DV_TIMINGS, vidioc_query_dv_timings); SET_VALID_IOCTL(ops, VIDIOC_S_EDID, vidioc_s_edid); } if (is_tx) { if (is_io_mc) { __set_bit(_IOC_NR(VIDIOC_ENUMOUTPUT), valid_ioctls); __set_bit(_IOC_NR(VIDIOC_G_OUTPUT), valid_ioctls); __set_bit(_IOC_NR(VIDIOC_S_OUTPUT), valid_ioctls); } else { SET_VALID_IOCTL(ops, VIDIOC_ENUMOUTPUT, vidioc_enum_output); SET_VALID_IOCTL(ops, VIDIOC_G_OUTPUT, vidioc_g_output); SET_VALID_IOCTL(ops, VIDIOC_S_OUTPUT, vidioc_s_output); } SET_VALID_IOCTL(ops, VIDIOC_ENUMAUDOUT, vidioc_enumaudout); SET_VALID_IOCTL(ops, VIDIOC_G_AUDOUT, vidioc_g_audout); SET_VALID_IOCTL(ops, VIDIOC_S_AUDOUT, vidioc_s_audout); } if (ops->vidioc_g_parm || ops->vidioc_g_std) __set_bit(_IOC_NR(VIDIOC_G_PARM), valid_ioctls); SET_VALID_IOCTL(ops, VIDIOC_S_PARM, vidioc_s_parm); SET_VALID_IOCTL(ops, VIDIOC_S_DV_TIMINGS, vidioc_s_dv_timings); SET_VALID_IOCTL(ops, VIDIOC_G_DV_TIMINGS, vidioc_g_dv_timings); SET_VALID_IOCTL(ops, VIDIOC_ENUM_DV_TIMINGS, vidioc_enum_dv_timings); SET_VALID_IOCTL(ops, VIDIOC_DV_TIMINGS_CAP, vidioc_dv_timings_cap); SET_VALID_IOCTL(ops, VIDIOC_G_EDID, vidioc_g_edid); } if (is_tx && (is_radio || is_sdr)) { /* radio transmitter only ioctls */ SET_VALID_IOCTL(ops, VIDIOC_G_MODULATOR, vidioc_g_modulator); SET_VALID_IOCTL(ops, VIDIOC_S_MODULATOR, vidioc_s_modulator); } if (is_rx && !is_tch) { /* receiver only ioctls */ SET_VALID_IOCTL(ops, VIDIOC_G_TUNER, vidioc_g_tuner); SET_VALID_IOCTL(ops, VIDIOC_S_TUNER, vidioc_s_tuner); SET_VALID_IOCTL(ops, VIDIOC_S_HW_FREQ_SEEK, vidioc_s_hw_freq_seek); } bitmap_andnot(vdev->valid_ioctls, valid_ioctls, vdev->valid_ioctls, BASE_VIDIOC_PRIVATE); } static int video_register_media_controller(struct video_device *vdev) { #if defined(CONFIG_MEDIA_CONTROLLER) u32 intf_type; int ret; /* Memory-to-memory devices are more complex and use * their own function to register its mc entities. */ if (!vdev->v4l2_dev->mdev || vdev->vfl_dir == VFL_DIR_M2M) return 0; vdev->entity.obj_type = MEDIA_ENTITY_TYPE_VIDEO_DEVICE; vdev->entity.function = MEDIA_ENT_F_UNKNOWN; switch (vdev->vfl_type) { case VFL_TYPE_VIDEO: intf_type = MEDIA_INTF_T_V4L_VIDEO; vdev->entity.function = MEDIA_ENT_F_IO_V4L; break; case VFL_TYPE_VBI: intf_type = MEDIA_INTF_T_V4L_VBI; vdev->entity.function = MEDIA_ENT_F_IO_VBI; break; case VFL_TYPE_SDR: intf_type = MEDIA_INTF_T_V4L_SWRADIO; vdev->entity.function = MEDIA_ENT_F_IO_SWRADIO; break; case VFL_TYPE_TOUCH: intf_type = MEDIA_INTF_T_V4L_TOUCH; vdev->entity.function = MEDIA_ENT_F_IO_V4L; break; case VFL_TYPE_RADIO: intf_type = MEDIA_INTF_T_V4L_RADIO; /* * Radio doesn't have an entity at the V4L2 side to represent * radio input or output. Instead, the audio input/output goes * via either physical wires or ALSA. */ break; case VFL_TYPE_SUBDEV: intf_type = MEDIA_INTF_T_V4L_SUBDEV; /* Entity will be created via v4l2_device_register_subdev() */ break; default: return 0; } if (vdev->entity.function != MEDIA_ENT_F_UNKNOWN) { vdev->entity.name = vdev->name; /* Needed just for backward compatibility with legacy MC API */ vdev->entity.info.dev.major = VIDEO_MAJOR; vdev->entity.info.dev.minor = vdev->minor; ret = media_device_register_entity(vdev->v4l2_dev->mdev, &vdev->entity); if (ret < 0) { pr_warn("%s: media_device_register_entity failed\n", __func__); return ret; } } vdev->intf_devnode = media_devnode_create(vdev->v4l2_dev->mdev, intf_type, 0, VIDEO_MAJOR, vdev->minor); if (!vdev->intf_devnode) { media_device_unregister_entity(&vdev->entity); return -ENOMEM; } if (vdev->entity.function != MEDIA_ENT_F_UNKNOWN) { struct media_link *link; link = media_create_intf_link(&vdev->entity, &vdev->intf_devnode->intf, MEDIA_LNK_FL_ENABLED | MEDIA_LNK_FL_IMMUTABLE); if (!link) { media_devnode_remove(vdev->intf_devnode); media_device_unregister_entity(&vdev->entity); return -ENOMEM; } } /* FIXME: how to create the other interface links? */ #endif return 0; } int __video_register_device(struct video_device *vdev, enum vfl_devnode_type type, int nr, int warn_if_nr_in_use, struct module *owner) { int i = 0; int ret; int minor_offset = 0; int minor_cnt = VIDEO_NUM_DEVICES; const char *name_base; /* A minor value of -1 marks this video device as never having been registered */ vdev->minor = -1; /* the release callback MUST be present */ if (WARN_ON(!vdev->release)) return -EINVAL; /* the v4l2_dev pointer MUST be present */ if (WARN_ON(!vdev->v4l2_dev)) return -EINVAL; /* the device_caps field MUST be set for all but subdevs */ if (WARN_ON(type != VFL_TYPE_SUBDEV && !vdev->device_caps)) return -EINVAL; /* v4l2_fh support */ spin_lock_init(&vdev->fh_lock); INIT_LIST_HEAD(&vdev->fh_list); /* Part 1: check device type */ switch (type) { case VFL_TYPE_VIDEO: name_base = "video"; break; case VFL_TYPE_VBI: name_base = "vbi"; break; case VFL_TYPE_RADIO: name_base = "radio"; break; case VFL_TYPE_SUBDEV: name_base = "v4l-subdev"; break; case VFL_TYPE_SDR: /* Use device name 'swradio' because 'sdr' was already taken. */ name_base = "swradio"; break; case VFL_TYPE_TOUCH: name_base = "v4l-touch"; break; default: pr_err("%s called with unknown type: %d\n", __func__, type); return -EINVAL; } vdev->vfl_type = type; vdev->cdev = NULL; if (vdev->dev_parent == NULL) vdev->dev_parent = vdev->v4l2_dev->dev; if (vdev->ctrl_handler == NULL) vdev->ctrl_handler = vdev->v4l2_dev->ctrl_handler; /* If the prio state pointer is NULL, then use the v4l2_device prio state. */ if (vdev->prio == NULL) vdev->prio = &vdev->v4l2_dev->prio; /* Part 2: find a free minor, device node number and device index. */ #ifdef CONFIG_VIDEO_FIXED_MINOR_RANGES /* Keep the ranges for the first four types for historical * reasons. * Newer devices (not yet in place) should use the range * of 128-191 and just pick the first free minor there * (new style). */ switch (type) { case VFL_TYPE_VIDEO: minor_offset = 0; minor_cnt = 64; break; case VFL_TYPE_RADIO: minor_offset = 64; minor_cnt = 64; break; case VFL_TYPE_VBI: minor_offset = 224; minor_cnt = 32; break; default: minor_offset = 128; minor_cnt = 64; break; } #endif /* Pick a device node number */ mutex_lock(&videodev_lock); nr = devnode_find(vdev, nr == -1 ? 0 : nr, minor_cnt); if (nr == minor_cnt) nr = devnode_find(vdev, 0, minor_cnt); if (nr == minor_cnt) { pr_err("could not get a free device node number\n"); mutex_unlock(&videodev_lock); return -ENFILE; } #ifdef CONFIG_VIDEO_FIXED_MINOR_RANGES /* 1-on-1 mapping of device node number to minor number */ i = nr; #else /* The device node number and minor numbers are independent, so we just find the first free minor number. */ for (i = 0; i < VIDEO_NUM_DEVICES; i++) if (video_devices[i] == NULL) break; if (i == VIDEO_NUM_DEVICES) { mutex_unlock(&videodev_lock); pr_err("could not get a free minor\n"); return -ENFILE; } #endif vdev->minor = i + minor_offset; vdev->num = nr; /* Should not happen since we thought this minor was free */ if (WARN_ON(video_devices[vdev->minor])) { mutex_unlock(&videodev_lock); pr_err("video_device not empty!\n"); return -ENFILE; } devnode_set(vdev); vdev->index = get_index(vdev); video_devices[vdev->minor] = vdev; mutex_unlock(&videodev_lock); if (vdev->ioctl_ops) determine_valid_ioctls(vdev); /* Part 3: Initialize the character device */ vdev->cdev = cdev_alloc(); if (vdev->cdev == NULL) { ret = -ENOMEM; goto cleanup; } vdev->cdev->ops = &v4l2_fops; vdev->cdev->owner = owner; ret = cdev_add(vdev->cdev, MKDEV(VIDEO_MAJOR, vdev->minor), 1); if (ret < 0) { pr_err("%s: cdev_add failed\n", __func__); kfree(vdev->cdev); vdev->cdev = NULL; goto cleanup; } /* Part 4: register the device with sysfs */ vdev->dev.class = &video_class; vdev->dev.devt = MKDEV(VIDEO_MAJOR, vdev->minor); vdev->dev.parent = vdev->dev_parent; dev_set_name(&vdev->dev, "%s%d", name_base, vdev->num); ret = device_register(&vdev->dev); if (ret < 0) { pr_err("%s: device_register failed\n", __func__); goto cleanup; } /* Register the release callback that will be called when the last reference to the device goes away. */ vdev->dev.release = v4l2_device_release; if (nr != -1 && nr != vdev->num && warn_if_nr_in_use) pr_warn("%s: requested %s%d, got %s\n", __func__, name_base, nr, video_device_node_name(vdev)); /* Increase v4l2_device refcount */ v4l2_device_get(vdev->v4l2_dev); /* Part 5: Register the entity. */ ret = video_register_media_controller(vdev); /* Part 6: Activate this minor. The char device can now be used. */ set_bit(V4L2_FL_REGISTERED, &vdev->flags); return 0; cleanup: mutex_lock(&videodev_lock); if (vdev->cdev) cdev_del(vdev->cdev); video_devices[vdev->minor] = NULL; devnode_clear(vdev); mutex_unlock(&videodev_lock); /* Mark this video device as never having been registered. */ vdev->minor = -1; return ret; } EXPORT_SYMBOL(__video_register_device); /** * video_unregister_device - unregister a video4linux device * @vdev: the device to unregister * * This unregisters the passed device. Future open calls will * be met with errors. */ void video_unregister_device(struct video_device *vdev) { /* Check if vdev was ever registered at all */ if (!vdev || !video_is_registered(vdev)) return; mutex_lock(&videodev_lock); /* This must be in a critical section to prevent a race with v4l2_open. * Once this bit has been cleared video_get may never be called again. */ clear_bit(V4L2_FL_REGISTERED, &vdev->flags); mutex_unlock(&videodev_lock); if (test_bit(V4L2_FL_USES_V4L2_FH, &vdev->flags)) v4l2_event_wake_all(vdev); device_unregister(&vdev->dev); } EXPORT_SYMBOL(video_unregister_device); #if defined(CONFIG_MEDIA_CONTROLLER) __must_check int video_device_pipeline_start(struct video_device *vdev, struct media_pipeline *pipe) { struct media_entity *entity = &vdev->entity; if (entity->num_pads != 1) return -ENODEV; return media_pipeline_start(&entity->pads[0], pipe); } EXPORT_SYMBOL_GPL(video_device_pipeline_start); __must_check int __video_device_pipeline_start(struct video_device *vdev, struct media_pipeline *pipe) { struct media_entity *entity = &vdev->entity; if (entity->num_pads != 1) return -ENODEV; return __media_pipeline_start(&entity->pads[0], pipe); } EXPORT_SYMBOL_GPL(__video_device_pipeline_start); void video_device_pipeline_stop(struct video_device *vdev) { struct media_entity *entity = &vdev->entity; if (WARN_ON(entity->num_pads != 1)) return; return media_pipeline_stop(&entity->pads[0]); } EXPORT_SYMBOL_GPL(video_device_pipeline_stop); void __video_device_pipeline_stop(struct video_device *vdev) { struct media_entity *entity = &vdev->entity; if (WARN_ON(entity->num_pads != 1)) return; return __media_pipeline_stop(&entity->pads[0]); } EXPORT_SYMBOL_GPL(__video_device_pipeline_stop); __must_check int video_device_pipeline_alloc_start(struct video_device *vdev) { struct media_entity *entity = &vdev->entity; if (entity->num_pads != 1) return -ENODEV; return media_pipeline_alloc_start(&entity->pads[0]); } EXPORT_SYMBOL_GPL(video_device_pipeline_alloc_start); struct media_pipeline *video_device_pipeline(struct video_device *vdev) { struct media_entity *entity = &vdev->entity; if (WARN_ON(entity->num_pads != 1)) return NULL; return media_pad_pipeline(&entity->pads[0]); } EXPORT_SYMBOL_GPL(video_device_pipeline); #endif /* CONFIG_MEDIA_CONTROLLER */ /* * Initialise video for linux */ static int __init videodev_init(void) { dev_t dev = MKDEV(VIDEO_MAJOR, 0); int ret; pr_info("Linux video capture interface: v2.00\n"); ret = register_chrdev_region(dev, VIDEO_NUM_DEVICES, VIDEO_NAME); if (ret < 0) { pr_warn("videodev: unable to get major %d\n", VIDEO_MAJOR); return ret; } ret = class_register(&video_class); if (ret < 0) { unregister_chrdev_region(dev, VIDEO_NUM_DEVICES); pr_warn("video_dev: class_register failed\n"); return -EIO; } return 0; } static void __exit videodev_exit(void) { dev_t dev = MKDEV(VIDEO_MAJOR, 0); class_unregister(&video_class); unregister_chrdev_region(dev, VIDEO_NUM_DEVICES); } subsys_initcall(videodev_init); module_exit(videodev_exit) MODULE_AUTHOR("Alan Cox, Mauro Carvalho Chehab <mchehab@kernel.org>, Bill Dirks, Justin Schoeman, Gerd Knorr"); MODULE_DESCRIPTION("Video4Linux2 core driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CHARDEV_MAJOR(VIDEO_MAJOR); |
9 6 3 12 12 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ #include "gateway_common.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/stddef.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "gateway_client.h" #include "tvlv.h" /** * batadv_gw_tvlv_container_update() - update the gw tvlv container after * gateway setting change * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) { struct batadv_tvlv_gateway_data gw; u32 down, up; char gw_mode; gw_mode = atomic_read(&bat_priv->gw.mode); switch (gw_mode) { case BATADV_GW_MODE_OFF: case BATADV_GW_MODE_CLIENT: batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_GW, 1); break; case BATADV_GW_MODE_SERVER: down = atomic_read(&bat_priv->gw.bandwidth_down); up = atomic_read(&bat_priv->gw.bandwidth_up); gw.bandwidth_down = htonl(down); gw.bandwidth_up = htonl(up); batadv_tvlv_container_register(bat_priv, BATADV_TVLV_GW, 1, &gw, sizeof(gw)); break; } } /** * batadv_gw_tvlv_ogm_handler_v1() - process incoming gateway tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data * @tvlv_value_len: tvlv buffer length */ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_gateway_data gateway, *gateway_ptr; /* only fetch the tvlv value if the handler wasn't called via the * CIFNOTFND flag and if there is data to fetch */ if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND || tvlv_value_len < sizeof(gateway)) { gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; } else { gateway_ptr = tvlv_value; gateway.bandwidth_down = gateway_ptr->bandwidth_down; gateway.bandwidth_up = gateway_ptr->bandwidth_up; if (gateway.bandwidth_down == 0 || gateway.bandwidth_up == 0) { gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; } } batadv_gw_node_update(bat_priv, orig, &gateway); /* restart gateway selection */ if (gateway.bandwidth_down != 0 && atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) batadv_gw_check_election(bat_priv, orig); } /** * batadv_gw_init() - initialise the gateway handling internals * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_init(struct batadv_priv *bat_priv) { if (bat_priv->algo_ops->gw.init_sel_class) bat_priv->algo_ops->gw.init_sel_class(bat_priv); else atomic_set(&bat_priv->gw.sel_class, 1); batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1, NULL, NULL, BATADV_TVLV_GW, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); } /** * batadv_gw_free() - free the gateway handling internals * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_free(struct batadv_priv *bat_priv) { batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_GW, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_GW, 1); } |
1 1339 1338 123 122 3 1 24 24 24 24 103 103 103 103 103 1 1 62 1 6 6 18 15 1 6 2 8 14 18 18 16 28 16 3 1 1 1 1 1 1 3 2 21 7 29 28 29 16 2 2 1 12 1 1 11 12 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> #include <uapi/linux/wait.h> #include <asm/unistd.h> #include <asm/mmu_context.h> #include "exit.h" /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. */ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, { } }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_TGID); detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) posix_cpu_timers_exit_group(tsk); #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); } } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void put_task_struct_rcu_user(struct task_struct *task) { if (refcount_dec_and_test(&task->rcu_users)) call_rcu(&task->rcu, delayed_put_task_struct); } void __weak release_thread(struct task_struct *dead_task) { } void release_task(struct task_struct *p) { struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); thread_pid = get_pid(p->thread_pid); __exit_signal(p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } int rcuwait_wake_up(struct rcuwait *w) { int ret = 0; struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ task = rcu_dereference(w->task); if (task) ret = wake_up_process(task); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } static void coredump_task_exit(struct task_struct *tsk) { struct core_state *core_state; /* * Serialize with any possible pending coredump. * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. */ spin_lock_irq(&tsk->sighand->siglock); tsk->flags |= PF_POSTCOREDUMP; core_state = tsk->signal->core_state; spin_unlock_irq(&tsk->sighand->siglock); /* The vhost_worker does not particpate in coredumps */ if (core_state && ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) { struct core_thread self; self.task = current; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_current_state(TASK_RUNNING); } } #ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *c, *g, *p = current; retry: /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { WRITE_ONCE(mm->owner, NULL); return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(c, &p->children, sibling) { if (c->mm == mm) goto assign_new_owner; } /* * Search in the siblings */ list_for_each_entry(c, &p->real_parent->children, sibling) { if (c->mm == mm) goto assign_new_owner; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (g->flags & PF_KTHREAD) continue; for_each_thread(g, c) { if (c->mm == mm) goto assign_new_owner; if (c->mm) break; } } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); /* * Delay read_unlock() till we have the task_lock() * to ensure that c does not slip away underneath us */ read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); put_task_struct(c); goto retry; } WRITE_ONCE(mm->owner, c); lru_gen_migrate_mm(mm); task_unlock(c); put_task_struct(c); } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; exit_mm_release(current, mm); if (!mm) return; mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); /* * When a thread stops operating on an address space, the loop * in membarrier_private_expedited() may not observe that * tsk->mm, and the loop in membarrier_global_expedited() may * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED * rq->membarrier_state, so those would not issue an IPI. * Membarrier requires a memory barrier after accessing * user-space memory, before clearing tsk->mm or the * rq->membarrier_state. */ smp_mb__after_spinlock(); local_irq_disable(); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); local_irq_enable(); task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { pid_ns->child_reaper = reaper; return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * This does two things: * * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; /* * sub-thread or delay_group_leader(), wake up the * PIDFD_THREAD waiters. */ if (!thread_group_empty(tsk)) do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else static inline void check_stack_usage(void) {} #endif static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; spin_lock_irq(&sighand->siglock); signal->quick_threads--; if ((signal->quick_threads == 0) && !(signal->flags & SIGNAL_GROUP_EXIT)) { signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = code; signal->group_stop_count = 0; } spin_unlock_irq(&sighand->siglock); } void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; WARN_ON(irqs_disabled()); synchronize_group_exit(tsk, code); WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(); if (group_dead) acct_process(); trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. * * because of cgroup mode, must be called before cgroup_exit() */ perf_event_exit_task(tsk); sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. * * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. */ struct task_struct *tsk = current; unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); if (unlikely(irqs_disabled())) { pr_info("note: %s[%d] exited with irqs disabled\n", current->comm, task_pid_nr(current)); local_irq_enable(); } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); do_task_dead(); } do_exit(signr); } SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. */ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) return false; return true; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (pid_child_should_wake(wo, p)) return default_wake_function(wait, mode, sync, key); return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. */ static int do_wait_pid(struct wait_opts *wo) { bool ptrace; struct task_struct *target; int retval; ptrace = false; target = pid_task(wo->wo_pid, PIDTYPE_TGID); if (target && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } ptrace = true; target = pid_task(wo->wo_pid, PIDTYPE_PID); if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } return 0; } long __do_wait(struct wait_opts *wo) { long retval; /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) return retval; retval = ptrace_do_wait(wo, tsk); if (retval) return retval; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) return -ERESTARTSYS; return retval; } static long do_wait(struct wait_opts *wo) { int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); do { set_current_state(TASK_INTERRUPTIBLE); retval = __do_wait(wo); if (retval != -ERESTARTSYS) break; if (signal_pending(current)) break; schedule(); } while (1); __set_current_state(TASK_RUNNING); remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid < 0) return -EINVAL; if (upid) pid = find_get_pid(upid); else pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo->wo_type = type; wo->wo_pid = pid; wo->wo_flags = options; wo->wo_info = infop; wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) wo->wo_flags |= WNOHANG; return 0; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; long ret; ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); if (ret) return ret; ret = do_wait(&wo); if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } int kernel_wait(pid_t pid, int *stat) { struct wait_opts wo = { .wo_type = PIDTYPE_PID, .wo_pid = find_get_pid(pid), .wo_flags = WEXITED, }; int ret; ret = do_wait(&wo); if (ret > 0 && wo.wo_stat) *stat = wo.wo_stat; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } #endif /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by * -falign-functions=N. * * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 */ __weak __function_aligned void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort); |
364 477 6 477 475 475 257 475 279 515 158 179 436 4 433 335 333 2 148 265 2 226 264 309 3 381 314 743 742 381 1096 278 913 1137 262 963 2862 381 2589 1099 259 865 1075 221 886 460 169 306 857 62 801 896 121 795 911 96 843 281 32 260 55 9 46 262 119 243 262 262 262 196 66 262 262 262 196 66 138 14 127 1 128 186 49 160 3 38 127 13 3 11 1 11 20 57 35 22 22 406 4 406 93 1 93 29 3 29 33 2 31 30 1 30 1 21 21 10 81 9 2 75 75 1 8 17 14 3 23 195 195 12 75 84 82 12 15 6 9 9 9 1 8 1 8 9 9 9 9 9 9 1151 1152 462 226 105 237 462 409 408 409 4 409 397 43 409 29 24 2 3 36 499 370 368 368 32 1 1 31 34 33 1 18 17 1 355 337 349 349 11 12 3 19 21 15 23 1 3 58 329 54 15 2 13 1 41 12 9 5 13 1 9 11 10 14 1 20 17 1 8 2 1 18 18 3 17 19 175 25 171 119 309 140 155 40 19 23 38 157 38 338 5 286 54 57 56 45 9 285 39 8 5 320 5 318 30 292 15 308 311 2 28 3 331 2 2 11 324 11 350 350 349 1 2 346 327 3 1 293 39 40 40 343 335 335 334 334 335 334 334 335 335 335 335 334 335 131 131 2 130 2 25 25 131 131 132 132 25 25 25 25 132 131 2 130 6 132 132 132 131 3 132 2 130 3 132 5 7 132 132 132 3 132 132 132 132 132 215 18 33 2 7 156 154 165 164 1 164 185 31 53 53 53 70 17 53 53 53 53 1394 1395 1261 80 69 261 2 1438 1437 247 256 255 256 253 3 252 4 1 255 252 3 249 5 2 251 251 248 3 3 248 248 24 95 151 246 228 165 50 42 247 243 4 243 228 304 1 61 5 2 258 2 2 29 32 4 30 156 10 6 7 156 158 205 2 45 159 158 76 156 159 76 156 162 20 20 20 20 20 20 20 20 20 182 182 182 182 182 170 2 9 204 115 89 201 113 279 34 224 89 241 72 273 40 313 239 33 41 258 55 69 244 66 247 313 273 1 82 1 189 1 2 5 11 253 247 25 1 1 1 69 210 4 2 1 3 2 298 1 313 2 314 2 314 1 4 1 312 192 297 278 13 6 278 2 16 1 279 15 5 275 13 6 1 282 280 1 1 280 264 1 263 1 261 261 260 260 258 2 258 164 92 1 257 256 257 1 256 14 14 12 298 45 1 20 24 44 253 322 3 1 1 1 1 207 109 108 4 274 146 30 38 6 54 280 280 87 193 2 2 2 274 17 304 322 322 3 316 313 313 309 10 299 1 17 6 10 6 11 215 32 155 3 88 197 5 185 14 51 1 3 3 3 3 214 22 69 124 215 163 52 182 8 3 187 16 28 169 7 197 2 195 125 70 150 45 181 181 69 113 32 32 2 131 133 133 322 320 3 322 180 181 140 325 20 7 1 12 3 3 6 3 2 1 1 3 12 2 2 9 9 3 2 339 339 41 306 340 340 340 176 125 215 262 119 226 340 340 339 338 339 2 5 5 50 1 49 18 23 18 23 26 27 23 37 7 1 30 7 35 2 30 6 2 2 28 5 29 5 11 2 5 5 1 12 14 12 2 2 8 24 15 10 16 1 1 6 18 2 11 10 20 17 17 17 42 5 37 17 20 111 2 109 111 111 111 111 93 93 2 93 126 126 1 102 34 93 93 2 2 3 1 1 1 3 20 61 5 56 72 66 58 61 49 23 30 10 20 22 13 154 155 2 22 154 113 3 9 112 110 35 3 4 3 2 2 150 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/super.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> #include <linux/random.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> #include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fsnotify.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "mballoc.h" #include "fsmap.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> static struct ext4_lazy_init *ext4_li_info; static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static void ext4_update_super(struct super_block *sb); static int ext4_commit_super(struct super_block *sb); static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum); static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering * * page fault path: * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> page lock -> * i_data_sem (rw) * * truncate: * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> * page lock * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: * transaction start -> page lock(s) -> i_data_sem (rw) */ static const struct fs_context_operations ext4_context_ops = { .parse_param = ext4_parse_param, .get_tree = ext4_get_tree, .reconfigure = ext4_reconfigure, .free = ext4_fc_free, }; #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we * recheck the buffer contents. */ clear_buffer_verified(bh); bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return; } __ext4_read_bh(bh, op_flags, end_io); } int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return 0; } __ext4_read_bh(bh, op_flags, end_io); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; return -EIO; } int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL); return 0; } return ext4_read_bh(bh, op_flags, NULL); } /* * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. */ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, sector_t block, blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); if (ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, ~__GFP_FS) | __GFP_MOVABLE; return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, ~__GFP_FS); return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); if (likely(bh)) { if (trylock_buffer(bh)) ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); brelse(bh); } } static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } __le32 ext4_superblock_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; csum = ext4_chksum(sbi, ~0, (char *)es, offset); return cpu_to_le32(csum); } static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (!ext4_has_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_inodes_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_table_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); } void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); } void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); } void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) { now = clamp_val(now, 0, (1ull << 40) - 1); *lo = cpu_to_le32(lower_32_bits(now)); *hi = upper_32_bits(now); } static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) { return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); } #define ext4_update_tstamp(es, tstamp) \ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ ktime_get_real_seconds()) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) #define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. * * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: * 1. More than an hour has passed since the last superblock update, and * 2. More than 16MB have been written since the last superblock update. * * @sb: The superblock */ static void ext4_maybe_update_superblock(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; journal_t *journal = sbi->s_journal; time64_t now; __u64 last_update; __u64 lifetime_write_kbytes; __u64 diff_size; if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || !journal || (journal->j_flags & JBD2_UNMOUNT)) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) return; lifetime_write_kbytes = sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); /* Get the number of kilobytes not written to disk to account * for statistics and compare with a multiple of 16 MB. This * is used to determine when the next superblock commit should * occur (i.e. not more often than once per 16MB if there was * less written in an hour). */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone * else. Once this happens, any attempt to call mark_buffer_dirty() * (for example, by ext4_commit_super), will cause a kernel OOPS. * This is a kludge to prevent these oops until we can put in a proper * hook in del_gendisk() to inform the VFS and file system layers. */ static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int error = is_journal_aborted(journal); struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { jce = list_entry(txn->t_private_list.next, struct ext4_journal_cb_entry, jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); jce->jce_func(sb, jce, error); spin_lock(&sbi->s_md_lock); } spin_unlock(&sbi->s_md_lock); } /* * This writepage callback for write_cache_pages() * takes care of a few cases after page cleaning. * * write_cache_pages() already checks for dirty pages * and calls clear_page_dirty_for_io(), which we want, * to write protect the pages. * * However, we may have to redirty a page (see below.) */ static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: * 1) If buffer is dirty, it means the page was dirty because it * contains a buffer that needs checkpointing. So the dirty bit * needs to be preserved so that checkpointing writes the buffer * properly. * 2) If buffer is not part of the committing transaction * (we may have just accidentally come across this buffer because * inode range tracking is not exact) or if the currently running * transaction already contains this buffer as well, dirty bit * needs to be preserved so that the buffer gets writeprotected * properly on running transaction's commit. */ jh = bh2jh(bh); if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); out: return AOP_WRITEPAGE_ACTIVATE; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; return write_cache_pages(mapping, &wbc, ext4_journalled_writepage_callback, jinode->i_transaction); } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { int ret; if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { int ret = 0; if (!ext4_should_journal_data(jinode->i_vfs_inode)) ret = jbd2_journal_finish_inode_data_buffers(jinode); return ret; } static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; } struct ext4_err_translation { int code; int errno; }; #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } static struct ext4_err_translation err_translation[] = { EXT4_ERR_TRANSLATE(EIO), EXT4_ERR_TRANSLATE(ENOMEM), EXT4_ERR_TRANSLATE(EFSBADCRC), EXT4_ERR_TRANSLATE(EFSCORRUPTED), EXT4_ERR_TRANSLATE(ENOSPC), EXT4_ERR_TRANSLATE(ENOKEY), EXT4_ERR_TRANSLATE(EROFS), EXT4_ERR_TRANSLATE(EFBIG), EXT4_ERR_TRANSLATE(EEXIST), EXT4_ERR_TRANSLATE(ERANGE), EXT4_ERR_TRANSLATE(EOVERFLOW), EXT4_ERR_TRANSLATE(EBUSY), EXT4_ERR_TRANSLATE(ENOTDIR), EXT4_ERR_TRANSLATE(ENOTEMPTY), EXT4_ERR_TRANSLATE(ESHUTDOWN), EXT4_ERR_TRANSLATE(EFAULT), }; static int ext4_errno_to_code(int errno) { int i; for (i = 0; i < ARRAY_SIZE(err_translation); i++) if (err_translation[i].errno == errno) return err_translation[i].code; return EXT4_ERR_UNKNOWN; } static void save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; spin_lock(&sbi->s_error_lock); sbi->s_add_error_count++; sbi->s_last_error_code = error; sbi->s_last_error_line = line; sbi->s_last_error_ino = ino; sbi->s_last_error_block = block; sbi->s_last_error_func = func; sbi->s_last_error_time = ktime_get_real_seconds(); if (!sbi->s_first_error_time) { sbi->s_first_error_code = error; sbi->s_first_error_line = line; sbi->s_first_error_ino = ino; sbi->s_first_error_block = block; sbi->s_first_error_func = func; sbi->s_first_error_time = sbi->s_last_error_time; } spin_unlock(&sbi->s_error_lock); } /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * * On ext2, we can store the error state of the filesystem in the * superblock. That is not possible on ext4, because we may have other * write ordering constraints on the superblock which prevent us from * writing it out straight away; and given that the journal is about to * be aborted, we can't rely on the current, or future, transactions to * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. * * If force_ro is set, we unconditionally force the filesystem into an * ABORT|READONLY state, unless the error response on the fs has been set to * panic in which case we take the easy way out and panic immediately. This is * used to deal with unrecoverable failures such as journal IO errors or ENOMEM * at a critical moment in log management. */ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); /* * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already * disabled. */ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } if (sb_rdonly(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible before * ->s_flags update */ smp_wmb(); sb->s_flags |= SB_RDONLY; } static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; /* * If the journal is still running, we have to write out superblock * through the journal to avoid collisions of other journalled sb * updates. * * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; if (jbd2_journal_get_write_access(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } if (sbi->s_add_error_count > 0) call_notify_err = true; ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } if (jbd2_journal_dirty_metadata(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } jbd2_journal_stop(handle); if (call_notify_err) ext4_notify_error_sysfs(sbi); return; } write_directly: /* * Write through journal failed. Write sb directly to get error info * out and hope for the best. */ ext4_commit_super(sbi->s_sb); ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, unsigned int line, bool force_ro, int error, __u64 block, const char *fmt, ...) { struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); } fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); ext4_handle_error(sb, force_ro, error, 0, block, function, line); } void __ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } void __ext4_error_file(struct file *file, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; struct va_format vaf; struct inode *inode = file_inode(file); char pathname[80], *path; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; switch (errno) { case -EFSCORRUPTED: errstr = "Corrupt filesystem"; break; case -EFSBADCRC: errstr = "Filesystem failed CRC"; break; case -EIO: errstr = "IO failure"; break; case -ENOMEM: errstr = "Out of memory"; break; case -EROFS: if (!sb || (EXT4_SB(sb)->s_journal && EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; break; default: /* If the caller passed in an extra buffer for unknown * errors, textualise them now. Else we just return * NULL. */ if (nbuf) { /* Check for truncated error codes... */ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) errstr = nbuf; } break; } return errstr; } /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ void __ext4_std_error(struct super_block *sb, const char *function, unsigned int line, int errno) { char nbuf[16]; const char *errstr; if (unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { errstr = ext4_decode_error(sb, errno, nbuf); printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } void __ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; if (sb) { atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); else printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } static int ext4_warning_ratelimit(struct super_block *sb) { atomic_inc(&EXT4_SB(sb)->s_warning_count); return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), "EXT4-fs warning"); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", sb->s_id, function, line, &vaf); va_end(args); } void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(inode->i_sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %lu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); printk(KERN_CONT "%pV\n", &vaf); va_end(args); } if (test_opt(sb, ERRORS_CONT)) { if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } ext4_unlock_group(sb, grp); ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the * filesystem will have already been marked read/only and the * journal has been aborted. We return 1 as a hint to callers * who might what to use the return value from * ext4_grp_locked_error() to distinguish between the * ERRORS_CONT and ERRORS_RO case, and perhaps return more * aggressively from the ext4 function in question, with a * more appropriate error code. */ ext4_lock_group(sb, grp); return; } void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t group, unsigned int flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; if (!grp || !gdp) return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); } if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret && gdp) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, count); } } } void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); /* leave es->s_feature_*compat flags alone */ /* es->s_uuid will be set by e2fsck if empty */ /* * The rest of the superblock fields should be zero, and if not it * means they are likely already in use, so leave them alone. We * can leave it up to e2fsck to clean up any inconsistencies there. */ } static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; } static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; ext4_msg(sb, KERN_ERR, "sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); } } #ifdef CONFIG_QUOTA static int ext4_quota_off(struct super_block *sb, int type); static inline void ext4_quotas_off(struct super_block *sb, int type) { BUG_ON(type > EXT4_MAXQUOTAS); /* Use our quota_off function to clear inode flags etc. */ for (type--; type >= 0; type--) ext4_quota_off(sb, type); } /* * This is a helper function which is used in the mount/remount * codepaths (which holds s_umount) to fetch the quota file name. */ static inline char *get_qf_name(struct super_block *sb, struct ext4_sb_info *sbi, int type) { return rcu_dereference_protected(sbi->s_qf_names[type], lockdep_is_held(&sb->s_umount)); } #else static inline void ext4_quotas_off(struct super_block *sb, int type) { } #endif static int ext4_percpu_param_init(struct ext4_sb_info *sbi) { ext4_fsblk_t block; int err; block = ext4_count_free_clusters(sbi->s_sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sbi->s_sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sbi->s_sb), GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, GFP_KERNEL); if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); if (err) ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); return err; } static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); } static void ext4_group_desc_free(struct ext4_sb_info *sbi) { struct buffer_head **group_desc; int i; rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); rcu_read_unlock(); } static void ext4_flex_groups_free(struct ext4_sb_info *sbi) { struct flex_groups **flex_groups; int i; rcu_read_lock(); flex_groups = rcu_dereference(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) kvfree(flex_groups[i]); kvfree(flex_groups); } rcu_read_unlock(); } static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int aborted = 0; int err; /* * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. * update_super_work will call start_this_handle may trigger * BUG_ON. */ ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } } ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!sb_rdonly(sb)) ext4_commit_super(sb); ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list * isn't empty. The on-disk one can be non-empty if we've * detected an error and taken the fs readonly, but the * in-memory list had better be clean by this point. */ if (!list_empty(&sbi->s_orphan)) dump_orphan_list(sb, sbi); ASSERT(list_empty(&sbi->s_orphan)); sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; ext4_stop_mmpd(sbi); brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); } static struct kmem_cache *ext4_inode_cachep; /* * Called inside transaction, so use GFP_NOFS */ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); return &ei->vfs_inode; } static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); if (!drop) drop = fscrypt_drop_inode(inode); trace_ext4_drop_inode(inode, drop); return drop; } static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { pr_warn("%s: inode %ld still in fc list", __func__, inode->i_ino); } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } static void ext4_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); dump_stack(); } if (EXT4_I(inode)->i_reserved_data_blocks) ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } static void ext4_shutdown(struct super_block *sb) { ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); } static void init_once(void *foo) { struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } void ext4_clear_inode(struct inode *inode) { ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct inode *inode; /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static int ext4_nfs_commit_metadata(struct inode *inode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL }; trace_ext4_nfs_commit_metadata(inode); return ext4_write_inode(inode, &wbc); } #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static struct dquot __rcu **ext4_get_dquots(struct inode *inode) { return EXT4_I(inode)->i_dquot; } static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #endif static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif }; static const struct export_operations ext4_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, .commit_metadata = ext4_nfs_commit_metadata, }; enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; static const struct constant_table ext4_param_errors[] = { {"continue", EXT4_MOUNT_ERRORS_CONT}, {"panic", EXT4_MOUNT_ERRORS_PANIC}, {"remount-ro", EXT4_MOUNT_ERRORS_RO}, {} }; static const struct constant_table ext4_param_data[] = { {"journal", EXT4_MOUNT_JOURNAL_DATA}, {"ordered", EXT4_MOUNT_ORDERED_DATA}, {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, {} }; static const struct constant_table ext4_param_data_err[] = { {"abort", Opt_data_err_abort}, {"ignore", Opt_data_err_ignore}, {} }; static const struct constant_table ext4_param_jqfmt[] = { {"vfsold", QFMT_VFS_OLD}, {"vfsv0", QFMT_VFS_V0}, {"vfsv1", QFMT_VFS_V1}, {} }; static const struct constant_table ext4_param_dax[] = { {"always", Opt_dax_always}, {"inode", Opt_dax_inode}, {"never", Opt_dax_never}, {} }; /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the * options and the way we show them in _ext4_show_options(). To * keep the changes to a minimum, let's keep the negative options * separate for now. */ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsddf", Opt_bsd_df), fsparam_flag ("minixdf", Opt_minix_df), fsparam_flag ("grpid", Opt_grpid), fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), fsparam_u32 ("resgid", Opt_resgid), fsparam_u32 ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), fsparam_flag ("debug", Opt_debug), fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), fsparam_flag ("acl", Opt_acl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), fsparam_flag ("nobh", Opt_removed), fsparam_u32 ("commit", Opt_commit), fsparam_u32 ("min_batch_time", Opt_min_batch_time), fsparam_u32 ("max_batch_time", Opt_max_batch_time), fsparam_u32 ("journal_dev", Opt_journal_dev), fsparam_bdev ("journal_path", Opt_journal_path), fsparam_flag ("journal_checksum", Opt_journal_checksum), fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), fsparam_flag ("journal_async_commit",Opt_journal_async_commit), fsparam_flag ("abort", Opt_abort), fsparam_enum ("data", Opt_data, ext4_param_data), fsparam_enum ("data_err", Opt_data_err, ext4_param_data_err), fsparam_string_empty ("usrjquota", Opt_usrjquota), fsparam_string_empty ("grpjquota", Opt_grpjquota), fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), fsparam_flag ("grpquota", Opt_grpquota), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_noquota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("prjquota", Opt_prjquota), fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), fsparam_flag ("delalloc", Opt_delalloc), fsparam_flag ("nodelalloc", Opt_nodelalloc), fsparam_flag ("warn_on_error", Opt_warn_on_error), fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), fsparam_u32 ("debug_want_extra_isize", Opt_debug_want_extra_isize), fsparam_flag ("mblk_io_submit", Opt_removed), fsparam_flag ("nomblk_io_submit", Opt_removed), fsparam_flag ("block_validity", Opt_block_validity), fsparam_flag ("noblock_validity", Opt_noblock_validity), fsparam_u32 ("inode_readahead_blks", Opt_inode_readahead_blks), fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), fsparam_flag ("dioread_nolock", Opt_dioread_nolock), fsparam_flag ("nodioread_nolock", Opt_dioread_lock), fsparam_flag ("dioread_lock", Opt_dioread_lock), fsparam_flag ("discard", Opt_discard), fsparam_flag ("nodiscard", Opt_nodiscard), fsparam_u32 ("init_itable", Opt_init_itable), fsparam_flag ("init_itable", Opt_init_itable), fsparam_flag ("noinit_itable", Opt_noinit_itable), #ifdef CONFIG_EXT4_DEBUG fsparam_flag ("fc_debug_force", Opt_fc_debug_force), fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), #endif fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb), fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_flag ("inlinecrypt", Opt_inlinecrypt), fsparam_flag ("nombcache", Opt_nombcache), fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ fsparam_flag ("prefetch_block_bitmaps", Opt_removed), fsparam_flag ("no_prefetch_block_bitmaps", Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ {} }; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 #ifdef CONFIG_QUOTA #define MOPT_Q 0 #define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_NO_EXT2 0x0020 #define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_SKIP 0x0080 #define MOPT_2 0x0100 static const struct mount_opts { int token; int mount_opt; int flags; } ext4_mount_opts[] = { {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, {Opt_dax_type, 0, MOPT_EXT4_ONLY}, {Opt_journal_dev, 0, MOPT_NO_EXT2}, {Opt_journal_path, 0, MOPT_NO_EXT2}, {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; #if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; unsigned int version; } ext4_sb_encoding_map[] = { {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; static const struct ext4_sb_encodings * ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) return &ext4_sb_encoding_map[i]; return NULL; } #endif #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) #define EXT4_SPEC_s_inode_readahead_blks (1 << 10) #define EXT4_SPEC_s_li_wait_mult (1 << 11) #define EXT4_SPEC_s_max_dir_size_kb (1 << 12) #define EXT4_SPEC_s_stripe (1 << 13) #define EXT4_SPEC_s_resuid (1 << 14) #define EXT4_SPEC_s_resgid (1 << 15) #define EXT4_SPEC_s_commit_interval (1 << 16) #define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) #define EXT4_SPEC_s_sb_block (1 << 18) #define EXT4_SPEC_mb_optimize_scan (1 << 19) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif unsigned short qname_spec; unsigned long vals_s_flags; /* Bits to set in s_flags */ unsigned long mask_s_flags; /* Bits changed in s_flags */ unsigned long journal_devnum; unsigned long s_commit_interval; unsigned long s_stripe; unsigned int s_inode_readahead_blks; unsigned int s_want_extra_isize; unsigned int s_li_wait_mult; unsigned int s_max_dir_size_kb; unsigned int journal_ioprio; unsigned int vals_s_mount_opt; unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; u32 s_min_batch_time; kuid_t s_resuid; kgid_t s_resgid; ext4_fsblk_t s_sb_block; }; static void ext4_fc_free(struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; int i; if (!ctx) return; for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } int ext4_init_fs_context(struct fs_context *fc) { struct ext4_fs_context *ctx; ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->fs_private = ctx; fc->ops = &ext4_context_ops; return 0; } #ifdef CONFIG_QUOTA /* * Note the name of the specified quota file. */ static int note_qf_name(struct fs_context *fc, int qtype, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; char *qname; if (param->size < 1) { ext4_msg(NULL, KERN_ERR, "Missing quota name"); return -EINVAL; } if (strchr(param->string, '/')) { ext4_msg(NULL, KERN_ERR, "quotafile must be on filesystem root"); return -EINVAL; } if (ctx->s_qf_names[qtype]) { if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) { ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(qtype)); return -EINVAL; } return 0; } qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { ext4_msg(NULL, KERN_ERR, "Not enough memory for storing quotafile name"); return -ENOMEM; } ctx->s_qf_names[qtype] = qname; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } /* * Clear the name of the specified quota file. */ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; if (ctx->s_qf_names[qtype]) kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } #endif static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, struct ext4_fs_context *ctx) { int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption option not supported"); return -EINVAL; } err = fscrypt_parse_test_dummy_encryption(param, &ctx->dummy_enc_policy); if (err == -EINVAL) { ext4_msg(NULL, KERN_WARNING, "Value of option \"%s\" is unrecognized", param->key); } else if (err == -EEXIST) { ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return err; } #define EXT4_SET_CTX(name) \ static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \ unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ } #define EXT4_TEST_CTX(name) \ static inline unsigned long \ ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ return (ctx->vals_s_##name & flag); \ } EXT4_SET_CTX(flags); /* set only */ EXT4_SET_CTX(mount_opt); EXT4_CLEAR_CTX(mount_opt); EXT4_TEST_CTX(mount_opt); EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; struct fs_parse_result result; const struct mount_opts *m; int is_remount; kuid_t uid; kgid_t gid; int token; token = fs_parse(fc, ext4_param_specs, param, &result); if (token < 0) return token; is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; for (m = ext4_mount_opts; m->token != Opt_err; m++) if (token == m->token) break; ctx->opt_flags |= m->flags; if (m->flags & MOPT_EXPLICIT) { if (m->mount_opt & EXT4_MOUNT_DELALLOC) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC); } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM); } else return -EINVAL; } if (m->flags & MOPT_NOSUPPORT) { ext4_msg(NULL, KERN_ERR, "%s option not supported", param->key); return 0; } switch (token) { #ifdef CONFIG_QUOTA case Opt_usrjquota: if (!*param->string) return unnote_qf_name(fc, USRQUOTA); else return note_qf_name(fc, USRQUOTA, param); case Opt_grpjquota: if (!*param->string) return unnote_qf_name(fc, GRPQUOTA); else return note_qf_name(fc, GRPQUOTA, param); #endif case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, "Ignoring %s option on remount", param->key); } else { ctx->s_sb_block = result.uint_32; ctx->spec |= EXT4_SPEC_s_sb_block; } return 0; case Opt_removed: ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); #else ext4_msg(NULL, KERN_ERR, "inline encryption not supported"); #endif return 0; case Opt_errors: ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK); ctx_set_mount_opt(ctx, result.uint_32); return 0; #ifdef CONFIG_QUOTA case Opt_jqfmt: ctx->s_jquota_fmt = result.uint_32; ctx->spec |= EXT4_SPEC_JQFMT; return 0; #endif case Opt_data: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); ctx_set_mount_opt(ctx, result.uint_32); ctx->spec |= EXT4_SPEC_DATAJ; return 0; case Opt_commit: if (result.uint_32 == 0) result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " "must be smaller than %d", result.uint_32, INT_MAX / HZ); return -EINVAL; } ctx->s_commit_interval = HZ * result.uint_32; ctx->spec |= EXT4_SPEC_s_commit_interval; return 0; case Opt_debug_want_extra_isize: if ((result.uint_32 & 1) || (result.uint_32 < 4)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", result.uint_32); return -EINVAL; } ctx->s_want_extra_isize = result.uint_32; ctx->spec |= EXT4_SPEC_s_want_extra_isize; return 0; case Opt_max_batch_time: ctx->s_max_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_batch_time; return 0; case Opt_min_batch_time: ctx->s_min_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_min_batch_time; return 0; case Opt_inode_readahead_blks: if (result.uint_32 && (result.uint_32 > (1 << 30) || !is_power_of_2(result.uint_32))) { ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); return -EINVAL; } ctx->s_inode_readahead_blks = result.uint_32; ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; return 0; case Opt_init_itable: ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (param->type == fs_value_is_string) ctx->s_li_wait_mult = result.uint_32; ctx->spec |= EXT4_SPEC_s_li_wait_mult; return 0; case Opt_max_dir_size_kb: ctx->s_max_dir_size_kb = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; return 0; #ifdef CONFIG_EXT4_DEBUG case Opt_fc_debug_max_replay: ctx->s_fc_debug_max_replay = result.uint_32; ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; return 0; #endif case Opt_stripe: ctx->s_stripe = result.uint_32; ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) { ext4_msg(NULL, KERN_ERR, "Invalid uid value %d", result.uint_32); return -EINVAL; } ctx->s_resuid = uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) { ext4_msg(NULL, KERN_ERR, "Invalid gid value %d", result.uint_32); return -EINVAL; } ctx->s_resgid = gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } ctx->journal_devnum = result.uint_32; ctx->spec |= EXT4_SPEC_JOURNAL_DEV; return 0; case Opt_journal_path: { struct inode *journal_inode; struct path path; int error; if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); return -EINVAL; } journal_inode = d_inode(path.dentry); ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); return 0; } case Opt_journal_ioprio: if (result.uint_32 > 7) { ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); return -EINVAL; } ctx->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX { int type = (token == Opt_dax) ? Opt_dax : result.uint_32; switch (type) { case Opt_dax: case Opt_dax_always: ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); break; case Opt_dax_never: ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); break; case Opt_dax_inode: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); /* Strictly for printing options */ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE); break; } return 0; } #else ext4_msg(NULL, KERN_INFO, "dax option not supported"); return -EINVAL; #endif case Opt_data_err: if (result.uint_32 == Opt_data_err_abort) ctx_set_mount_opt(ctx, m->mount_opt); else if (result.uint_32 == Opt_data_err_ignore) ctx_clear_mount_opt(ctx, m->mount_opt); return 0; case Opt_mb_optimize_scan: if (result.int_32 == 1) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else if (result.int_32 == 0) { ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else { ext4_msg(NULL, KERN_WARNING, "mb_optimize_scan should be set to 0 or 1."); return -EINVAL; } return 0; } /* * At this point we should only be getting options requiring MOPT_SET, * or MOPT_CLEAR. Anything else is a bug */ if (m->token == Opt_err) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } else { unsigned int set = 0; if ((param->type == fs_value_is_flag) || result.uint_32 > 0) set = 1; if (m->flags & MOPT_CLEAR) set = !set; else if (unlikely(!(m->flags & MOPT_SET))) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } if (m->flags & MOPT_2) { if (set != 0) ctx_set_mount_opt2(ctx, m->mount_opt); else ctx_clear_mount_opt2(ctx, m->mount_opt); } else { if (set != 0) ctx_set_mount_opt(ctx, m->mount_opt); else ctx_clear_mount_opt(ctx, m->mount_opt); } } return 0; } static int parse_options(struct fs_context *fc, char *options) { struct fs_parameter param; int ret; char *key; if (!options) return 0; while ((key = strsep(&options, ",")) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); param.type = fs_value_is_flag; param.string = NULL; if (value) { if (value == key) continue; *value++ = 0; v_len = strlen(value); param.string = kmemdup_nul(value, v_len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } param.key = key; param.size = v_len; ret = ext4_parse_param(fc, ¶m); if (param.string) kfree(param.string); if (ret < 0) return ret; } } ret = ext4_validate_options(fc); if (ret < 0) return ret; return 0; } static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *s_mount_opts = NULL; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; if (!sbi->s_es->s_mount_opts[0]) return 0; s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), GFP_KERNEL); if (!s_mount_opts) return ret; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) goto out_free; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) goto out_free; fc->fs_private = s_ctx; fc->s_fs_info = sbi; ret = parse_options(fc, s_mount_opts); if (ret < 0) goto parse_failed; ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) { parse_failed: ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); ret = 0; goto out_free; } if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV) m_ctx->journal_devnum = s_ctx->journal_devnum; if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; ext4_apply_options(fc, sb); ret = 0; out_free: if (fc) { ext4_fc_free(fc); kfree(fc); } kfree(s_mount_opts); return ret; } static void ext4_apply_quota_options(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA bool quota_feature = ext4_has_feature_quota(sb); struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; int i; if (quota_feature) return; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; qname = ctx->s_qf_names[i]; /* May be NULL */ if (qname) set_opt(sb, QUOTA); ctx->s_qf_names[i] = NULL; qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) kfree_rcu_mightsleep(qname); } } if (ctx->spec & EXT4_SPEC_JQFMT) sbi->s_jquota_fmt = ctx->s_jquota_fmt; #endif } /* * Check quota settings consistency. */ static int ext4_check_quota_consistency(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); bool quota_feature = ext4_has_feature_quota(sb); bool quota_loaded = sb_any_quota_loaded(sb); bool usr_qf_name, grp_qf_name, usrquota, grpquota; int quota_flags, i; /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && !ext4_has_feature_project(sb)) { ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. " "Cannot enable project quota enforcement."); return -EINVAL; } quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; if (quota_loaded && ctx->mask_s_mount_opt & quota_flags && !ctx_test_mount_opt(ctx, quota_flags)) goto err_quota_change; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; if (quota_loaded && !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) goto err_jquota_change; if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && strcmp(get_qf_name(sb, sbi, i), ctx->s_qf_names[i]) != 0) goto err_jquota_specified; } if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Journaled quota options ignored when " "QUOTA feature is enabled"); return 0; } } if (ctx->spec & EXT4_SPEC_JQFMT) { if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) goto err_jquota_change; if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Quota format mount options " "ignored when QUOTA feature is enabled"); return 0; } } /* Make sure we don't mix old and new quota format */ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || ctx->s_qf_names[USRQUOTA]); grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || ctx->s_qf_names[GRPQUOTA]); usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || test_opt(sb, USRQUOTA)); grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || test_opt(sb, GRPQUOTA)); if (usr_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); usrquota = false; } if (grp_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); grpquota = false; } if (usr_qf_name || grp_qf_name) { if (usrquota || grpquota) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { ext4_msg(NULL, KERN_ERR, "journaled quota format " "not specified"); return -EINVAL; } } return 0; err_quota_change: ext4_msg(NULL, KERN_ERR, "Cannot change quota options when quota turned on"); return -EINVAL; err_jquota_change: ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " "options when quota turned on"); return -EINVAL; err_jquota_specified: ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(i)); return -EINVAL; #else return 0; #endif } static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption requires encrypt feature"); return -EINVAL; } /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } /* Also make sure s_mount_opts didn't contain a conflicting value. */ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, struct super_block *sb) { if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || /* if already set, it was already verified to be the same */ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) return; EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext2"); return -EINVAL; } if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext3"); return -EINVAL; } if (ctx->s_want_extra_isize > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", ctx->s_want_extra_isize); return -EINVAL; } err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { ext4_msg(NULL, KERN_WARNING, "Remounting file system with no journal " "so ignoring journalled data option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) != test_opt(sb, DATA_FLAGS)) { ext4_msg(NULL, KERN_ERR, "Cannot change data mode " "on remount"); return -EINVAL; } } if (is_remount) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { fail_dax_change_remount: ext4_msg(NULL, KERN_ERR, "can't change " "dax mount option while remounting"); return -EINVAL; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) && (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) { goto fail_dax_change_remount; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) && ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } } return ext4_check_quota_consistency(fc, sb); } static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); APPLY(s_max_batch_time); APPLY(s_min_batch_time); APPLY(s_want_extra_isize); APPLY(s_inode_readahead_blks); APPLY(s_max_dir_size_kb); APPLY(s_li_wait_mult); APPLY(s_resgid); APPLY(s_resuid); #ifdef CONFIG_EXT4_DEBUG APPLY(s_fc_debug_max_replay); #endif ext4_apply_quota_options(fc, sb); ext4_apply_test_dummy_encryption(ctx, sb); } static int ext4_validate_options(struct fs_context *fc) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; char *usr_qf_name, *grp_qf_name; usr_qf_name = ctx->s_qf_names[USRQUOTA]; grp_qf_name = ctx->s_qf_names[GRPQUOTA]; if (usr_qf_name || grp_qf_name) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } } #endif return 1; } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; switch (sbi->s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; case QFMT_VFS_V0: fmtname = "vfsv0"; break; case QFMT_VFS_V1: fmtname = "vfsv1"; break; } seq_printf(seq, ",jqfmt=%s", fmtname); } rcu_read_lock(); usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); if (usr_qf_name) seq_show_option(seq, "usrjquota", usr_qf_name); if (grp_qf_name) seq_show_option(seq, "grpjquota", grp_qf_name); rcu_read_unlock(); #endif } static const char *token2str(int token) { const struct fs_parameter_spec *spec; for (spec = ext4_param_specs; spec->name != NULL; spec++) if (spec->opt == token && !spec->type) break; return spec->name; } /* * Show an option if * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, int nodefs) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int def_errors; const struct mount_opts *m; char sep = nodefs ? '\n' : ','; #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) if (sbi->s_sb_block != 1) SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; int opt_2 = m->flags & MOPT_2; unsigned int mount_opt, def_mount_opt; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; if (opt_2) { mount_opt = sbi->s_mount_opt2; def_mount_opt = sbi->s_def_mount_opt2; } else { mount_opt = sbi->s_mount_opt; def_mount_opt = sbi->s_def_mount_opt; } /* skip if same as the default */ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) continue; /* select Opt_noFoo vs Opt_Foo */ if ((want_set && (mount_opt & m->mount_opt) != m->mount_opt) || (!want_set && (mount_opt & m->mount_opt))) continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) SEQ_OPTS_PUTS("errors=continue"); if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) SEQ_OPTS_PUTS("errors=panic"); if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) SEQ_OPTS_PUTS("data=ordered"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) SEQ_OPTS_PUTS("data=writeback"); } if (nodefs || sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); fscrypt_show_test_dummy_encryption(seq, sep, sb); if (sb->s_flags & SB_INLINECRYPT) SEQ_OPTS_PUTS("inlinecrypt"); if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); else SEQ_OPTS_PUTS("dax=always"); } else if (test_opt2(sb, DAX_NEVER)) { SEQ_OPTS_PUTS("dax=never"); } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && !test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=0"); } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=1"); } ext4_show_quota_options(seq, sb); return 0; } static int ext4_show_options(struct seq_file *seq, struct dentry *root) { return _ext4_show_options(seq, root->d_sb, 0); } int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_puts(seq, "\n"); return rc; } static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; goto done; } if (read_only) goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, "warning: maximal mount count reached, " "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (ext4_get_tstamp(es, s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) ext4_msg(sb, KERN_WARNING, "warning: checktime reached, " "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } err = ext4_commit_super(sb); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; size = ext4_flex_group(sbi, ngroup - 1) + 1; if (size <= sbi->s_flex_groups_allocated) return 0; new_groups = kvzalloc(roundup_pow_of_two(size * sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); return -ENOMEM; } for (i = sbi->s_flex_groups_allocated; i < size; i++) { new_groups[i] = kvzalloc(roundup_pow_of_two( sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { for (j = sbi->s_flex_groups_allocated; j < i; j++) kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); return -ENOMEM; } } rcu_read_lock(); old_groups = rcu_dereference(sbi->s_flex_groups); if (old_groups) memcpy(new_groups, old_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups *))); rcu_read_unlock(); rcu_assign_pointer(sbi->s_flex_groups, new_groups); sbi->s_flex_groups_allocated = size; if (old_groups) ext4_kvfree_array_rcu(old_groups); return 0; } static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct flex_groups *fg; ext4_group_t flex_group; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) goto failed; for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); atomic64_add(ext4_free_group_clusters(sb, gdp), &fg->free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); } return 1; failed: return 0; } static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; } /* old crc16 code */ if (!ext4_has_feature_gdt_csum(sb)) return 0; crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, sbi->s_desc_size - offset); out: return cpu_to_le16(crc); } int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; } void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (!ext4_has_group_desc_csum(sb)) return; gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (i == sbi->s_groups_count - 1 || flexbg_flag) last_block = ext4_blocks_count(sbi->s_es) - 1; else last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); if ((grp == sbi->s_groups_count) && !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap >= sb_block + 1 && block_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap >= sb_block + 1 && inode_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_table >= sb_block + 1 && inode_table <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } } ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } if (NULL != first_not_zeroed) *first_not_zeroed = grp; return 1; } /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk * extent format containers, within a sector_t, and within i_blocks * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * * However there is other limiting factor. We do store extents in the form * of starting block and length, hence the resulting length of the extent * covering maximum file size must fit into on-disk format containers as * well. Given that length is always by 1 unit bigger than max unit (because * we count 0 as well) we have to lower the s_maxbytes by one fs block. * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (blkbits - 9); upper_limit <<= blkbits; } /* * 32-bit extent-start container, ee_block. We lower the maxbytes * by one fs block, so ee_len can cover the extent of maximum file * size */ res = (1LL << 32) - 1; res <<= blkbits; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) res = upper_limit; return res; } /* * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; unsigned int ppb = 1 << (bits - 2); /* * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field * represents total file blocks in 2^32 512-byte sectors == * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (bits - 9); } else { /* * We use 48 bit ext4_inode i_blocks * With EXT4_HUGE_FILE_FL set the i_blocks * represent total number of blocks in * file system block size */ upper_limit = (1LL << 48) - 1; } /* Compute how many blocks we can address by block tree */ res += ppb; res += ppb * ppb; res += ((loff_t)ppb) * ppb * ppb; /* Compute how many metadata blocks are needed */ meta_blocks = 1; meta_blocks += 1 + ppb; meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; /* How many metadata blocks are needed for addressing upper_limit? */ upper_limit -= EXT4_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; upper_limit -= ppb; /* double indirect blocks */ if (upper_limit < ppb * ppb) { meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb); res -= meta_blocks; goto check_lfs; } meta_blocks += 1 + ppb; upper_limit -= ppb * ppb; /* tripple indirect blocks for the rest */ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) + DIV_ROUND_UP_ULL(upper_limit, ppb*ppb); res -= meta_blocks; check_lfs: res <<= bits; if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; return res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; int has_super = 0; first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) return logical_sb_block + nr + 1; bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; /* * If we have a meta_bg fs with 1k blocks, group 0's GDT is at * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled * on modern mke2fs or blksize > 1k on older mke2fs) then we must * compensate. */ if (sb->s_blocksize == 1024 && nr == 0 && le32_to_cpu(sbi->s_es->s_first_data_block) == 0) has_super++; return (has_super + ext4_group_first_block_no(sb, bg)); } /** * ext4_get_stripe_size: Get the stripe size. * @sbi: In memory super block info * * If we have specified it via mount option, then * use the mount option value. If the value specified at mount time is * greater than the blocks per group use the super block value. * If the super block value is greater than blocks per group return 0. * Allocator needs it be less than blocks per group. * */ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) { unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; /* * If the stripe width is 1, this makes no sense and * we set it to 0 to turn off stripe handling code. */ if (ret <= 1) ret = 0; return ret; } /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); return 0; } #if !IS_ENABLED(CONFIG_UNICODE) if (ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } #endif if (readonly) return 1; if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= SB_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); return 0; } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ return 1; } /* * This function is called once a day if we have errors logged * on the file system */ static void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. */ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; struct super_block *sb = elr->lr_super; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; } } return ret; } for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; break; } if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } if (group >= ngroups) ret = 1; if (!ret) { start_time = ktime_get_real_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } return ret; } /* * Remove lr_request from the list_request and free the * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { if (!elr) return; list_del(&elr->lr_request); EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } static void ext4_unregister_li_request(struct super_block *sb) { mutex_lock(&ext4_li_mtx); if (!ext4_li_info) { mutex_unlock(&ext4_li_mtx); return; } mutex_lock(&ext4_li_info->li_list_mtx); ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. * When such a fs is found, run the lazy initialization request * (ext4_rn_li_request) and keep track of the time spend in this * function. Based on that time we compute next schedule time of * the request. When walking through the list is complete, compute * next waking time and put itself into sleep. */ static int ext4_lazyinit_thread(void *arg) { struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; BUG_ON(NULL == eli); set_freezable(); cont_thread: while (true) { next_wakeup = MAX_JIFFY_OFFSET; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); goto exit_thread; } list_for_each_safe(pos, n, &eli->li_request_list) { int err = 0; int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); if (time_before(jiffies, elr->lr_next_sched)) { if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { if (sb_start_write_trylock(elr->lr_super)) { progress = 1; /* * We hold sb->s_umount, sb can not * be removed from the list, it is * now safe to drop li_list_mtx */ mutex_unlock(&eli->li_list_mtx); err = ext4_run_li_request(elr); sb_end_write(elr->lr_super); mutex_lock(&eli->li_list_mtx); n = pos->next; } up_read((&elr->lr_super->s_umount)); } /* error, remove the lazy_init job */ if (err) { ext4_remove_li_request(elr); continue; } if (!progress) { elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; if ((time_after_eq(cur, next_wakeup)) || (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } schedule_timeout_interruptible(next_wakeup - cur); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; } } exit_thread: /* * It looks like the request list is empty, but we need * to check it under the li_list_mtx lock, to prevent any * additions into it, and of course we should lock ext4_li_mtx * to atomically free the list and ext4_li_info, because at * this point another ext4 filesystem could be registering * new one. */ mutex_lock(&ext4_li_mtx); mutex_lock(&eli->li_list_mtx); if (!list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); mutex_unlock(&ext4_li_mtx); goto cont_thread; } mutex_unlock(&eli->li_list_mtx); kfree(ext4_li_info); ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); return 0; } static void ext4_clear_request_list(void) { struct list_head *pos, *n; struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); ext4_remove_li_request(elr); } mutex_unlock(&ext4_li_info->li_list_mtx); } static int ext4_run_lazyinit_thread(void) { ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; return 0; } /* * Check whether it make sense to run itable init. thread or not. * If there is at least one uninitialized inode table, return * corresponding group number, else the loop goes through all * groups and return total number of groups. */ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) { ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; if (!ext4_has_group_desc_csum(sb)) return ngroups; for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) continue; if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } return group; } static int ext4_li_info_new(void) { struct ext4_lazy_init *eli = NULL; eli = kzalloc(sizeof(*eli), GFP_KERNEL); if (!eli) return -ENOMEM; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; return 0; } static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); if (!elr) return NULL; elr->lr_super = sb; elr->lr_first_not_zeroed = start; if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; } else { elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* * Randomize first schedule time of the request to * spread the inode table initialization requests * better. */ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr = NULL; ext4_group_t ngroups = sbi->s_groups_count; int ret = 0; mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; goto out; } if (sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) { ret = -ENOMEM; goto out; } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); if (ret) goto out; } mutex_lock(&ext4_li_info->li_list_mtx); list_add(&elr->lr_request, &ext4_li_info->li_request_list); mutex_unlock(&ext4_li_info->li_list_mtx); sbi->s_li_request = elr; /* * set elr to NULL here since it has been inserted to * the request_list and the removal and free of it is * handled by ext4_clear_request_list from now on. */ elr = NULL; if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { ret = ext4_run_lazyinit_thread(); if (ret) goto out; } out: mutex_unlock(&ext4_li_mtx); if (ret) kfree(elr); return ret; } /* * We do not need to lock anything since this is called on * module unload. */ static void ext4_destroy_lazyinit_thread(void) { /* * If thread exited earlier * there's nothing to be done. */ if (!ext4_li_info || !ext4_lazyinit_task) return; kthread_stop(ext4_lazyinit_task); } static int set_journal_csum_feature_set(struct super_block *sb) { int ret = 1; int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; incompat = 0; } jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | incompat); } else if (test_opt(sb, JOURNAL_CHECKSUM)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; } /* * Note: calculating the overhead so we can be compatible with * historical BSD practice is quite difficult in the face of * clusters/bigalloc. This is because multiple metadata blocks from * different block group can end up in the same allocation cluster. * Calculating the exact overhead in the face of clustered allocation * requires either O(all block bitmaps) in memory or O(number of block * groups**2) in time. We will still calculate the superblock for * older file systems --- and if we come across with a bigalloc file * system with zero in s_overhead_clusters the estimate will be close to * correct especially for very large cluster sizes --- but for newer * file systems, it's better to calculate this figure once at mkfs * time, and store it in the superblock. If the superblock value is * present (even for non-bigalloc file systems), we will use it. */ static int count_overhead(struct super_block *sb, ext4_group_t grp, char *buf) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp; ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) return (has_super + ext4_bg_num_gdb(sb, grp) + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + (grp * EXT4_BLOCKS_PER_GROUP(sb)); last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); b = ext4_block_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_table(sb, gdp); if (b >= first_block && b + sbi->s_itb_per_group <= last_block) for (j = 0; j < sbi->s_itb_per_group; j++, b++) { int c = EXT4_B2C(sbi, b - first_block); ext4_set_bit(c, buf); count++; } if (i != grp) continue; s = 0; if (ext4_bg_has_super(sb, grp)) { ext4_set_bit(s++, buf); count++; } j = ext4_bg_num_gdb(sb, grp); if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { ext4_error(sb, "Invalid number of block group " "descriptor blocks: %d", j); j = EXT4_BLOCKS_PER_GROUP(sb) - s; } count += j; for (; j > 0; j--) ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; return EXT4_CLUSTERS_PER_GROUP(sb) - ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); } /* * Compute the overhead and stash it in sbi->s_overhead */ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct inode *j_inode; unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; /* * Compute the overhead (FS structures). This is constant * for a given filesystem unless the number of block groups * changes so we cache the previous value until it does. */ /* * All of the blocks before first_data_block are overhead */ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { int blks; blks = count_overhead(sb, i, buf); overhead += blks; if (blks) memset(buf, 0, PAGE_SIZE); cond_resched(); } /* * Add the internal journal blocks whether the journal has been * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); } else { ext4_msg(sb, KERN_ERR, "can't get journal size"); } } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); return 0; } static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. * The space estimates are exact, there are no unwritten extents, * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ if (!ext4_has_feature_extents(sb)) return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting * unwritten extents in delalloc path. In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ resv_clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static const char *ext4_quota_mode(struct super_block *sb) { #ifdef CONFIG_QUOTA if (!ext4_quota_capable(sb)) return "none"; if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) return "journalled"; else return "writeback"; #else return "disabled"; #endif } static void ext4_setup_csum_trigger(struct super_block *sb, enum ext4_journal_trigger_type type, void (*trigger)( struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size)) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_journal_triggers[type].sb = sb; sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; } static void ext4_free_sbi(struct ext4_sb_info *sbi) { if (!sbi) return; kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) { struct ext4_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return NULL; sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) goto err_out; sb->s_fs_info = sbi; sbi->s_sb = sb; return sbi; err_out: fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } static void ext4_set_def_opts(struct super_block *sb, struct ext4_super_block *es) { unsigned long def_mount_opts; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ if (ext4_has_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); /* block_validity enabled by default; disable with noblock_validity */ set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); if (sb->s_blocksize <= PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int clustersize; /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); if (ext4_has_feature_bigalloc(sb)) { if (clustersize < sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "fragment/cluster size (%d) != " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#blocks per group too big: %lu", sbi->s_blocks_per_group); return -EINVAL; } sbi->s_cluster_bits = 0; } sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", sbi->s_clusters_per_group); return -EINVAL; } if (sbi->s_blocks_per_group != (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and clusters per group (%lu) inconsistent", sbi->s_blocks_per_group, sbi->s_clusters_per_group); return -EINVAL; } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); return 0; } static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; sbi->s_fc_replay_state.fc_regions_used = 0; sbi->s_fc_replay_state.fc_regions_valid = 0; sbi->s_fc_replay_state.fc_modified_inodes = NULL; sbi->s_fc_replay_state.fc_modified_inodes_size = 0; sbi->s_fc_replay_state.fc_modified_inodes_used = 0; } static int ext4_inode_info_init(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); return -EINVAL; } /* * i_atime_extra is the last extra field available for * [acm]times in struct ext4_inode. Checking for that * field should suffice to ensure we have extra space * for all three. */ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { sb->s_time_gran = 1; sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; } else { sb->s_time_gran = NSEC_PER_SEC; sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; } sb->s_time_min = EXT4_TIMESTAMP_MIN; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; if (ext4_has_feature_extra_isize(sb)) { unsigned v, max = (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE); v = le16_to_cpu(es->s_want_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; v = le16_to_cpu(es->s_min_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } return 0; } #if IS_ENABLED(CONFIG_UNICODE) static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); if (!ext4_has_feature_casefold(sb) || sb->s_encoding) return 0; encoding_info = ext4_sb_read_encoding(es); if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); return -EINVAL; } encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. flags: 0x%x.", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); return -EINVAL; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; return 0; } #else static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { return 0; } #endif static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Warn if metadata_csum and gdt_csum are both set. */ if (ext4_has_feature_metadata_csum(sb) && ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ if (!ext4_verify_csum_type(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "unknown checksum algorithm."); return -EINVAL; } ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { int ret = PTR_ERR(sbi->s_chksum_driver); ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); sbi->s_chksum_driver = NULL; return ret; } /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. Run e2fsck?"); return -EFSBADCRC; } /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); return -EINVAL; } /* * ea_inode feature uses l_i_version field which is not * available in HURD_COMPAT mode. */ if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); return -EINVAL; } } if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); return -EINVAL; } } if (IS_EXT3_SB(sb)) { if (ext3_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext4 filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); return -EINVAL; } } /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; if (sbi->s_daxdev) { if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); return -EINVAL; } if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); return -EINVAL; } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); return -EINVAL; } return 0; } static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; int err; if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); return -EINVAL; } /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); return err; } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); return -EINVAL; } /* * It makes no sense for the first data block to be beyond the end * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); return -EINVAL; } if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && (sbi->s_cluster_ratio == 1)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block is 0 with a 1k block and cluster size"); return -EINVAL; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); return -EINVAL; } sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", le32_to_cpu(es->s_inodes_count), ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); return -EINVAL; } return 0; } static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", le32_to_cpu(es->s_first_meta_bg), db_count); return -EINVAL; } } rcu_assign_pointer(sbi->s_group_desc, kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL)); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); return -ENOMEM; } bgl_lock_init(sbi->s_blockgroup_lock); /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); bh = ext4_sb_bread_unmovable(sb, block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; rcu_read_unlock(); } sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); return -EFSCORRUPTED; } return 0; } static int ext4_load_and_init_journal(struct super_block *sb, struct ext4_super_block *es, struct ext4_fs_context *ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) return err; if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto out; } if (!set_journal_csum_feature_set(sb)) { ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " "feature set"); goto out; } if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { ext4_msg(sb, KERN_ERR, "Failed to set fast commit journal feature"); goto out; } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { case 0: /* No mode set, assume a default based on the journal * capabilities: ORDERED_DATA if the journal can * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; } else { set_opt(sb, JOURNAL_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; } break; case EXT4_MOUNT_ORDERED_DATA: case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); goto out; } break; default: break; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); goto out; } set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = ext4_journal_finish_inode_data_buffers; return 0; out: /* flush s_sb_upd_work before destroying the journal. */ flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; } static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " "data=journal disables delayed allocation, " "dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); return -EINVAL; } if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ext4_has_feature_encrypt(sb)) { ext4_msg(sb, KERN_WARNING, "encrypted files will use data=ordered " "instead of data journaling mode"); } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { sb->s_iflags |= SB_I_CGROUPWB; } return 0; } static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es; ext4_fsblk_t logical_sb_block; unsigned long offset = 0; struct buffer_head *bh; int ret = -EINVAL; int blocksize; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); return -EINVAL; } /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. */ if (blocksize != EXT4_MIN_BLOCK_SIZE) { logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); return PTR_ERR(bh); } /* * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto out; } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log block size: %u", le32_to_cpu(es->s_log_block_size)); goto out; } if (le32_to_cpu(es->s_log_cluster_size) > (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log cluster size: %u", le32_to_cpu(es->s_log_cluster_size)); goto out; } blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); /* * If the default block size is not the same as the real block size, * we need to reload it. */ if (sb->s_blocksize == blocksize) { *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; } /* * bh must be released before kill_bdev(), otherwise * it won't be freed and its page also. kill_bdev() * is called by sb_set_blocksize(). */ brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); bh = NULL; goto out; } logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); ret = PTR_ERR(bh); bh = NULL; goto out; } es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; out: brelse(bh); return ret; } static void ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif } } } static int ext4_block_group_meta_init(struct super_block *sb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int has_huge_files; has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { ext4_msg(sb, KERN_ERR, "unsupported descriptor size %lu", sbi->s_desc_size); return -EINVAL; } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); return -EINVAL; } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); return -EINVAL; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); return 0; } static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t logical_sb_block; struct inode *root; int needs_recovery; int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; es = sbi->s_es; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); err = ext4_init_metadata_csum(sb, es); if (err) goto failed_mount; ext4_set_def_opts(sb, es); sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; /* * set default s_li_wait_mult for lazyinit, for the case there is * no mount option specified. */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; err = ext4_inode_info_init(sb, es); if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); if (err < 0) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) goto failed_mount; ext4_apply_options(fc, sb); err = ext4_encoding_init(sb, es); if (err) goto failed_mount; err = ext4_check_journal_data_mode(sb); if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; err = ext4_block_group_meta_init(sb, silent); if (err) goto failed_mount; ext4_hash_info_init(sb); err = ext4_handle_clustersize(sb); if (err) goto failed_mount; err = ext4_check_geometry(sb, es); if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) goto failed_mount3; err = ext4_es_register_shrinker(sbi); if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); /* * It's hard to get stripe aligned blocks if stripe is not aligned with * cluster, just disable stripe and alert user to simpfy code and avoid * stripe aligned allocation which will rarely successes. */ if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 && sbi->s_stripe % sbi->s_cluster_ratio != 0) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", sbi->s_stripe, sbi->s_cluster_ratio); sbi->s_stripe = 0; } sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode */ sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif #ifdef CONFIG_FS_VERITY sb->s_vop = &ext4_verityops; #endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); ext4_fast_commit_init(sb); sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto failed_mount3a; } err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit, fs mounted w/o journal"); goto failed_mount3a; } if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_checksum, fs mounted w/o journal"); goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { ext4_msg(sb, KERN_ERR, "can't mount with " "commit=%lu, fs mounted w/o journal", |