39 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Trace point definitions for the RDMA Connect Manager. * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM rdma_cma #if !defined(_TRACE_RDMA_CMA_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RDMA_CMA_H #include <linux/tracepoint.h> #include <trace/misc/rdma.h> DECLARE_EVENT_CLASS(cma_fsm_class, TP_PROTO( const struct rdma_id_private *id_priv ), TP_ARGS(id_priv), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos ) ); #define DEFINE_CMA_FSM_EVENT(name) \ DEFINE_EVENT(cma_fsm_class, cm_##name, \ TP_PROTO( \ const struct rdma_id_private *id_priv \ ), \ TP_ARGS(id_priv)) DEFINE_CMA_FSM_EVENT(send_rtu); DEFINE_CMA_FSM_EVENT(send_rej); DEFINE_CMA_FSM_EVENT(send_mra); DEFINE_CMA_FSM_EVENT(send_sidr_req); DEFINE_CMA_FSM_EVENT(send_sidr_rep); DEFINE_CMA_FSM_EVENT(disconnect); DEFINE_CMA_FSM_EVENT(sent_drep); DEFINE_CMA_FSM_EVENT(sent_dreq); DEFINE_CMA_FSM_EVENT(id_destroy); TRACE_EVENT(cm_id_attach, TP_PROTO( const struct rdma_id_private *id_priv, const struct ib_device *device ), TP_ARGS(id_priv, device), TP_STRUCT__entry( __field(u32, cm_id) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) __string(devname, device->name) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); __assign_str(devname); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __get_str(devname) ) ); DECLARE_EVENT_CLASS(cma_qp_class, TP_PROTO( const struct rdma_id_private *id_priv ), TP_ARGS(id_priv), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(u32, qp_num) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->qp_num = id_priv->qp_num; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u qp_num=%u", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, __entry->qp_num ) ); #define DEFINE_CMA_QP_EVENT(name) \ DEFINE_EVENT(cma_qp_class, cm_##name, \ TP_PROTO( \ const struct rdma_id_private *id_priv \ ), \ TP_ARGS(id_priv)) DEFINE_CMA_QP_EVENT(send_req); DEFINE_CMA_QP_EVENT(send_rep); DEFINE_CMA_QP_EVENT(qp_destroy); /* * enum ib_wp_type, from include/rdma/ib_verbs.h */ #define IB_QP_TYPE_LIST \ ib_qp_type(SMI) \ ib_qp_type(GSI) \ ib_qp_type(RC) \ ib_qp_type(UC) \ ib_qp_type(UD) \ ib_qp_type(RAW_IPV6) \ ib_qp_type(RAW_ETHERTYPE) \ ib_qp_type(RAW_PACKET) \ ib_qp_type(XRC_INI) \ ib_qp_type_end(XRC_TGT) #undef ib_qp_type #undef ib_qp_type_end #define ib_qp_type(x) TRACE_DEFINE_ENUM(IB_QPT_##x); #define ib_qp_type_end(x) TRACE_DEFINE_ENUM(IB_QPT_##x); IB_QP_TYPE_LIST #undef ib_qp_type #undef ib_qp_type_end #define ib_qp_type(x) { IB_QPT_##x, #x }, #define ib_qp_type_end(x) { IB_QPT_##x, #x } #define rdma_show_qp_type(x) \ __print_symbolic(x, IB_QP_TYPE_LIST) TRACE_EVENT(cm_qp_create, TP_PROTO( const struct rdma_id_private *id_priv, const struct ib_pd *pd, const struct ib_qp_init_attr *qp_init_attr, int rc ), TP_ARGS(id_priv, pd, qp_init_attr, rc), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, pd_id) __field(u32, tos) __field(u32, qp_num) __field(u32, send_wr) __field(u32, recv_wr) __field(int, rc) __field(unsigned long, qp_type) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->pd_id = pd->res.id; __entry->tos = id_priv->tos; __entry->send_wr = qp_init_attr->cap.max_send_wr; __entry->recv_wr = qp_init_attr->cap.max_recv_wr; __entry->rc = rc; if (!rc) { __entry->qp_num = id_priv->qp_num; __entry->qp_type = id_priv->id.qp_type; } else { __entry->qp_num = 0; __entry->qp_type = 0; } memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u pd.id=%u qp_type=%s" " send_wr=%u recv_wr=%u qp_num=%u rc=%d", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, __entry->pd_id, rdma_show_qp_type(__entry->qp_type), __entry->send_wr, __entry->recv_wr, __entry->qp_num, __entry->rc ) ); TRACE_EVENT(cm_req_handler, TP_PROTO( const struct rdma_id_private *id_priv, int event ), TP_ARGS(id_priv, event), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu)", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_ib_cm_event(__entry->event), __entry->event ) ); TRACE_EVENT(cm_event_handler, TP_PROTO( const struct rdma_id_private *id_priv, const struct rdma_cm_event *event ), TP_ARGS(id_priv, event), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __field(int, status) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event->event; __entry->status = event->status; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu/%d)", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_cm_event(__entry->event), __entry->event, __entry->status ) ); TRACE_EVENT(cm_event_done, TP_PROTO( const struct rdma_id_private *id_priv, const struct rdma_cm_event *event, int result ), TP_ARGS(id_priv, event, result), TP_STRUCT__entry( __field(u32, cm_id) __field(u32, tos) __field(unsigned long, event) __field(int, result) __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; __entry->tos = id_priv->tos; __entry->event = event->event; __entry->result = result; memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, sizeof(struct sockaddr_in6)); memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, sizeof(struct sockaddr_in6)); ), TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s consumer returns %d", __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos, rdma_show_cm_event(__entry->event), __entry->result ) ); DECLARE_EVENT_CLASS(cma_client_class, TP_PROTO( const struct ib_device *device ), TP_ARGS(device), TP_STRUCT__entry( __string(name, device->name) ), TP_fast_assign( __assign_str(name); ), TP_printk("device name=%s", __get_str(name) ) ); #define DEFINE_CMA_CLIENT_EVENT(name) \ DEFINE_EVENT(cma_client_class, cm_##name, \ TP_PROTO( \ const struct ib_device *device \ ), \ TP_ARGS(device)) DEFINE_CMA_CLIENT_EVENT(add_one); DEFINE_CMA_CLIENT_EVENT(remove_one); #endif /* _TRACE_RDMA_CMA_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE cma_trace #include <trace/define_trace.h> |
5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 | /* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h> #include "inftrees.h" #include "inflate.h" #include "inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. */ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; fallthrough; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; fallthrough; case TYPE: if (flush == Z_BLOCK) goto inf_leave; fallthrough; case TYPEDO: INFLATE_TYPEDO_HOOK(strm, flush); if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; fallthrough; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; fallthrough; case LENLENS: while (state->have < state->ncode) { NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; fallthrough; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; fallthrough; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; fallthrough; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; fallthrough; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra = (unsigned)(this.op) & 15; state->mode = DISTEXT; fallthrough; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; fallthrough; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; fallthrough; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (INFLATE_NEED_UPDATEWINDOW(strm) && (state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. */ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; } |
1 8 10 10 6 10 9 9 16 16 15 9 16 13 13 11 13 13 1 1 1 12 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 | // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #include <net/inet_dscp.h> #include <net/ip.h> #include "ipvlan.h" static u32 ipvlan_jhash_secret __read_mostly; void ipvlan_init_secret(void) { net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret)); } void ipvlan_count_rx(const struct ipvl_dev *ipvlan, unsigned int len, bool success, bool mcast) { if (likely(success)) { struct ipvl_pcpu_stats *pcptr; pcptr = this_cpu_ptr(ipvlan->pcpu_stats); u64_stats_update_begin(&pcptr->syncp); u64_stats_inc(&pcptr->rx_pkts); u64_stats_add(&pcptr->rx_bytes, len); if (mcast) u64_stats_inc(&pcptr->rx_mcast); u64_stats_update_end(&pcptr->syncp); } else { this_cpu_inc(ipvlan->pcpu_stats->rx_errs); } } EXPORT_SYMBOL_GPL(ipvlan_count_rx); #if IS_ENABLED(CONFIG_IPV6) static u8 ipvlan_get_v6_hash(const void *iaddr) { const struct in6_addr *ip6_addr = iaddr; return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret) & IPVLAN_HASH_MASK; } #else static u8 ipvlan_get_v6_hash(const void *iaddr) { return 0; } #endif static u8 ipvlan_get_v4_hash(const void *iaddr) { const struct in_addr *ip4_addr = iaddr; return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) & IPVLAN_HASH_MASK; } static bool addr_equal(bool is_v6, struct ipvl_addr *addr, const void *iaddr) { if (!is_v6 && addr->atype == IPVL_IPV4) { struct in_addr *i4addr = (struct in_addr *)iaddr; return addr->ip4addr.s_addr == i4addr->s_addr; #if IS_ENABLED(CONFIG_IPV6) } else if (is_v6 && addr->atype == IPVL_IPV6) { struct in6_addr *i6addr = (struct in6_addr *)iaddr; return ipv6_addr_equal(&addr->ip6addr, i6addr); #endif } return false; } static struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port, const void *iaddr, bool is_v6) { struct ipvl_addr *addr; u8 hash; hash = is_v6 ? ipvlan_get_v6_hash(iaddr) : ipvlan_get_v4_hash(iaddr); hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) if (addr_equal(is_v6, addr, iaddr)) return addr; return NULL; } void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr) { struct ipvl_port *port = ipvlan->port; u8 hash; hash = (addr->atype == IPVL_IPV6) ? ipvlan_get_v6_hash(&addr->ip6addr) : ipvlan_get_v4_hash(&addr->ip4addr); if (hlist_unhashed(&addr->hlnode)) hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]); } void ipvlan_ht_addr_del(struct ipvl_addr *addr) { hlist_del_init_rcu(&addr->hlnode); } struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6) { struct ipvl_addr *addr, *ret = NULL; rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) { if (addr_equal(is_v6, addr, iaddr)) { ret = addr; break; } } rcu_read_unlock(); return ret; } bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6) { struct ipvl_dev *ipvlan; bool ret = false; rcu_read_lock(); list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) { if (ipvlan_find_addr(ipvlan, iaddr, is_v6)) { ret = true; break; } } rcu_read_unlock(); return ret; } void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type) { void *lyr3h = NULL; switch (skb->protocol) { case htons(ETH_P_ARP): { struct arphdr *arph; if (unlikely(!pskb_may_pull(skb, arp_hdr_len(port->dev)))) return NULL; arph = arp_hdr(skb); *type = IPVL_ARP; lyr3h = arph; break; } case htons(ETH_P_IP): { u32 pktlen; struct iphdr *ip4h; if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h)))) return NULL; ip4h = ip_hdr(skb); pktlen = skb_ip_totlen(skb); if (ip4h->ihl < 5 || ip4h->version != 4) return NULL; if (skb->len < pktlen || pktlen < (ip4h->ihl * 4)) return NULL; *type = IPVL_IPV4; lyr3h = ip4h; break; } #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): { struct ipv6hdr *ip6h; if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h)))) return NULL; ip6h = ipv6_hdr(skb); if (ip6h->version != 6) return NULL; *type = IPVL_IPV6; lyr3h = ip6h; /* Only Neighbour Solicitation pkts need different treatment */ if (ipv6_addr_any(&ip6h->saddr) && ip6h->nexthdr == NEXTHDR_ICMP) { struct icmp6hdr *icmph; if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph)))) return NULL; ip6h = ipv6_hdr(skb); icmph = (struct icmp6hdr *)(ip6h + 1); if (icmph->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { /* Need to access the ipv6 address in body */ if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph) + sizeof(struct in6_addr)))) return NULL; ip6h = ipv6_hdr(skb); icmph = (struct icmp6hdr *)(ip6h + 1); } *type = IPVL_ICMPV6; lyr3h = icmph; } break; } #endif default: return NULL; } return lyr3h; } unsigned int ipvlan_mac_hash(const unsigned char *addr) { u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2), ipvlan_jhash_secret); return hash & IPVLAN_MAC_FILTER_MASK; } void ipvlan_process_multicast(struct work_struct *work) { struct ipvl_port *port = container_of(work, struct ipvl_port, wq); struct ethhdr *ethh; struct ipvl_dev *ipvlan; struct sk_buff *skb, *nskb; struct sk_buff_head list; unsigned int len; unsigned int mac_hash; int ret; u8 pkt_type; bool tx_pkt; __skb_queue_head_init(&list); spin_lock_bh(&port->backlog.lock); skb_queue_splice_tail_init(&port->backlog, &list); spin_unlock_bh(&port->backlog.lock); while ((skb = __skb_dequeue(&list)) != NULL) { struct net_device *dev = skb->dev; bool consumed = false; ethh = eth_hdr(skb); tx_pkt = IPVL_SKB_CB(skb)->tx_pkt; mac_hash = ipvlan_mac_hash(ethh->h_dest); if (ether_addr_equal(ethh->h_dest, port->dev->broadcast)) pkt_type = PACKET_BROADCAST; else pkt_type = PACKET_MULTICAST; rcu_read_lock(); list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) { if (tx_pkt && (ipvlan->dev == skb->dev)) continue; if (!test_bit(mac_hash, ipvlan->mac_filters)) continue; if (!(ipvlan->dev->flags & IFF_UP)) continue; ret = NET_RX_DROP; len = skb->len + ETH_HLEN; nskb = skb_clone(skb, GFP_ATOMIC); local_bh_disable(); if (nskb) { consumed = true; nskb->pkt_type = pkt_type; nskb->dev = ipvlan->dev; if (tx_pkt) ret = dev_forward_skb(ipvlan->dev, nskb); else ret = netif_rx(nskb); } ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true); local_bh_enable(); } rcu_read_unlock(); if (tx_pkt) { /* If the packet originated here, send it out. */ skb->dev = port->dev; skb->pkt_type = pkt_type; dev_queue_xmit(skb); } else { if (consumed) consume_skb(skb); else kfree_skb(skb); } dev_put(dev); cond_resched(); } } static void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev) { bool xnet = true; if (dev) xnet = !net_eq(dev_net(skb->dev), dev_net(dev)); skb_scrub_packet(skb, xnet); if (dev) skb->dev = dev; } static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, bool local) { struct ipvl_dev *ipvlan = addr->master; struct net_device *dev = ipvlan->dev; unsigned int len; rx_handler_result_t ret = RX_HANDLER_CONSUMED; bool success = false; struct sk_buff *skb = *pskb; len = skb->len + ETH_HLEN; /* Only packets exchanged between two local slaves need to have * device-up check as well as skb-share check. */ if (local) { if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); goto out; } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; *pskb = skb; } if (local) { skb->pkt_type = PACKET_HOST; if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS) success = true; } else { skb->dev = dev; ret = RX_HANDLER_ANOTHER; success = true; } out: ipvlan_count_rx(ipvlan, len, success, false); return ret; } struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, int addr_type, bool use_dest) { struct ipvl_addr *addr = NULL; switch (addr_type) { #if IS_ENABLED(CONFIG_IPV6) case IPVL_IPV6: { struct ipv6hdr *ip6h; struct in6_addr *i6addr; ip6h = (struct ipv6hdr *)lyr3h; i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr; addr = ipvlan_ht_addr_lookup(port, i6addr, true); break; } case IPVL_ICMPV6: { struct nd_msg *ndmh; struct in6_addr *i6addr; /* Make sure that the NeighborSolicitation ICMPv6 packets * are handled to avoid DAD issue. */ ndmh = (struct nd_msg *)lyr3h; if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { i6addr = &ndmh->target; addr = ipvlan_ht_addr_lookup(port, i6addr, true); } break; } #endif case IPVL_IPV4: { struct iphdr *ip4h; __be32 *i4addr; ip4h = (struct iphdr *)lyr3h; i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr; addr = ipvlan_ht_addr_lookup(port, i4addr, false); break; } case IPVL_ARP: { struct arphdr *arph; unsigned char *arp_ptr; __be32 dip; arph = (struct arphdr *)lyr3h; arp_ptr = (unsigned char *)(arph + 1); if (use_dest) arp_ptr += (2 * port->dev->addr_len) + 4; else arp_ptr += port->dev->addr_len; memcpy(&dip, arp_ptr, 4); addr = ipvlan_ht_addr_lookup(port, &dip, false); break; } } return addr; } static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { const struct iphdr *ip4h = ip_hdr(skb); struct net_device *dev = skb->dev; struct net *net = dev_net(dev); struct rtable *rt; int err, ret = NET_XMIT_DROP; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, .daddr = ip4h->daddr, .saddr = ip4h->saddr, }; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); goto err; } skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); err = ip_local_out(net, NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out; err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static noinline_for_stack int ipvlan_route_v6_outbound(struct net_device *dev, struct sk_buff *skb) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowi6_flags = FLOWI_FLAG_ANYSRC, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; struct dst_entry *dst; int err; dst = ip6_route_output(dev_net(dev), NULL, &fl6); err = dst->error; if (err) { dst_release(dst); return err; } skb_dst_set(skb, dst); return 0; } static int ipvlan_process_v6_outbound(struct sk_buff *skb) { struct net_device *dev = skb->dev; int err, ret = NET_XMIT_DROP; err = ipvlan_route_v6_outbound(dev, skb); if (unlikely(err)) { DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return err; } memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); err = ip6_local_out(dev_net(dev), NULL, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; return ret; } #else static int ipvlan_process_v6_outbound(struct sk_buff *skb) { return NET_XMIT_DROP; } #endif static int ipvlan_process_outbound(struct sk_buff *skb) { int ret = NET_XMIT_DROP; /* The ipvlan is a pseudo-L2 device, so the packets that we receive * will have L2; which need to discarded and processed further * in the net-ns of the main-device. */ if (skb_mac_header_was_set(skb)) { /* In this mode we dont care about * multicast and broadcast traffic */ struct ethhdr *ethh = eth_hdr(skb); if (is_multicast_ether_addr(ethh->h_dest)) { pr_debug_ratelimited( "Dropped {multi|broad}cast of type=[%x]\n", ntohs(skb->protocol)); kfree_skb(skb); goto out; } skb_pull(skb, sizeof(*ethh)); skb->mac_header = (typeof(skb->mac_header))~0U; skb_reset_network_header(skb); } if (skb->protocol == htons(ETH_P_IPV6)) ret = ipvlan_process_v6_outbound(skb); else if (skb->protocol == htons(ETH_P_IP)) ret = ipvlan_process_v4_outbound(skb); else { pr_warn_ratelimited("Dropped outbound packet type=%x\n", ntohs(skb->protocol)); kfree_skb(skb); } out: return ret; } static void ipvlan_multicast_enqueue(struct ipvl_port *port, struct sk_buff *skb, bool tx_pkt) { if (skb->protocol == htons(ETH_P_PAUSE)) { kfree_skb(skb); return; } /* Record that the deferred packet is from TX or RX path. By * looking at mac-addresses on packet will lead to erronus decisions. * (This would be true for a loopback-mode on master device or a * hair-pin mode of the switch.) */ IPVL_SKB_CB(skb)->tx_pkt = tx_pkt; spin_lock(&port->backlog.lock); if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) { dev_hold(skb->dev); __skb_queue_tail(&port->backlog, skb); spin_unlock(&port->backlog.lock); schedule_work(&port->wq); } else { spin_unlock(&port->backlog.lock); dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb(skb); } } static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); void *lyr3h; struct ipvl_addr *addr; int addr_type; lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (!lyr3h) goto out; if (!ipvlan_is_vepa(ipvlan->port)) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); if (addr) { if (ipvlan_is_private(ipvlan->port)) { consume_skb(skb); return NET_XMIT_DROP; } ipvlan_rcv_frame(addr, &skb, true); return NET_XMIT_SUCCESS; } } out: ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); return ipvlan_process_outbound(skb); } static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) { const struct ipvl_dev *ipvlan = netdev_priv(dev); struct ethhdr *eth = skb_eth_hdr(skb); struct ipvl_addr *addr; void *lyr3h; int addr_type; if (!ipvlan_is_vepa(ipvlan->port) && ether_addr_equal(eth->h_dest, eth->h_source)) { lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); if (lyr3h) { addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); if (addr) { if (ipvlan_is_private(ipvlan->port)) { consume_skb(skb); return NET_XMIT_DROP; } ipvlan_rcv_frame(addr, &skb, true); return NET_XMIT_SUCCESS; } } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_XMIT_DROP; /* Packet definitely does not belong to any of the * virtual devices, but the dest is local. So forward * the skb for the main-dev. At the RX side we just return * RX_PASS for it to be processed further on the stack. */ dev_forward_skb(ipvlan->phy_dev, skb); return NET_XMIT_SUCCESS; } else if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); ipvlan_skb_crossing_ns(skb, NULL); ipvlan_multicast_enqueue(ipvlan->port, skb, true); return NET_XMIT_SUCCESS; } skb->dev = ipvlan->phy_dev; return dev_queue_xmit(skb); } int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); struct ipvl_port *port = ipvlan_port_get_rcu_bh(ipvlan->phy_dev); if (!port) goto out; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) goto out; switch(port->mode) { case IPVLAN_MODE_L2: return ipvlan_xmit_mode_l2(skb, dev); case IPVLAN_MODE_L3: #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: #endif return ipvlan_xmit_mode_l3(skb, dev); } /* Should not reach here */ WARN_ONCE(true, "%s called for mode = [%x]\n", __func__, port->mode); out: kfree_skb(skb); return NET_XMIT_DROP; } static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port) { struct ethhdr *eth = eth_hdr(skb); struct ipvl_addr *addr; void *lyr3h; int addr_type; if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) { lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) return true; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false); if (addr) return false; } return true; } static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, struct ipvl_port *port) { void *lyr3h; int addr_type; struct ipvl_addr *addr; struct sk_buff *skb = *pskb; rx_handler_result_t ret = RX_HANDLER_PASS; lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); if (addr) ret = ipvlan_rcv_frame(addr, pskb, false); out: return ret; } static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, struct ipvl_port *port) { struct sk_buff *skb = *pskb; struct ethhdr *eth = eth_hdr(skb); rx_handler_result_t ret = RX_HANDLER_PASS; if (is_multicast_ether_addr(eth->h_dest)) { if (ipvlan_external_frame(skb, port)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); /* External frames are queued for device local * distribution, but a copy is given to master * straight away to avoid sending duplicates later * when work-queue processes this frame. This is * achieved by returning RX_HANDLER_PASS. */ if (nskb) { ipvlan_skb_crossing_ns(nskb, NULL); ipvlan_multicast_enqueue(port, nskb, false); } } } else { /* Perform like l3 mode for non-multicast packet */ ret = ipvlan_handle_mode_l3(pskb, port); } return ret; } rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev); if (!port) return RX_HANDLER_PASS; switch (port->mode) { case IPVLAN_MODE_L2: return ipvlan_handle_mode_l2(pskb, port); case IPVLAN_MODE_L3: return ipvlan_handle_mode_l3(pskb, port); #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: return RX_HANDLER_PASS; #endif } /* Should not reach here */ WARN_ONCE(true, "%s called for mode = [%x]\n", __func__, port->mode); kfree_skb(skb); return RX_HANDLER_CONSUMED; } |
26 27 27 1 1 2 2 2 3 3 1 1 874 822 55 30 2 3 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 | // SPDX-License-Identifier: GPL-2.0-or-later /* * "LAPB via ethernet" driver release 001 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * This is a "pseudo" network driver to allow LAPB over Ethernet. * * This driver can use any ethernet destination address, and can be * limited to accept frames from one dedicated ethernet card only. * * History * LAPBETH 001 Jonathan Naylor Cloned from bpqether.c * 2000-10-29 Henner Eisen lapb_data_indication() return status. * 2000-11-14 Henner Eisen dev_hold/put, NETDEV_GOING_DOWN support */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/stat.h> #include <linux/module.h> #include <linux/lapb.h> #include <linux/init.h> #include <net/x25device.h> static const u8 bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; /* If this number is made larger, check that the temporary string buffer * in lapbeth_new_device is large enough to store the probe device name. */ #define MAXLAPBDEV 100 struct lapbethdev { struct list_head node; struct net_device *ethdev; /* link to ethernet device */ struct net_device *axdev; /* lapbeth device (lapb#) */ bool up; spinlock_t up_lock; /* Protects "up" */ struct sk_buff_head rx_queue; struct napi_struct napi; }; static LIST_HEAD(lapbeth_devices); static void lapbeth_connected(struct net_device *dev, int reason); static void lapbeth_disconnected(struct net_device *dev, int reason); /* ------------------------------------------------------------------------ */ /* Get the LAPB device for the ethernet device */ static struct lapbethdev *lapbeth_get_x25_dev(struct net_device *dev) { struct lapbethdev *lapbeth; list_for_each_entry_rcu(lapbeth, &lapbeth_devices, node, lockdep_rtnl_is_held()) { if (lapbeth->ethdev == dev) return lapbeth; } return NULL; } static __inline__ int dev_is_ethdev(struct net_device *dev) { return dev->type == ARPHRD_ETHER && strncmp(dev->name, "dummy", 5); } /* ------------------------------------------------------------------------ */ static int lapbeth_napi_poll(struct napi_struct *napi, int budget) { struct lapbethdev *lapbeth = container_of(napi, struct lapbethdev, napi); struct sk_buff *skb; int processed = 0; for (; processed < budget; ++processed) { skb = skb_dequeue(&lapbeth->rx_queue); if (!skb) break; netif_receive_skb_core(skb); } if (processed < budget) napi_complete(napi); return processed; } /* Receive a LAPB frame via an ethernet interface. */ static int lapbeth_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { int len, err; struct lapbethdev *lapbeth; if (dev_net(dev) != &init_net) goto drop; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (!pskb_may_pull(skb, 2)) goto drop; rcu_read_lock(); lapbeth = lapbeth_get_x25_dev(dev); if (!lapbeth) goto drop_unlock_rcu; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop_unlock; len = skb->data[0] + skb->data[1] * 256; dev->stats.rx_packets++; dev->stats.rx_bytes += len; skb_pull(skb, 2); /* Remove the length bytes */ skb_trim(skb, len); /* Set the length of the data */ err = lapb_data_received(lapbeth->axdev, skb); if (err != LAPB_OK) { printk(KERN_DEBUG "lapbether: lapb_data_received err - %d\n", err); goto drop_unlock; } out: spin_unlock_bh(&lapbeth->up_lock); rcu_read_unlock(); return 0; drop_unlock: kfree_skb(skb); goto out; drop_unlock_rcu: rcu_read_unlock(); drop: kfree_skb(skb); return 0; } static int lapbeth_data_indication(struct net_device *dev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; if (skb_cow(skb, 1)) { kfree_skb(skb); return NET_RX_DROP; } skb_push(skb, 1); ptr = skb->data; *ptr = X25_IFACE_DATA; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); return NET_RX_SUCCESS; } /* Send a LAPB frame via an ethernet interface */ static netdev_tx_t lapbeth_xmit(struct sk_buff *skb, struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); if (!lapbeth->up) goto drop; /* There should be a pseudo header of 1 byte added by upper layers. * Check to make sure it is there before reading it. */ if (skb->len < 1) goto drop; switch (skb->data[0]) { case X25_IFACE_DATA: break; case X25_IFACE_CONNECT: err = lapb_connect_request(dev); if (err == LAPB_CONNECTED) lapbeth_connected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_connect_request error: %d\n", err); goto drop; case X25_IFACE_DISCONNECT: err = lapb_disconnect_request(dev); if (err == LAPB_NOTCONNECTED) lapbeth_disconnected(dev, LAPB_OK); else if (err != LAPB_OK) pr_err("lapb_disconnect_request err: %d\n", err); fallthrough; default: goto drop; } skb_pull(skb, 1); err = lapb_data_request(dev, skb); if (err != LAPB_OK) { pr_err("lapb_data_request error - %d\n", err); goto drop; } out: spin_unlock_bh(&lapbeth->up_lock); return NETDEV_TX_OK; drop: kfree_skb(skb); goto out; } static void lapbeth_data_transmit(struct net_device *ndev, struct sk_buff *skb) { struct lapbethdev *lapbeth = netdev_priv(ndev); unsigned char *ptr; struct net_device *dev; int size = skb->len; ptr = skb_push(skb, 2); *ptr++ = size % 256; *ptr++ = size / 256; ndev->stats.tx_packets++; ndev->stats.tx_bytes += size; skb->dev = dev = lapbeth->ethdev; skb->protocol = htons(ETH_P_DEC); skb_reset_network_header(skb); dev_hard_header(skb, dev, ETH_P_DEC, bcast_addr, NULL, 0); dev_queue_xmit(skb); } static void lapbeth_connected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_CONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } static void lapbeth_disconnected(struct net_device *dev, int reason) { struct lapbethdev *lapbeth = netdev_priv(dev); unsigned char *ptr; struct sk_buff *skb = __dev_alloc_skb(1, GFP_ATOMIC | __GFP_NOMEMALLOC); if (!skb) return; ptr = skb_put(skb, 1); *ptr = X25_IFACE_DISCONNECT; skb->protocol = x25_type_trans(skb, dev); skb_queue_tail(&lapbeth->rx_queue, skb); napi_schedule(&lapbeth->napi); } /* Set AX.25 callsign */ static int lapbeth_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; dev_addr_set(dev, sa->sa_data); return 0; } static const struct lapb_register_struct lapbeth_callbacks = { .connect_confirmation = lapbeth_connected, .connect_indication = lapbeth_connected, .disconnect_confirmation = lapbeth_disconnected, .disconnect_indication = lapbeth_disconnected, .data_indication = lapbeth_data_indication, .data_transmit = lapbeth_data_transmit, }; /* open/close a device */ static int lapbeth_open(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; napi_enable(&lapbeth->napi); err = lapb_register(dev, &lapbeth_callbacks); if (err != LAPB_OK) { napi_disable(&lapbeth->napi); pr_err("lapb_register error: %d\n", err); return -ENODEV; } spin_lock_bh(&lapbeth->up_lock); lapbeth->up = true; spin_unlock_bh(&lapbeth->up_lock); return 0; } static int lapbeth_close(struct net_device *dev) { struct lapbethdev *lapbeth = netdev_priv(dev); int err; spin_lock_bh(&lapbeth->up_lock); lapbeth->up = false; spin_unlock_bh(&lapbeth->up_lock); err = lapb_unregister(dev); if (err != LAPB_OK) pr_err("lapb_unregister error: %d\n", err); napi_disable(&lapbeth->napi); return 0; } /* ------------------------------------------------------------------------ */ static const struct net_device_ops lapbeth_netdev_ops = { .ndo_open = lapbeth_open, .ndo_stop = lapbeth_close, .ndo_start_xmit = lapbeth_xmit, .ndo_set_mac_address = lapbeth_set_mac_address, }; static void lapbeth_setup(struct net_device *dev) { dev->netdev_ops = &lapbeth_netdev_ops; dev->needs_free_netdev = true; dev->type = ARPHRD_X25; dev->hard_header_len = 0; dev->mtu = 1000; dev->addr_len = 0; } /* Setup a new device. */ static int lapbeth_new_device(struct net_device *dev) { struct net_device *ndev; struct lapbethdev *lapbeth; int rc = -ENOMEM; ASSERT_RTNL(); if (dev->type != ARPHRD_ETHER) return -EINVAL; ndev = alloc_netdev(sizeof(*lapbeth), "lapb%d", NET_NAME_UNKNOWN, lapbeth_setup); if (!ndev) goto out; /* When transmitting data: * first this driver removes a pseudo header of 1 byte, * then the lapb module prepends an LAPB header of at most 3 bytes, * then this driver prepends a length field of 2 bytes, * then the underlying Ethernet device prepends its own header. */ ndev->needed_headroom = -1 + 3 + 2 + dev->hard_header_len + dev->needed_headroom; ndev->needed_tailroom = dev->needed_tailroom; lapbeth = netdev_priv(ndev); lapbeth->axdev = ndev; dev_hold(dev); lapbeth->ethdev = dev; lapbeth->up = false; spin_lock_init(&lapbeth->up_lock); skb_queue_head_init(&lapbeth->rx_queue); netif_napi_add_weight(ndev, &lapbeth->napi, lapbeth_napi_poll, 16); rc = -EIO; if (register_netdevice(ndev)) goto fail; list_add_rcu(&lapbeth->node, &lapbeth_devices); rc = 0; out: return rc; fail: dev_put(dev); free_netdev(ndev); goto out; } /* Free a lapb network device. */ static void lapbeth_free_device(struct lapbethdev *lapbeth) { dev_put(lapbeth->ethdev); list_del_rcu(&lapbeth->node); unregister_netdevice(lapbeth->axdev); } /* Handle device status changes. * * Called from notifier with RTNL held. */ static int lapbeth_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct lapbethdev *lapbeth; struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev_net(dev) != &init_net) return NOTIFY_DONE; if (!dev_is_ethdev(dev) && !lapbeth_get_x25_dev(dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UP: /* New ethernet device -> new LAPB interface */ if (!lapbeth_get_x25_dev(dev)) lapbeth_new_device(dev); break; case NETDEV_GOING_DOWN: /* ethernet device closes -> close LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) dev_close(lapbeth->axdev); break; case NETDEV_UNREGISTER: /* ethernet device disappears -> remove LAPB interface */ lapbeth = lapbeth_get_x25_dev(dev); if (lapbeth) lapbeth_free_device(lapbeth); break; } return NOTIFY_DONE; } /* ------------------------------------------------------------------------ */ static struct packet_type lapbeth_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_DEC), .func = lapbeth_rcv, }; static struct notifier_block lapbeth_dev_notifier = { .notifier_call = lapbeth_device_event, }; static const char banner[] __initconst = KERN_INFO "LAPB Ethernet driver version 0.02\n"; static int __init lapbeth_init_driver(void) { dev_add_pack(&lapbeth_packet_type); register_netdevice_notifier(&lapbeth_dev_notifier); printk(banner); return 0; } module_init(lapbeth_init_driver); static void __exit lapbeth_cleanup_driver(void) { struct lapbethdev *lapbeth; struct list_head *entry, *tmp; dev_remove_pack(&lapbeth_packet_type); unregister_netdevice_notifier(&lapbeth_dev_notifier); rtnl_lock(); list_for_each_safe(entry, tmp, &lapbeth_devices) { lapbeth = list_entry(entry, struct lapbethdev, node); dev_put(lapbeth->ethdev); unregister_netdevice(lapbeth->axdev); } rtnl_unlock(); } module_exit(lapbeth_cleanup_driver); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The unofficial LAPB over Ethernet driver"); MODULE_LICENSE("GPL"); |
6 6 2 2 6 6 6 6 2 2 8 1 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles * or rs-channels. It also implements echoing, cooked mode etc. * * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. * * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the * tty_struct and tty_queue structures. Previously there was an array * of 256 tty_struct's which was statically allocated, and the * tty_queue structures were allocated at boot time. Both are now * dynamically allocated only when the tty is open. * * Also restructured routines so that there is more of a separation * between the high-level tty routines (tty_io.c and tty_ioctl.c) and * the low-level tty routines (serial.c, pty.c, console.c). This * makes for cleaner and more compact code. -TYT, 9/17/92 * * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines * which can be dynamically activated and de-activated by the line * discipline handling modules (like SLIP). * * NOTE: pay no attention to the line discipline code (yet); its * interface is still subject to change in this version... * -- TYT, 1/31/92 * * Added functionality to the OPOST tty handling. No delays, but all * other bits should be there. * -- Nick Holloway <alfie@dcs.warwick.ac.uk>, 27th May 1993. * * Rewrote canonical mode and added more termios flags. * -- julian@uhunix.uhcc.hawaii.edu (J. Cowley), 13Jan94 * * Reorganized FASYNC support so mouse code can share it. * -- ctm@ardi.com, 9Sep95 * * New TIOCLINUX variants added. * -- mj@k332.feld.cvut.cz, 19-Nov-95 * * Restrict vt switching via ioctl() * -- grif@cs.ucr.edu, 5-Dec-95 * * Move console and virtual terminal code to more appropriate files, * implement CONFIG_VT and generalize console device interface. * -- Marko Kohtala <Marko.Kohtala@hut.fi>, March 97 * * Rewrote tty_init_dev and tty_release_dev to eliminate races. * -- Bill Hawes <whawes@star.net>, June 97 * * Added devfs support. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 13-Jan-1998 * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * * Reduced memory usage for older ARM systems * -- Russell King <rmk@arm.linux.org.uk> * * Move do_SAK() into process context. Less stack use in devfs functions. * alloc_tty_struct() always uses kmalloc() * -- Andrew Morton <andrewm@uow.edu.eu> 17Mar01 */ #include <linux/types.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/fcntl.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/devpts_fs.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/console.h> #include <linux/timer.h> #include <linux/ctype.h> #include <linux/kd.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/ppp-ioctl.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/device.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/seq_file.h> #include <linux/serial.h> #include <linux/ratelimit.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/termios_internal.h> #include <linux/fs.h> #include <linux/kbd_kern.h> #include <linux/vt_kern.h> #include <linux/selection.h> #include <linux/kmod.h> #include <linux/nsproxy.h> #include "tty.h" #undef TTY_DEBUG_HANGUP #ifdef TTY_DEBUG_HANGUP # define tty_debug_hangup(tty, f, args...) tty_debug(tty, f, ##args) #else # define tty_debug_hangup(tty, f, args...) do { } while (0) #endif #define TTY_PARANOIA_CHECK 1 #define CHECK_TTY_COUNT 1 struct ktermios tty_std_termios = { /* for the benefit of tty drivers */ .c_iflag = ICRNL | IXON, .c_oflag = OPOST | ONLCR, .c_cflag = B38400 | CS8 | CREAD | HUPCL, .c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN, .c_cc = INIT_C_CC, .c_ispeed = 38400, .c_ospeed = 38400, /* .c_line = N_TTY, */ }; EXPORT_SYMBOL(tty_std_termios); /* This list gets poked at by procfs and various bits of boot up code. This * could do with some rationalisation such as pulling the tty proc function * into this file. */ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ /* Mutex to protect creating and releasing a tty */ DEFINE_MUTEX(tty_mutex); static ssize_t tty_read(struct kiocb *, struct iov_iter *); static ssize_t tty_write(struct kiocb *, struct iov_iter *); static __poll_t tty_poll(struct file *, poll_table *); static int tty_open(struct inode *, struct file *); #ifdef CONFIG_COMPAT static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define tty_compat_ioctl NULL #endif static int __tty_fasync(int fd, struct file *filp, int on); static int tty_fasync(int fd, struct file *filp, int on); static void release_tty(struct tty_struct *tty, int idx); /** * free_tty_struct - free a disused tty * @tty: tty struct to free * * Free the write buffers, tty queue and tty memory itself. * * Locking: none. Must be called after tty is definitely unused */ static void free_tty_struct(struct tty_struct *tty) { tty_ldisc_deinit(tty); put_device(tty->dev); kvfree(tty->write_buf); kfree(tty); } static inline struct tty_struct *file_tty(struct file *file) { return ((struct tty_file_private *)file->private_data)->tty; } int tty_alloc_file(struct file *file) { struct tty_file_private *priv; priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; file->private_data = priv; return 0; } /* Associate a new file with the tty structure */ void tty_add_file(struct tty_struct *tty, struct file *file) { struct tty_file_private *priv = file->private_data; priv->tty = tty; priv->file = file; spin_lock(&tty->files_lock); list_add(&priv->list, &tty->tty_files); spin_unlock(&tty->files_lock); } /** * tty_free_file - free file->private_data * @file: to free private_data of * * This shall be used only for fail path handling when tty_add_file was not * called yet. */ void tty_free_file(struct file *file) { struct tty_file_private *priv = file->private_data; file->private_data = NULL; kfree(priv); } /* Delete file from its tty */ static void tty_del_file(struct file *file) { struct tty_file_private *priv = file->private_data; struct tty_struct *tty = priv->tty; spin_lock(&tty->files_lock); list_del(&priv->list); spin_unlock(&tty->files_lock); tty_free_file(file); } /** * tty_name - return tty naming * @tty: tty structure * * Convert a tty structure into a name. The name reflects the kernel naming * policy and if udev is in use may not reflect user space * * Locking: none */ const char *tty_name(const struct tty_struct *tty) { if (!tty) /* Hmm. NULL pointer. That's fun. */ return "NULL tty"; return tty->name; } EXPORT_SYMBOL(tty_name); const char *tty_driver_name(const struct tty_struct *tty) { if (!tty || !tty->driver) return ""; return tty->driver->name; } static int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine) { #ifdef TTY_PARANOIA_CHECK if (!tty) { pr_warn("(%d:%d): %s: NULL tty\n", imajor(inode), iminor(inode), routine); return 1; } #endif return 0; } /* Caller must hold tty_lock */ static void check_tty_count(struct tty_struct *tty, const char *routine) { #ifdef CHECK_TTY_COUNT struct list_head *p; int count = 0, kopen_count = 0; spin_lock(&tty->files_lock); list_for_each(p, &tty->tty_files) { count++; } spin_unlock(&tty->files_lock); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) count++; if (tty_port_kopened(tty->port)) kopen_count++; if (tty->count != (count + kopen_count)) { tty_warn(tty, "%s: tty->count(%d) != (#fd's(%d) + #kopen's(%d))\n", routine, tty->count, count, kopen_count); } #endif } /** * get_tty_driver - find device of a tty * @device: device identifier * @index: returns the index of the tty * * This routine returns a tty driver structure, given a device number and also * passes back the index number. * * Locking: caller must hold tty_mutex */ static struct tty_driver *get_tty_driver(dev_t device, int *index) { struct tty_driver *p; list_for_each_entry(p, &tty_drivers, tty_drivers) { dev_t base = MKDEV(p->major, p->minor_start); if (device < base || device >= base + p->num) continue; *index = device - base; return tty_driver_kref_get(p); } return NULL; } /** * tty_dev_name_to_number - return dev_t for device name * @name: user space name of device under /dev * @number: pointer to dev_t that this function will populate * * This function converts device names like ttyS0 or ttyUSB1 into dev_t like * (4, 64) or (188, 1). If no corresponding driver is registered then the * function returns -%ENODEV. * * Locking: this acquires tty_mutex to protect the tty_drivers list from * being modified while we are traversing it, and makes sure to * release it before exiting. */ int tty_dev_name_to_number(const char *name, dev_t *number) { struct tty_driver *p; int ret; int index, prefix_length = 0; const char *str; for (str = name; *str && !isdigit(*str); str++) ; if (!*str) return -EINVAL; ret = kstrtoint(str, 10, &index); if (ret) return ret; prefix_length = str - name; guard(mutex)(&tty_mutex); list_for_each_entry(p, &tty_drivers, tty_drivers) if (prefix_length == strlen(p->name) && strncmp(name, p->name, prefix_length) == 0) { if (index < p->num) { *number = MKDEV(p->major, p->minor_start + index); return 0; } } return -ENODEV; } EXPORT_SYMBOL_GPL(tty_dev_name_to_number); #ifdef CONFIG_CONSOLE_POLL /** * tty_find_polling_driver - find device of a polled tty * @name: name string to match * @line: pointer to resulting tty line nr * * This routine returns a tty driver structure, given a name and the condition * that the tty driver is capable of polled operation. */ struct tty_driver *tty_find_polling_driver(char *name, int *line) { struct tty_driver *p, *res = NULL; int tty_line = 0; int len; char *str, *stp; for (str = name; *str; str++) if ((*str >= '0' && *str <= '9') || *str == ',') break; if (!*str) return NULL; len = str - name; tty_line = simple_strtoul(str, &str, 10); mutex_lock(&tty_mutex); /* Search through the tty devices to look for a match */ list_for_each_entry(p, &tty_drivers, tty_drivers) { if (!len || strncmp(name, p->name, len) != 0) continue; stp = str; if (*stp == ',') stp++; if (*stp == '\0') stp = NULL; if (tty_line >= 0 && tty_line < p->num && p->ops && p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) { res = tty_driver_kref_get(p); *line = tty_line; break; } } mutex_unlock(&tty_mutex); return res; } EXPORT_SYMBOL_GPL(tty_find_polling_driver); #endif static ssize_t hung_up_tty_read(struct kiocb *iocb, struct iov_iter *to) { return 0; } static ssize_t hung_up_tty_write(struct kiocb *iocb, struct iov_iter *from) { return -EIO; } /* No kernel lock held - none needed ;) */ static __poll_t hung_up_tty_poll(struct file *filp, poll_table *wait) { return EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | EPOLLWRNORM; } static long hung_up_tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static long hung_up_tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return cmd == TIOCSPGRP ? -ENOTTY : -EIO; } static int hung_up_tty_fasync(int fd, struct file *file, int on) { return -ENOTTY; } static void tty_show_fdinfo(struct seq_file *m, struct file *file) { struct tty_struct *tty = file_tty(file); if (tty && tty->ops && tty->ops->show_fdinfo) tty->ops->show_fdinfo(tty, m); } static const struct file_operations tty_fops = { .read_iter = tty_read, .write_iter = tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, .show_fdinfo = tty_show_fdinfo, }; static const struct file_operations console_fops = { .read_iter = tty_read, .write_iter = redirected_tty_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, .open = tty_open, .release = tty_release, .fasync = tty_fasync, }; static const struct file_operations hung_up_tty_fops = { .read_iter = hung_up_tty_read, .write_iter = hung_up_tty_write, .poll = hung_up_tty_poll, .unlocked_ioctl = hung_up_tty_ioctl, .compat_ioctl = hung_up_tty_compat_ioctl, .release = tty_release, .fasync = hung_up_tty_fasync, }; static DEFINE_SPINLOCK(redirect_lock); static struct file *redirect; /** * tty_wakeup - request more data * @tty: terminal * * Internal and external helper for wakeups of tty. This function informs the * line discipline if present that the driver is ready to receive more output * data. */ void tty_wakeup(struct tty_struct *tty) { struct tty_ldisc *ld; if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { ld = tty_ldisc_ref(tty); if (ld) { if (ld->ops->write_wakeup) ld->ops->write_wakeup(tty); tty_ldisc_deref(ld); } } wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } EXPORT_SYMBOL_GPL(tty_wakeup); /** * tty_release_redirect - Release a redirect on a pty if present * @tty: tty device * * This is available to the pty code so if the master closes, if the slave is a * redirect it can release the redirect. */ static struct file *tty_release_redirect(struct tty_struct *tty) { struct file *f = NULL; spin_lock(&redirect_lock); if (redirect && file_tty(redirect) == tty) { f = redirect; redirect = NULL; } spin_unlock(&redirect_lock); return f; } /** * __tty_hangup - actual handler for hangup events * @tty: tty device * @exit_session: if non-zero, signal all foreground group processes * * This can be called by a "kworker" kernel thread. That is process synchronous * but doesn't hold any locks, so we need to make sure we have the appropriate * locks for what we're doing. * * The hangup event clears any pending redirections onto the hung up device. It * ensures future writes will error and it does the needed line discipline * hangup and signal delivery. The tty object itself remains intact. * * Locking: * * BTM * * * redirect lock for undoing redirection * * file list lock for manipulating list of ttys * * tty_ldiscs_lock from called functions * * termios_rwsem resetting termios data * * tasklist_lock to walk task list for hangup event * * * ->siglock to protect ->signal/->sighand * */ static void __tty_hangup(struct tty_struct *tty, int exit_session) { struct file *cons_filp = NULL; struct file *filp, *f; struct tty_file_private *priv; int closecount = 0, n; int refs; if (!tty) return; f = tty_release_redirect(tty); tty_lock(tty); if (test_bit(TTY_HUPPED, &tty->flags)) { tty_unlock(tty); return; } /* * Some console devices aren't actually hung up for technical and * historical reasons, which can lead to indefinite interruptible * sleep in n_tty_read(). The following explicitly tells * n_tty_read() to abort readers. */ set_bit(TTY_HUPPING, &tty->flags); /* inuse_filps is protected by the single tty lock, * this really needs to change if we want to flush the * workqueue with the lock held. */ check_tty_count(tty, "tty_hangup"); spin_lock(&tty->files_lock); /* This breaks for file handles being sent over AF_UNIX sockets ? */ list_for_each_entry(priv, &tty->tty_files, list) { filp = priv->file; if (filp->f_op->write_iter == redirected_tty_write) cons_filp = filp; if (filp->f_op->write_iter != tty_write) continue; closecount++; __tty_fasync(-1, filp, 0); /* can't block */ filp->f_op = &hung_up_tty_fops; } spin_unlock(&tty->files_lock); refs = tty_signal_session_leader(tty, exit_session); /* Account for the p->signal references we killed */ while (refs--) tty_kref_put(tty); tty_ldisc_hangup(tty, cons_filp != NULL); spin_lock_irq(&tty->ctrl.lock); clear_bit(TTY_THROTTLED, &tty->flags); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); put_pid(tty->ctrl.session); put_pid(tty->ctrl.pgrp); tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; tty->ctrl.pktstatus = 0; spin_unlock_irq(&tty->ctrl.lock); /* * If one of the devices matches a console pointer, we * cannot just call hangup() because that will cause * tty->count and state->count to go out of sync. * So we just call close() the right number of times. */ if (cons_filp) { if (tty->ops->close) for (n = 0; n < closecount; n++) tty->ops->close(tty, cons_filp); } else if (tty->ops->hangup) tty->ops->hangup(tty); /* * We don't want to have driver/ldisc interactions beyond the ones * we did here. The driver layer expects no calls after ->hangup() * from the ldisc side, which is now guaranteed. */ set_bit(TTY_HUPPED, &tty->flags); clear_bit(TTY_HUPPING, &tty->flags); tty_unlock(tty); if (f) fput(f); } static void do_tty_hangup(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); __tty_hangup(tty, 0); } /** * tty_hangup - trigger a hangup event * @tty: tty to hangup * * A carrier loss (virtual or otherwise) has occurred on @tty. Schedule a * hangup sequence to run after this event. */ void tty_hangup(struct tty_struct *tty) { tty_debug_hangup(tty, "hangup\n"); schedule_work(&tty->hangup_work); } EXPORT_SYMBOL(tty_hangup); /** * tty_vhangup - process vhangup * @tty: tty to hangup * * The user has asked via system call for the terminal to be hung up. We do * this synchronously so that when the syscall returns the process is complete. * That guarantee is necessary for security reasons. */ void tty_vhangup(struct tty_struct *tty) { tty_debug_hangup(tty, "vhangup\n"); __tty_hangup(tty, 0); } EXPORT_SYMBOL(tty_vhangup); /** * tty_vhangup_self - process vhangup for own ctty * * Perform a vhangup on the current controlling tty */ void tty_vhangup_self(void) { struct tty_struct *tty; tty = get_current_tty(); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } } /** * tty_vhangup_session - hangup session leader exit * @tty: tty to hangup * * The session leader is exiting and hanging up its controlling terminal. * Every process in the foreground process group is signalled %SIGHUP. * * We do this synchronously so that when the syscall returns the process is * complete. That guarantee is necessary for security reasons. */ void tty_vhangup_session(struct tty_struct *tty) { tty_debug_hangup(tty, "session hangup\n"); __tty_hangup(tty, 1); } /** * tty_hung_up_p - was tty hung up * @filp: file pointer of tty * * Return: true if the tty has been subject to a vhangup or a carrier loss */ int tty_hung_up_p(struct file *filp) { return (filp && filp->f_op == &hung_up_tty_fops); } EXPORT_SYMBOL(tty_hung_up_p); void __stop_tty(struct tty_struct *tty) { if (tty->flow.stopped) return; tty->flow.stopped = true; if (tty->ops->stop) tty->ops->stop(tty); } /** * stop_tty - propagate flow control * @tty: tty to stop * * Perform flow control to the driver. May be called on an already stopped * device and will not re-call the &tty_driver->stop() method. * * This functionality is used by both the line disciplines for halting incoming * flow and by the driver. It may therefore be called from any context, may be * under the tty %atomic_write_lock but not always. * * Locking: * flow.lock */ void stop_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->flow.lock, flags); __stop_tty(tty); spin_unlock_irqrestore(&tty->flow.lock, flags); } EXPORT_SYMBOL(stop_tty); void __start_tty(struct tty_struct *tty) { if (!tty->flow.stopped || tty->flow.tco_stopped) return; tty->flow.stopped = false; if (tty->ops->start) tty->ops->start(tty); tty_wakeup(tty); } /** * start_tty - propagate flow control * @tty: tty to start * * Start a tty that has been stopped if at all possible. If @tty was previously * stopped and is now being started, the &tty_driver->start() method is invoked * and the line discipline woken. * * Locking: * flow.lock */ void start_tty(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty->flow.lock, flags); __start_tty(tty); spin_unlock_irqrestore(&tty->flow.lock, flags); } EXPORT_SYMBOL(start_tty); static void tty_update_time(struct tty_struct *tty, bool mtime) { time64_t sec = ktime_get_real_seconds(); struct tty_file_private *priv; spin_lock(&tty->files_lock); list_for_each_entry(priv, &tty->tty_files, list) { struct inode *inode = file_inode(priv->file); struct timespec64 time = mtime ? inode_get_mtime(inode) : inode_get_atime(inode); /* * We only care if the two values differ in anything other than the * lower three bits (i.e every 8 seconds). If so, then we can update * the time of the tty device, otherwise it could be construded as a * security leak to let userspace know the exact timing of the tty. */ if ((sec ^ time.tv_sec) & ~7) { if (mtime) inode_set_mtime(inode, sec, 0); else inode_set_atime(inode, sec, 0); } } spin_unlock(&tty->files_lock); } /* * Iterate on the ldisc ->read() function until we've gotten all * the data the ldisc has for us. * * The "cookie" is something that the ldisc read function can fill * in to let us know that there is more data to be had. * * We promise to continue to call the ldisc until it stops returning * data or clears the cookie. The cookie may be something that the * ldisc maintains state for and needs to free. */ static ssize_t iterate_tty_read(struct tty_ldisc *ld, struct tty_struct *tty, struct file *file, struct iov_iter *to) { void *cookie = NULL; unsigned long offset = 0; ssize_t retval = 0; size_t copied, count = iov_iter_count(to); u8 kernel_buf[64]; do { ssize_t size = min(count, sizeof(kernel_buf)); size = ld->ops->read(tty, file, kernel_buf, size, &cookie, offset); if (!size) break; if (size < 0) { /* Did we have an earlier error (ie -EFAULT)? */ if (retval) break; retval = size; /* * -EOVERFLOW means we didn't have enough space * for a whole packet, and we shouldn't return * a partial result. */ if (retval == -EOVERFLOW) offset = 0; break; } copied = copy_to_iter(kernel_buf, size, to); offset += copied; count -= copied; /* * If the user copy failed, we still need to do another ->read() * call if we had a cookie to let the ldisc clear up. * * But make sure size is zeroed. */ if (unlikely(copied != size)) { count = 0; retval = -EFAULT; } } while (cookie); /* We always clear tty buffer in case they contained passwords */ memzero_explicit(kernel_buf, sizeof(kernel_buf)); return offset ? offset : retval; } /** * tty_read - read method for tty device files * @iocb: kernel I/O control block * @to: destination for the data read * * Perform the read system call function on this terminal device. Checks * for hung up devices before calling the line discipline method. * * Locking: * Locks the line discipline internally while needed. Multiple read calls * may be outstanding in parallel. */ static ssize_t tty_read(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; ssize_t ret; if (tty_paranoia_check(tty, inode, "tty_read")) return -EIO; if (!tty || tty_io_error(tty)) return -EIO; /* We want to wait for the line discipline to sort out in this * situation. */ ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_read(iocb, to); ret = -EIO; if (ld->ops->read) ret = iterate_tty_read(ld, tty, file, to); tty_ldisc_deref(ld); if (ret > 0) tty_update_time(tty, false); return ret; } void tty_write_unlock(struct tty_struct *tty) { mutex_unlock(&tty->atomic_write_lock); wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); } int tty_write_lock(struct tty_struct *tty, bool ndelay) { if (!mutex_trylock(&tty->atomic_write_lock)) { if (ndelay) return -EAGAIN; if (mutex_lock_interruptible(&tty->atomic_write_lock)) return -ERESTARTSYS; } return 0; } /* * Split writes up in sane blocksizes to avoid * denial-of-service type attacks */ static ssize_t iterate_tty_write(struct tty_ldisc *ld, struct tty_struct *tty, struct file *file, struct iov_iter *from) { size_t chunk, count = iov_iter_count(from); ssize_t ret, written = 0; ret = tty_write_lock(tty, file->f_flags & O_NDELAY); if (ret < 0) return ret; /* * We chunk up writes into a temporary buffer. This * simplifies low-level drivers immensely, since they * don't have locking issues and user mode accesses. * * But if TTY_NO_WRITE_SPLIT is set, we should use a * big chunk-size.. * * The default chunk-size is 2kB, because the NTTY * layer has problems with bigger chunks. It will * claim to be able to handle more characters than * it actually does. */ chunk = 2048; if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags)) chunk = 65536; if (count < chunk) chunk = count; /* write_buf/write_cnt is protected by the atomic_write_lock mutex */ if (tty->write_cnt < chunk) { u8 *buf_chunk; if (chunk < 1024) chunk = 1024; buf_chunk = kvmalloc(chunk, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!buf_chunk) { ret = -ENOMEM; goto out; } kvfree(tty->write_buf); tty->write_cnt = chunk; tty->write_buf = buf_chunk; } /* Do the write .. */ for (;;) { size_t size = min(chunk, count); ret = -EFAULT; if (copy_from_iter(tty->write_buf, size, from) != size) break; ret = ld->ops->write(tty, file, tty->write_buf, size); if (ret <= 0) break; written += ret; if (ret > size) break; /* FIXME! Have Al check this! */ if (ret != size) iov_iter_revert(from, size-ret); count -= ret; if (!count) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; cond_resched(); } if (written) { tty_update_time(tty, true); ret = written; } out: tty_write_unlock(tty); return ret; } #ifdef CONFIG_PRINT_QUOTA_WARNING /** * tty_write_message - write a message to a certain tty, not just the console. * @tty: the destination tty_struct * @msg: the message to write * * This is used for messages that need to be redirected to a specific tty. We * don't put it into the syslog queue right now maybe in the future if really * needed. * * We must still hold the BTM and test the CLOSING flag for the moment. * * This function is DEPRECATED, do not use in new code. */ void tty_write_message(struct tty_struct *tty, char *msg) { if (tty) { mutex_lock(&tty->atomic_write_lock); tty_lock(tty); if (tty->ops->write && tty->count > 0) tty->ops->write(tty, msg, strlen(msg)); tty_unlock(tty); tty_write_unlock(tty); } } #endif static ssize_t file_tty_write(struct file *file, struct kiocb *iocb, struct iov_iter *from) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; ssize_t ret; if (tty_paranoia_check(tty, file_inode(file), "tty_write")) return -EIO; if (!tty || !tty->ops->write || tty_io_error(tty)) return -EIO; /* Short term debug to catch buggy drivers */ if (tty->ops->write_room == NULL) tty_err(tty, "missing write_room method\n"); ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_write(iocb, from); if (!ld->ops->write) ret = -EIO; else ret = iterate_tty_write(ld, tty, file, from); tty_ldisc_deref(ld); return ret; } /** * tty_write - write method for tty device file * @iocb: kernel I/O control block * @from: iov_iter with data to write * * Write data to a tty device via the line discipline. * * Locking: * Locks the line discipline as required * Writes to the tty driver are serialized by the atomic_write_lock * and are then processed in chunks to the device. The line * discipline write method will not be invoked in parallel for * each device. */ static ssize_t tty_write(struct kiocb *iocb, struct iov_iter *from) { return file_tty_write(iocb->ki_filp, iocb, from); } ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *p = NULL; spin_lock(&redirect_lock); if (redirect) p = get_file(redirect); spin_unlock(&redirect_lock); /* * We know the redirected tty is just another tty, we can * call file_tty_write() directly with that file pointer. */ if (p) { ssize_t res; res = file_tty_write(p, iocb, iter); fput(p); return res; } return tty_write(iocb, iter); } /** * tty_send_xchar - send priority character * @tty: the tty to send to * @ch: xchar to send * * Send a high priority character to the tty even if stopped. * * Locking: none for xchar method, write ordering for write method. */ int tty_send_xchar(struct tty_struct *tty, u8 ch) { bool was_stopped = tty->flow.stopped; if (tty->ops->send_xchar) { down_read(&tty->termios_rwsem); tty->ops->send_xchar(tty, ch); up_read(&tty->termios_rwsem); return 0; } if (tty_write_lock(tty, false) < 0) return -ERESTARTSYS; down_read(&tty->termios_rwsem); if (was_stopped) start_tty(tty); tty->ops->write(tty, &ch, 1); if (was_stopped) stop_tty(tty); up_read(&tty->termios_rwsem); tty_write_unlock(tty); return 0; } /** * pty_line_name - generate name for a pty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 6 bytes * * Generate a name from a @driver reference and write it to the output buffer * @p. * * Locking: None */ static void pty_line_name(struct tty_driver *driver, int index, char *p) { static const char ptychar[] = "pqrstuvwxyzabcde"; int i = index + driver->name_base; /* ->name is initialized to "ttyp", but "tty" is expected */ sprintf(p, "%s%c%x", driver->subtype == PTY_TYPE_SLAVE ? "tty" : driver->name, ptychar[i >> 4 & 0xf], i & 0xf); } /** * tty_line_name - generate name for a tty * @driver: the tty driver in use * @index: the minor number * @p: output buffer of at least 7 bytes * * Generate a name from a @driver reference and write it to the output buffer * @p. * * Locking: None */ static ssize_t tty_line_name(struct tty_driver *driver, int index, char *p) { if (driver->flags & TTY_DRIVER_UNNUMBERED_NODE) return sprintf(p, "%s", driver->name); else return sprintf(p, "%s%d", driver->name, index + driver->name_base); } /** * tty_driver_lookup_tty() - find an existing tty, if any * @driver: the driver for the tty * @file: file object * @idx: the minor number * * Return: the tty, if found. If not found, return %NULL or ERR_PTR() if the * driver lookup() method returns an error. * * Locking: tty_mutex must be held. If the tty is found, bump the tty kref. */ static struct tty_struct *tty_driver_lookup_tty(struct tty_driver *driver, struct file *file, int idx) { struct tty_struct *tty; if (driver->ops->lookup) { if (!file) tty = ERR_PTR(-EIO); else tty = driver->ops->lookup(driver, file, idx); } else { if (idx >= driver->num) return ERR_PTR(-EINVAL); tty = driver->ttys[idx]; } if (!IS_ERR(tty)) tty_kref_get(tty); return tty; } /** * tty_init_termios - helper for termios setup * @tty: the tty to set up * * Initialise the termios structure for this tty. This runs under the * %tty_mutex currently so we can be relaxed about ordering. */ void tty_init_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) tty->termios = tty->driver->init_termios; else { /* Check for lazy saved data */ tp = tty->driver->termios[idx]; if (tp != NULL) { tty->termios = *tp; tty->termios.c_line = tty->driver->init_termios.c_line; } else tty->termios = tty->driver->init_termios; } /* Compatibility until drivers always set this */ tty->termios.c_ispeed = tty_termios_input_baud_rate(&tty->termios); tty->termios.c_ospeed = tty_termios_baud_rate(&tty->termios); } EXPORT_SYMBOL_GPL(tty_init_termios); /** * tty_standard_install - usual tty->ops->install * @driver: the driver for the tty * @tty: the tty * * If the @driver overrides @tty->ops->install, it still can call this function * to perform the standard install operations. */ int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty) { tty_init_termios(tty); tty_driver_kref_get(driver); tty->count++; driver->ttys[tty->index] = tty; return 0; } EXPORT_SYMBOL_GPL(tty_standard_install); /** * tty_driver_install_tty() - install a tty entry in the driver * @driver: the driver for the tty * @tty: the tty * * Install a tty object into the driver tables. The @tty->index field will be * set by the time this is called. This method is responsible for ensuring any * need additional structures are allocated and configured. * * Locking: tty_mutex for now */ static int tty_driver_install_tty(struct tty_driver *driver, struct tty_struct *tty) { return driver->ops->install ? driver->ops->install(driver, tty) : tty_standard_install(driver, tty); } /** * tty_driver_remove_tty() - remove a tty from the driver tables * @driver: the driver for the tty * @tty: tty to remove * * Remove a tty object from the driver tables. The tty->index field will be set * by the time this is called. * * Locking: tty_mutex for now */ static void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct *tty) { if (driver->ops->remove) driver->ops->remove(driver, tty); else driver->ttys[tty->index] = NULL; } /** * tty_reopen() - fast re-open of an open tty * @tty: the tty to open * * Re-opens on master ptys are not allowed and return -%EIO. * * Locking: Caller must hold tty_lock * Return: 0 on success, -errno on error. */ static int tty_reopen(struct tty_struct *tty) { struct tty_driver *driver = tty->driver; struct tty_ldisc *ld; int retval = 0; if (driver->type == TTY_DRIVER_TYPE_PTY && driver->subtype == PTY_TYPE_MASTER) return -EIO; if (!tty->count) return -EAGAIN; if (test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN)) return -EBUSY; ld = tty_ldisc_ref_wait(tty); if (ld) { tty_ldisc_deref(ld); } else { retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) return retval; if (!tty->ldisc) retval = tty_ldisc_reinit(tty, tty->termios.c_line); tty_ldisc_unlock(tty); } if (retval == 0) tty->count++; return retval; } /** * tty_init_dev - initialise a tty device * @driver: tty driver we are opening a device on * @idx: device index * * Prepare a tty device. This may not be a "new" clean device but could also be * an active device. The pty drivers require special handling because of this. * * Locking: * The function is called under the tty_mutex, which protects us from the * tty struct or driver itself going away. * * On exit the tty device has the line discipline attached and a reference * count of 1. If a pair was created for pty/tty use and the other was a pty * master then it too has a reference count of 1. * * WSH 06/09/97: Rewritten to remove races and properly clean up after a failed * open. The new code protects the open with a mutex, so it's really quite * straightforward. The mutex locking can probably be relaxed for the (most * common) case of reopening a tty. * * Return: new tty structure */ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) { struct tty_struct *tty; int retval; /* * First time open is complex, especially for PTY devices. * This code guarantees that either everything succeeds and the * TTY is ready for operation, or else the table slots are vacated * and the allocated memory released. (Except that the termios * may be retained.) */ if (!try_module_get(driver->owner)) return ERR_PTR(-ENODEV); tty = alloc_tty_struct(driver, idx); if (!tty) { retval = -ENOMEM; goto err_module_put; } tty_lock(tty); retval = tty_driver_install_tty(driver, tty); if (retval < 0) goto err_free_tty; if (!tty->port) tty->port = driver->ports[idx]; if (WARN_RATELIMIT(!tty->port, "%s: %s driver does not set tty->port. This would crash the kernel. Fix the driver!\n", __func__, tty->driver->name)) { retval = -EINVAL; goto err_release_lock; } retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) goto err_release_lock; tty->port->itty = tty; /* * Structures all installed ... call the ldisc open routines. * If we fail here just call release_tty to clean up. No need * to decrement the use counts, as release_tty doesn't care. */ retval = tty_ldisc_setup(tty, tty->link); if (retval) goto err_release_tty; tty_ldisc_unlock(tty); /* Return the tty locked so that it cannot vanish under the caller */ return tty; err_free_tty: tty_unlock(tty); free_tty_struct(tty); err_module_put: module_put(driver->owner); return ERR_PTR(retval); /* call the tty release_tty routine to clean out this slot */ err_release_tty: tty_ldisc_unlock(tty); tty_info_ratelimited(tty, "ldisc open failed (%d), clearing slot %d\n", retval, idx); err_release_lock: tty_unlock(tty); release_tty(tty, idx); return ERR_PTR(retval); } /** * tty_save_termios() - save tty termios data in driver table * @tty: tty whose termios data to save * * Locking: Caller guarantees serialisation with tty_init_termios(). */ void tty_save_termios(struct tty_struct *tty) { struct ktermios *tp; int idx = tty->index; /* If the port is going to reset then it has no termios to save */ if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) return; /* Stash the termios data */ tp = tty->driver->termios[idx]; if (tp == NULL) { tp = kmalloc(sizeof(*tp), GFP_KERNEL); if (tp == NULL) return; tty->driver->termios[idx] = tp; } *tp = tty->termios; } EXPORT_SYMBOL_GPL(tty_save_termios); /** * tty_flush_works - flush all works of a tty/pty pair * @tty: tty device to flush works for (or either end of a pty pair) * * Sync flush all works belonging to @tty (and the 'other' tty). */ static void tty_flush_works(struct tty_struct *tty) { flush_work(&tty->SAK_work); flush_work(&tty->hangup_work); if (tty->link) { flush_work(&tty->link->SAK_work); flush_work(&tty->link->hangup_work); } } /** * release_one_tty - release tty structure memory * @work: work of tty we are obliterating * * Releases memory associated with a tty structure, and clears out the * driver table slots. This function is called when a device is no longer * in use. It also gets called when setup of a device fails. * * Locking: * takes the file list lock internally when working on the list of ttys * that the driver keeps. * * This method gets called from a work queue so that the driver private * cleanup ops can sleep (needed for USB at least) */ static void release_one_tty(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, hangup_work); struct tty_driver *driver = tty->driver; struct module *owner = driver->owner; if (tty->ops->cleanup) tty->ops->cleanup(tty); tty_driver_kref_put(driver); module_put(owner); spin_lock(&tty->files_lock); list_del_init(&tty->tty_files); spin_unlock(&tty->files_lock); put_pid(tty->ctrl.pgrp); put_pid(tty->ctrl.session); free_tty_struct(tty); } static void queue_release_one_tty(struct kref *kref) { struct tty_struct *tty = container_of(kref, struct tty_struct, kref); /* The hangup queue is now free so we can reuse it rather than * waste a chunk of memory for each port. */ INIT_WORK(&tty->hangup_work, release_one_tty); schedule_work(&tty->hangup_work); } /** * tty_kref_put - release a tty kref * @tty: tty device * * Release a reference to the @tty device and if need be let the kref layer * destruct the object for us. */ void tty_kref_put(struct tty_struct *tty) { if (tty) kref_put(&tty->kref, queue_release_one_tty); } EXPORT_SYMBOL(tty_kref_put); /** * release_tty - release tty structure memory * @tty: tty device release * @idx: index of the tty device release * * Release both @tty and a possible linked partner (think pty pair), * and decrement the refcount of the backing module. * * Locking: * tty_mutex * takes the file list lock internally when working on the list of ttys * that the driver keeps. */ static void release_tty(struct tty_struct *tty, int idx) { /* This should always be true but check for the moment */ WARN_ON(tty->index != idx); WARN_ON(!mutex_is_locked(&tty_mutex)); if (tty->ops->shutdown) tty->ops->shutdown(tty); tty_save_termios(tty); tty_driver_remove_tty(tty->driver, tty); if (tty->port) tty->port->itty = NULL; if (tty->link) tty->link->port->itty = NULL; if (tty->port) tty_buffer_cancel_work(tty->port); if (tty->link) tty_buffer_cancel_work(tty->link->port); tty_kref_put(tty->link); tty_kref_put(tty); } /** * tty_release_checks - check a tty before real release * @tty: tty to check * @idx: index of the tty * * Performs some paranoid checking before true release of the @tty. This is a * no-op unless %TTY_PARANOIA_CHECK is defined. */ static int tty_release_checks(struct tty_struct *tty, int idx) { #ifdef TTY_PARANOIA_CHECK if (idx < 0 || idx >= tty->driver->num) { tty_debug(tty, "bad idx %d\n", idx); return -1; } /* not much to check for devpts */ if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) return 0; if (tty != tty->driver->ttys[idx]) { tty_debug(tty, "bad driver table[%d] = %p\n", idx, tty->driver->ttys[idx]); return -1; } if (tty->driver->other) { struct tty_struct *o_tty = tty->link; if (o_tty != tty->driver->other->ttys[idx]) { tty_debug(tty, "bad other table[%d] = %p\n", idx, tty->driver->other->ttys[idx]); return -1; } if (o_tty->link != tty) { tty_debug(tty, "bad link = %p\n", o_tty->link); return -1; } } #endif return 0; } /** * tty_kclose - closes tty opened by tty_kopen * @tty: tty device * * Performs the final steps to release and free a tty device. It is the same as * tty_release_struct() except that it also resets %TTY_PORT_KOPENED flag on * @tty->port. */ void tty_kclose(struct tty_struct *tty) { /* * Ask the line discipline code to release its structures */ tty_ldisc_release(tty); /* Wait for pending work before tty destruction commences */ tty_flush_works(tty); tty_debug_hangup(tty, "freeing structure\n"); /* * The release_tty function takes care of the details of clearing * the slots and preserving the termios structure. */ mutex_lock(&tty_mutex); tty_port_set_kopened(tty->port, 0); release_tty(tty, tty->index); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL_GPL(tty_kclose); /** * tty_release_struct - release a tty struct * @tty: tty device * @idx: index of the tty * * Performs the final steps to release and free a tty device. It is roughly the * reverse of tty_init_dev(). */ void tty_release_struct(struct tty_struct *tty, int idx) { /* * Ask the line discipline code to release its structures */ tty_ldisc_release(tty); /* Wait for pending work before tty destruction commmences */ tty_flush_works(tty); tty_debug_hangup(tty, "freeing structure\n"); /* * The release_tty function takes care of the details of clearing * the slots and preserving the termios structure. */ mutex_lock(&tty_mutex); release_tty(tty, idx); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL_GPL(tty_release_struct); /** * tty_release - vfs callback for close * @inode: inode of tty * @filp: file pointer for handle to tty * * Called the last time each file handle is closed that references this tty. * There may however be several such references. * * Locking: * Takes BKL. See tty_release_dev(). * * Even releasing the tty structures is a tricky business. We have to be very * careful that the structures are all released at the same time, as interrupts * might otherwise get the wrong pointers. * * WSH 09/09/97: rewritten to avoid some nasty race conditions that could * lead to double frees or releasing memory still in use. */ int tty_release(struct inode *inode, struct file *filp) { struct tty_struct *tty = file_tty(filp); struct tty_struct *o_tty = NULL; int do_sleep, final; int idx; long timeout = 0; int once = 1; if (tty_paranoia_check(tty, inode, __func__)) return 0; tty_lock(tty); check_tty_count(tty, __func__); __tty_fasync(-1, filp, 0); idx = tty->index; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) o_tty = tty->link; if (tty_release_checks(tty, idx)) { tty_unlock(tty); return 0; } tty_debug_hangup(tty, "releasing (count=%d)\n", tty->count); if (tty->ops->close) tty->ops->close(tty, filp); /* If tty is pty master, lock the slave pty (stable lock order) */ tty_lock_slave(o_tty); /* * Sanity check: if tty->count is going to zero, there shouldn't be * any waiters on tty->read_wait or tty->write_wait. We test the * wait queues and kick everyone out _before_ actually starting to * close. This ensures that we won't block while releasing the tty * structure. * * The test for the o_tty closing is necessary, since the master and * slave sides may close in any order. If the slave side closes out * first, its count will be one, since the master side holds an open. * Thus this test wouldn't be triggered at the time the slave closed, * so we do it now. */ while (1) { do_sleep = 0; if (tty->count <= 1) { if (waitqueue_active(&tty->read_wait)) { wake_up_poll(&tty->read_wait, EPOLLIN); do_sleep++; } if (waitqueue_active(&tty->write_wait)) { wake_up_poll(&tty->write_wait, EPOLLOUT); do_sleep++; } } if (o_tty && o_tty->count <= 1) { if (waitqueue_active(&o_tty->read_wait)) { wake_up_poll(&o_tty->read_wait, EPOLLIN); do_sleep++; } if (waitqueue_active(&o_tty->write_wait)) { wake_up_poll(&o_tty->write_wait, EPOLLOUT); do_sleep++; } } if (!do_sleep) break; if (once) { once = 0; tty_warn(tty, "read/write wait queue active!\n"); } schedule_timeout_killable(timeout); if (timeout < 120 * HZ) timeout = 2 * timeout + 1; else timeout = MAX_SCHEDULE_TIMEOUT; } if (o_tty) { if (--o_tty->count < 0) { tty_warn(tty, "bad slave count (%d)\n", o_tty->count); o_tty->count = 0; } } if (--tty->count < 0) { tty_warn(tty, "bad tty->count (%d)\n", tty->count); tty->count = 0; } /* * We've decremented tty->count, so we need to remove this file * descriptor off the tty->tty_files list; this serves two * purposes: * - check_tty_count sees the correct number of file descriptors * associated with this tty. * - do_tty_hangup no longer sees this file descriptor as * something that needs to be handled for hangups. */ tty_del_file(filp); /* * Perform some housekeeping before deciding whether to return. * * If _either_ side is closing, make sure there aren't any * processes that still think tty or o_tty is their controlling * tty. */ if (!tty->count) { read_lock(&tasklist_lock); session_clear_tty(tty->ctrl.session); if (o_tty) session_clear_tty(o_tty->ctrl.session); read_unlock(&tasklist_lock); } /* check whether both sides are closing ... */ final = !tty->count && !(o_tty && o_tty->count); tty_unlock_slave(o_tty); tty_unlock(tty); /* At this point, the tty->count == 0 should ensure a dead tty * cannot be re-opened by a racing opener. */ if (!final) return 0; tty_debug_hangup(tty, "final close\n"); tty_release_struct(tty, idx); return 0; } /** * tty_open_current_tty - get locked tty of current task * @device: device number * @filp: file pointer to tty * @return: locked tty of the current task iff @device is /dev/tty * * Performs a re-open of the current task's controlling tty. * * We cannot return driver and index like for the other nodes because devpts * will not work then. It expects inodes to be from devpts FS. */ static struct tty_struct *tty_open_current_tty(dev_t device, struct file *filp) { struct tty_struct *tty; int retval; if (device != MKDEV(TTYAUX_MAJOR, 0)) return NULL; tty = get_current_tty(); if (!tty) return ERR_PTR(-ENXIO); filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ tty_lock(tty); tty_kref_put(tty); /* safe to drop the kref now */ retval = tty_reopen(tty); if (retval < 0) { tty_unlock(tty); tty = ERR_PTR(retval); } return tty; } /** * tty_lookup_driver - lookup a tty driver for a given device file * @device: device number * @filp: file pointer to tty * @index: index for the device in the @return driver * * If returned value is not erroneous, the caller is responsible to decrement * the refcount by tty_driver_kref_put(). * * Locking: %tty_mutex protects get_tty_driver() * * Return: driver for this inode (with increased refcount) */ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp, int *index) { struct tty_driver *driver = NULL; switch (device) { #ifdef CONFIG_VT case MKDEV(TTY_MAJOR, 0): { extern struct tty_driver *console_driver; driver = tty_driver_kref_get(console_driver); *index = fg_console; break; } #endif case MKDEV(TTYAUX_MAJOR, 1): { struct tty_driver *console_driver = console_device(index); if (console_driver) { driver = tty_driver_kref_get(console_driver); if (driver && filp) { /* Don't let /dev/console block */ filp->f_flags |= O_NONBLOCK; break; } } if (driver) tty_driver_kref_put(driver); return ERR_PTR(-ENODEV); } default: driver = get_tty_driver(device, index); if (!driver) return ERR_PTR(-ENODEV); break; } return driver; } static struct tty_struct *tty_kopen(dev_t device, int shared) { struct tty_struct *tty; struct tty_driver *driver; int index = -1; mutex_lock(&tty_mutex); driver = tty_lookup_driver(device, NULL, &index); if (IS_ERR(driver)) { mutex_unlock(&tty_mutex); return ERR_CAST(driver); } /* check whether we're reopening an existing tty */ tty = tty_driver_lookup_tty(driver, NULL, index); if (IS_ERR(tty) || shared) goto out; if (tty) { /* drop kref from tty_driver_lookup_tty() */ tty_kref_put(tty); tty = ERR_PTR(-EBUSY); } else { /* tty_init_dev returns tty with the tty_lock held */ tty = tty_init_dev(driver, index); if (IS_ERR(tty)) goto out; tty_port_set_kopened(tty->port, 1); } out: mutex_unlock(&tty_mutex); tty_driver_kref_put(driver); return tty; } /** * tty_kopen_exclusive - open a tty device for kernel * @device: dev_t of device to open * * Opens tty exclusively for kernel. Performs the driver lookup, makes sure * it's not already opened and performs the first-time tty initialization. * * Claims the global %tty_mutex to serialize: * * concurrent first-time tty initialization * * concurrent tty driver removal w/ lookup * * concurrent tty removal from driver table * * Return: the locked initialized &tty_struct */ struct tty_struct *tty_kopen_exclusive(dev_t device) { return tty_kopen(device, 0); } EXPORT_SYMBOL_GPL(tty_kopen_exclusive); /** * tty_kopen_shared - open a tty device for shared in-kernel use * @device: dev_t of device to open * * Opens an already existing tty for in-kernel use. Compared to * tty_kopen_exclusive() above it doesn't ensure to be the only user. * * Locking: identical to tty_kopen() above. */ struct tty_struct *tty_kopen_shared(dev_t device) { return tty_kopen(device, 1); } EXPORT_SYMBOL_GPL(tty_kopen_shared); /** * tty_open_by_driver - open a tty device * @device: dev_t of device to open * @filp: file pointer to tty * * Performs the driver lookup, checks for a reopen, or otherwise performs the * first-time tty initialization. * * * Claims the global tty_mutex to serialize: * * concurrent first-time tty initialization * * concurrent tty driver removal w/ lookup * * concurrent tty removal from driver table * * Return: the locked initialized or re-opened &tty_struct */ static struct tty_struct *tty_open_by_driver(dev_t device, struct file *filp) { struct tty_struct *tty; struct tty_driver *driver = NULL; int index = -1; int retval; mutex_lock(&tty_mutex); driver = tty_lookup_driver(device, filp, &index); if (IS_ERR(driver)) { mutex_unlock(&tty_mutex); return ERR_CAST(driver); } /* check whether we're reopening an existing tty */ tty = tty_driver_lookup_tty(driver, filp, index); if (IS_ERR(tty)) { mutex_unlock(&tty_mutex); goto out; } if (tty) { if (tty_port_kopened(tty->port)) { tty_kref_put(tty); mutex_unlock(&tty_mutex); tty = ERR_PTR(-EBUSY); goto out; } mutex_unlock(&tty_mutex); retval = tty_lock_interruptible(tty); tty_kref_put(tty); /* drop kref from tty_driver_lookup_tty() */ if (retval) { if (retval == -EINTR) retval = -ERESTARTSYS; tty = ERR_PTR(retval); goto out; } retval = tty_reopen(tty); if (retval < 0) { tty_unlock(tty); tty = ERR_PTR(retval); } } else { /* Returns with the tty_lock held for now */ tty = tty_init_dev(driver, index); mutex_unlock(&tty_mutex); } out: tty_driver_kref_put(driver); return tty; } /** * tty_open - open a tty device * @inode: inode of device file * @filp: file pointer to tty * * tty_open() and tty_release() keep up the tty count that contains the number * of opens done on a tty. We cannot use the inode-count, as different inodes * might point to the same tty. * * Open-counting is needed for pty masters, as well as for keeping track of * serial lines: DTR is dropped when the last close happens. * (This is not done solely through tty->count, now. - Ted 1/27/92) * * The termios state of a pty is reset on the first open so that settings don't * persist across reuse. * * Locking: * * %tty_mutex protects tty, tty_lookup_driver() and tty_init_dev(). * * @tty->count should protect the rest. * * ->siglock protects ->signal/->sighand * * Note: the tty_unlock/lock cases without a ref are only safe due to %tty_mutex */ static int tty_open(struct inode *inode, struct file *filp) { struct tty_struct *tty; int noctty, retval; dev_t device = inode->i_rdev; unsigned saved_flags = filp->f_flags; nonseekable_open(inode, filp); retry_open: retval = tty_alloc_file(filp); if (retval) return -ENOMEM; tty = tty_open_current_tty(device, filp); if (!tty) tty = tty_open_by_driver(device, filp); if (IS_ERR(tty)) { tty_free_file(filp); retval = PTR_ERR(tty); if (retval != -EAGAIN || signal_pending(current)) return retval; schedule(); goto retry_open; } tty_add_file(tty, filp); check_tty_count(tty, __func__); tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); if (tty->ops->open) retval = tty->ops->open(tty, filp); else retval = -ENODEV; filp->f_flags = saved_flags; if (retval) { tty_debug_hangup(tty, "open error %d, releasing\n", retval); tty_unlock(tty); /* need to call tty_release without BTM */ tty_release(inode, filp); if (retval != -ERESTARTSYS) return retval; if (signal_pending(current)) return retval; schedule(); /* * Need to reset f_op in case a hangup happened. */ if (tty_hung_up_p(filp)) filp->f_op = &tty_fops; goto retry_open; } clear_bit(TTY_HUPPED, &tty->flags); noctty = (filp->f_flags & O_NOCTTY) || (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) || device == MKDEV(TTYAUX_MAJOR, 1) || (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER); if (!noctty) tty_open_proc_set_tty(filp, tty); tty_unlock(tty); return 0; } /** * tty_poll - check tty status * @filp: file being polled * @wait: poll wait structures to update * * Call the line discipline polling method to obtain the poll status of the * device. * * Locking: locks called line discipline but ldisc poll method may be * re-entered freely by other callers. */ static __poll_t tty_poll(struct file *filp, poll_table *wait) { struct tty_struct *tty = file_tty(filp); struct tty_ldisc *ld; __poll_t ret = 0; if (tty_paranoia_check(tty, file_inode(filp), "tty_poll")) return 0; ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_poll(filp, wait); if (ld->ops->poll) ret = ld->ops->poll(tty, filp, wait); tty_ldisc_deref(ld); return ret; } static int __tty_fasync(int fd, struct file *filp, int on) { struct tty_struct *tty = file_tty(filp); unsigned long flags; int retval = 0; if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync")) goto out; if (on) { retval = file_f_owner_allocate(filp); if (retval) goto out; } retval = fasync_helper(fd, filp, on, &tty->fasync); if (retval <= 0) goto out; if (on) { enum pid_type type; struct pid *pid; spin_lock_irqsave(&tty->ctrl.lock, flags); if (tty->ctrl.pgrp) { pid = tty->ctrl.pgrp; type = PIDTYPE_PGID; } else { pid = task_pid(current); type = PIDTYPE_TGID; } get_pid(pid); spin_unlock_irqrestore(&tty->ctrl.lock, flags); __f_setown(filp, pid, type, 0); put_pid(pid); retval = 0; } out: return retval; } static int tty_fasync(int fd, struct file *filp, int on) { struct tty_struct *tty = file_tty(filp); int retval = -ENOTTY; tty_lock(tty); if (!tty_hung_up_p(filp)) retval = __tty_fasync(fd, filp, on); tty_unlock(tty); return retval; } static bool tty_legacy_tiocsti __read_mostly = IS_ENABLED(CONFIG_LEGACY_TIOCSTI); /** * tiocsti - fake input character * @tty: tty to fake input into * @p: pointer to character * * Fake input to a tty device. Does the necessary locking and input management. * * FIXME: does not honour flow control ?? * * Locking: * * Called functions take tty_ldiscs_lock * * current->signal->tty check is safe without locks */ static int tiocsti(struct tty_struct *tty, u8 __user *p) { struct tty_ldisc *ld; u8 ch; if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN)) return -EIO; if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, p)) return -EFAULT; tty_audit_tiocsti(tty, ch); ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; tty_buffer_lock_exclusive(tty->port); if (ld->ops->receive_buf) ld->ops->receive_buf(tty, &ch, NULL, 1); tty_buffer_unlock_exclusive(tty->port); tty_ldisc_deref(ld); return 0; } /** * tiocgwinsz - implement window query ioctl * @tty: tty * @arg: user buffer for result * * Copies the kernel idea of the window size into the user buffer. * * Locking: @tty->winsize_mutex is taken to ensure the winsize data is * consistent. */ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user *arg) { int err; mutex_lock(&tty->winsize_mutex); err = copy_to_user(arg, &tty->winsize, sizeof(*arg)); mutex_unlock(&tty->winsize_mutex); return err ? -EFAULT : 0; } /** * tty_do_resize - resize event * @tty: tty being resized * @ws: new dimensions * * Update the termios variables and send the necessary signals to peform a * terminal resize correctly. */ int tty_do_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp; /* Lock the tty */ mutex_lock(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) goto done; /* Signal the foreground process group */ pgrp = tty_get_pgrp(tty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); put_pid(pgrp); tty->winsize = *ws; done: mutex_unlock(&tty->winsize_mutex); return 0; } EXPORT_SYMBOL(tty_do_resize); /** * tiocswinsz - implement window size set ioctl * @tty: tty side of tty * @arg: user buffer for result * * Copies the user idea of the window size to the kernel. Traditionally this is * just advisory information but for the Linux console it actually has driver * level meaning and triggers a VC resize. * * Locking: * Driver dependent. The default do_resize method takes the tty termios * mutex and ctrl.lock. The console takes its own lock then calls into the * default method. */ static int tiocswinsz(struct tty_struct *tty, struct winsize __user *arg) { struct winsize tmp_ws; if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) return -EFAULT; if (tty->ops->resize) return tty->ops->resize(tty, &tmp_ws); else return tty_do_resize(tty, &tmp_ws); } /** * tioccons - allow admin to move logical console * @file: the file to become console * * Allow the administrator to move the redirected console device. * * Locking: uses redirect_lock to guard the redirect information */ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (file->f_op->write_iter == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); f = redirect; redirect = NULL; spin_unlock(&redirect_lock); if (f) fput(f); return 0; } if (file->f_op->write_iter != tty_write) return -ENOTTY; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; spin_lock(&redirect_lock); if (redirect) { spin_unlock(&redirect_lock); return -EBUSY; } redirect = get_file(file); spin_unlock(&redirect_lock); return 0; } /** * tiocsetd - set line discipline * @tty: tty device * @p: pointer to user data * * Set the line discipline according to user request. * * Locking: see tty_set_ldisc(), this function is just a helper */ static int tiocsetd(struct tty_struct *tty, int __user *p) { int disc; int ret; if (get_user(disc, p)) return -EFAULT; ret = tty_set_ldisc(tty, disc); return ret; } /** * tiocgetd - get line discipline * @tty: tty device * @p: pointer to user data * * Retrieves the line discipline id directly from the ldisc. * * Locking: waits for ldisc reference (in case the line discipline is changing * or the @tty is being hungup) */ static int tiocgetd(struct tty_struct *tty, int __user *p) { struct tty_ldisc *ld; int ret; ld = tty_ldisc_ref_wait(tty); if (!ld) return -EIO; ret = put_user(ld->ops->num, p); tty_ldisc_deref(ld); return ret; } /** * send_break - performed time break * @tty: device to break on * @duration: timeout in mS * * Perform a timed break on hardware that lacks its own driver level timed * break functionality. * * Locking: * @tty->atomic_write_lock serializes */ static int send_break(struct tty_struct *tty, unsigned int duration) { int retval; if (tty->ops->break_ctl == NULL) return 0; if (tty->driver->flags & TTY_DRIVER_HARDWARE_BREAK) return tty->ops->break_ctl(tty, duration); /* Do the work ourselves */ if (tty_write_lock(tty, false) < 0) return -EINTR; retval = tty->ops->break_ctl(tty, -1); if (!retval) { msleep_interruptible(duration); retval = tty->ops->break_ctl(tty, 0); } else if (retval == -EOPNOTSUPP) { /* some drivers can tell only dynamically */ retval = 0; } tty_write_unlock(tty); if (signal_pending(current)) retval = -EINTR; return retval; } /** * tty_get_tiocm - get tiocm status register * @tty: tty device * * Obtain the modem status bits from the tty driver if the feature * is supported. */ int tty_get_tiocm(struct tty_struct *tty) { int retval = -ENOTTY; if (tty->ops->tiocmget) retval = tty->ops->tiocmget(tty); return retval; } EXPORT_SYMBOL_GPL(tty_get_tiocm); /** * tty_tiocmget - get modem status * @tty: tty device * @p: pointer to result * * Obtain the modem status bits from the tty driver if the feature is * supported. Return -%ENOTTY if it is not available. * * Locking: none (up to the driver) */ static int tty_tiocmget(struct tty_struct *tty, int __user *p) { int retval; retval = tty_get_tiocm(tty); if (retval >= 0) retval = put_user(retval, p); return retval; } /** * tty_tiocmset - set modem status * @tty: tty device * @cmd: command - clear bits, set bits or set all * @p: pointer to desired bits * * Set the modem status bits from the tty driver if the feature * is supported. Return -%ENOTTY if it is not available. * * Locking: none (up to the driver) */ static int tty_tiocmset(struct tty_struct *tty, unsigned int cmd, unsigned __user *p) { int retval; unsigned int set, clear, val; if (tty->ops->tiocmset == NULL) return -ENOTTY; retval = get_user(val, p); if (retval) return retval; set = clear = 0; switch (cmd) { case TIOCMBIS: set = val; break; case TIOCMBIC: clear = val; break; case TIOCMSET: set = val; clear = ~val; break; } set &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; clear &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; return tty->ops->tiocmset(tty, set, clear); } /** * tty_get_icount - get tty statistics * @tty: tty device * @icount: output parameter * * Gets a copy of the @tty's icount statistics. * * Locking: none (up to the driver) */ int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { memset(icount, 0, sizeof(*icount)); if (tty->ops->get_icount) return tty->ops->get_icount(tty, icount); else return -ENOTTY; } EXPORT_SYMBOL_GPL(tty_get_icount); static int tty_tiocgicount(struct tty_struct *tty, void __user *arg) { struct serial_icounter_struct icount; int retval; retval = tty_get_icount(tty, &icount); if (retval != 0) return retval; if (copy_to_user(arg, &icount, sizeof(icount))) return -EFAULT; return 0; } static int tty_set_serial(struct tty_struct *tty, struct serial_struct *ss) { char comm[TASK_COMM_LEN]; int flags; flags = ss->flags & ASYNC_DEPRECATED; if (flags) pr_warn_ratelimited("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", __func__, get_task_comm(comm, current), flags); if (!tty->ops->set_serial) return -ENOTTY; return tty->ops->set_serial(tty, ss); } static int tty_tiocsserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; if (copy_from_user(&v, ss, sizeof(*ss))) return -EFAULT; return tty_set_serial(tty, &v); } static int tty_tiocgserial(struct tty_struct *tty, struct serial_struct __user *ss) { struct serial_struct v; int err; memset(&v, 0, sizeof(v)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err && copy_to_user(ss, &v, sizeof(v))) err = -EFAULT; return err; } /* * if pty, return the slave side (real_tty) * otherwise, return self */ static struct tty_struct *tty_pair_get_tty(struct tty_struct *tty) { if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) tty = tty->link; return tty; } /* * Split this up, as gcc can choke on it otherwise.. */ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_struct *real_tty; void __user *p = (void __user *)arg; int retval; struct tty_ldisc *ld; if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; real_tty = tty_pair_get_tty(tty); /* * Factor out some common prep work */ switch (cmd) { case TIOCSETD: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: retval = tty_check_change(tty); if (retval) return retval; if (cmd != TIOCCBRK) { tty_wait_until_sent(tty, 0); if (signal_pending(current)) return -EINTR; } break; } /* * Now do the stuff. */ switch (cmd) { case TIOCSTI: return tiocsti(tty, p); case TIOCGWINSZ: return tiocgwinsz(real_tty, p); case TIOCSWINSZ: return tiocswinsz(real_tty, p); case TIOCCONS: return real_tty != tty ? -EINVAL : tioccons(file); case TIOCEXCL: set_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCNXCL: clear_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCGEXCL: { int excl = test_bit(TTY_EXCLUSIVE, &tty->flags); return put_user(excl, (int __user *)p); } case TIOCGETD: return tiocgetd(tty, p); case TIOCSETD: return tiocsetd(tty, p); case TIOCVHANGUP: if (!capable(CAP_SYS_ADMIN)) return -EPERM; tty_vhangup(tty); return 0; case TIOCGDEV: { unsigned int ret = new_encode_dev(tty_devnum(real_tty)); return put_user(ret, (unsigned int __user *)p); } /* * Break handling */ case TIOCSBRK: /* Turn break on, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, -1); return 0; case TIOCCBRK: /* Turn break off, unconditionally */ if (tty->ops->break_ctl) return tty->ops->break_ctl(tty, 0); return 0; case TCSBRK: /* SVID version: non-zero arg --> no break */ /* non-zero arg means wait for all output data * to be sent (performed above) but don't send break. * This is used by the tcdrain() termios function. */ if (!arg) return send_break(tty, 250); return 0; case TCSBRKP: /* support for POSIX tcsendbreak() */ return send_break(tty, arg ? arg*100 : 250); case TIOCMGET: return tty_tiocmget(tty, p); case TIOCMSET: case TIOCMBIC: case TIOCMBIS: return tty_tiocmset(tty, cmd, p); case TIOCGICOUNT: return tty_tiocgicount(tty, p); case TCFLSH: switch (arg) { case TCIFLUSH: case TCIOFLUSH: /* flush tty buffer and allow ldisc to process ioctl */ tty_buffer_flush(tty, NULL); break; } break; case TIOCSSERIAL: return tty_tiocsserial(tty, p); case TIOCGSERIAL: return tty_tiocgserial(tty, p); case TIOCGPTPEER: /* Special because the struct file is needed */ return ptm_open_peer(file, tty, (int)arg); default: retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } if (tty->ops->ioctl) { retval = tty->ops->ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_ioctl(file, cmd, arg); retval = -EINVAL; if (ld->ops->ioctl) { retval = ld->ops->ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD) retval = -ENOTTY; } tty_ldisc_deref(ld); return retval; } #ifdef CONFIG_COMPAT struct serial_struct32 { compat_int_t type; compat_int_t line; compat_uint_t port; compat_int_t irq; compat_int_t flags; compat_int_t xmit_fifo_size; compat_int_t custom_divisor; compat_int_t baud_base; unsigned short close_delay; char io_type; char reserved_char; compat_int_t hub6; unsigned short closing_wait; /* time to wait before closing */ unsigned short closing_wait2; /* no longer used... */ compat_uint_t iomem_base; unsigned short iomem_reg_shift; unsigned int port_high; /* compat_ulong_t iomap_base FIXME */ compat_int_t reserved; }; static int compat_tty_tiocsserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; if (copy_from_user(&v32, ss, sizeof(*ss))) return -EFAULT; memcpy(&v, &v32, offsetof(struct serial_struct32, iomem_base)); v.iomem_base = compat_ptr(v32.iomem_base); v.iomem_reg_shift = v32.iomem_reg_shift; v.port_high = v32.port_high; v.iomap_base = 0; return tty_set_serial(tty, &v); } static int compat_tty_tiocgserial(struct tty_struct *tty, struct serial_struct32 __user *ss) { struct serial_struct32 v32; struct serial_struct v; int err; memset(&v, 0, sizeof(v)); memset(&v32, 0, sizeof(v32)); if (!tty->ops->get_serial) return -ENOTTY; err = tty->ops->get_serial(tty, &v); if (!err) { memcpy(&v32, &v, offsetof(struct serial_struct32, iomem_base)); v32.iomem_base = (unsigned long)v.iomem_base >> 32 ? 0xfffffff : ptr_to_compat(v.iomem_base); v32.iomem_reg_shift = v.iomem_reg_shift; v32.port_high = v.port_high; if (copy_to_user(ss, &v32, sizeof(v32))) err = -EFAULT; } return err; } static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; int retval = -ENOIOCTLCMD; switch (cmd) { case TIOCOUTQ: case TIOCSTI: case TIOCGWINSZ: case TIOCSWINSZ: case TIOCGEXCL: case TIOCGETD: case TIOCSETD: case TIOCGDEV: case TIOCMGET: case TIOCMSET: case TIOCMBIC: case TIOCMBIS: case TIOCGICOUNT: case TIOCGPGRP: case TIOCSPGRP: case TIOCGSID: case TIOCSERGETLSR: case TIOCGRS485: case TIOCSRS485: #ifdef TIOCGETP case TIOCGETP: case TIOCSETP: case TIOCSETN: #endif #ifdef TIOCGETC case TIOCGETC: case TIOCSETC: #endif #ifdef TIOCGLTC case TIOCGLTC: case TIOCSLTC: #endif case TCSETSF: case TCSETSW: case TCSETS: case TCGETS: #ifdef TCGETS2 case TCGETS2: case TCSETSF2: case TCSETSW2: case TCSETS2: #endif case TCGETA: case TCSETAF: case TCSETAW: case TCSETA: case TIOCGLCKTRMIOS: case TIOCSLCKTRMIOS: #ifdef TCGETX case TCGETX: case TCSETX: case TCSETXW: case TCSETXF: #endif case TIOCGSOFTCAR: case TIOCSSOFTCAR: case PPPIOCGCHAN: case PPPIOCGUNIT: return tty_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); case TIOCCONS: case TIOCEXCL: case TIOCNXCL: case TIOCVHANGUP: case TIOCSBRK: case TIOCCBRK: case TCSBRK: case TCSBRKP: case TCFLSH: case TIOCGPTPEER: case TIOCNOTTY: case TIOCSCTTY: case TCXONC: case TIOCMIWAIT: case TIOCSERCONFIG: return tty_ioctl(file, cmd, arg); } if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl")) return -EINVAL; switch (cmd) { case TIOCSSERIAL: return compat_tty_tiocsserial(tty, compat_ptr(arg)); case TIOCGSERIAL: return compat_tty_tiocgserial(tty, compat_ptr(arg)); } if (tty->ops->compat_ioctl) { retval = tty->ops->compat_ioctl(tty, cmd, arg); if (retval != -ENOIOCTLCMD) return retval; } ld = tty_ldisc_ref_wait(tty); if (!ld) return hung_up_tty_compat_ioctl(file, cmd, arg); if (ld->ops->compat_ioctl) retval = ld->ops->compat_ioctl(tty, cmd, arg); if (retval == -ENOIOCTLCMD && ld->ops->ioctl) retval = ld->ops->ioctl(tty, (unsigned long)compat_ptr(cmd), arg); tty_ldisc_deref(ld); return retval; } #endif static int this_tty(const void *t, struct file *file, unsigned fd) { if (likely(file->f_op->read_iter != tty_read)) return 0; return file_tty(file) != t ? 0 : fd + 1; } /* * This implements the "Secure Attention Key" --- the idea is to * prevent trojan horses by killing all processes associated with this * tty when the user hits the "Secure Attention Key". Required for * super-paranoid applications --- see the Orange Book for more details. * * This code could be nicer; ideally it should send a HUP, wait a few * seconds, then send a INT, and then a KILL signal. But you then * have to coordinate with the init process, since all processes associated * with the current tty must be dead before the new getty is allowed * to spawn. * * Now, if it would be correct ;-/ The current code has a nasty hole - * it doesn't catch files in flight. We may send the descriptor to ourselves * via AF_UNIX socket, close it and later fetch from socket. FIXME. * * Nasty bug: do_SAK is being called in interrupt context. This can * deadlock. We punt it up to process context. AKPM - 16Mar2001 */ void __do_SAK(struct tty_struct *tty) { struct task_struct *g, *p; struct pid *session; int i; unsigned long flags; spin_lock_irqsave(&tty->ctrl.lock, flags); session = get_pid(tty->ctrl.session); spin_unlock_irqrestore(&tty->ctrl.lock, flags); tty_ldisc_flush(tty); tty_driver_flush_buffer(tty); read_lock(&tasklist_lock); /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { tty_notice(tty, "SAK: killed process %d (%s): by session\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } while_each_pid_task(session, PIDTYPE_SID, p); /* Now kill any processes that happen to have the tty open */ for_each_process_thread(g, p) { if (p->signal->tty == tty) { tty_notice(tty, "SAK: killed process %d (%s): by controlling tty\n", task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); continue; } task_lock(p); i = iterate_fd(p->files, 0, this_tty, tty); if (i != 0) { tty_notice(tty, "SAK: killed process %d (%s): by fd#%d\n", task_pid_nr(p), p->comm, i - 1); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } task_unlock(p); } read_unlock(&tasklist_lock); put_pid(session); } static void do_SAK_work(struct work_struct *work) { struct tty_struct *tty = container_of(work, struct tty_struct, SAK_work); __do_SAK(tty); } /* * The tq handling here is a little racy - tty->SAK_work may already be queued. * Fortunately we don't need to worry, because if ->SAK_work is already queued, * the values which we write to it will be identical to the values which it * already has. --akpm */ void do_SAK(struct tty_struct *tty) { if (!tty) return; schedule_work(&tty->SAK_work); } EXPORT_SYMBOL(do_SAK); /* Must put_device() after it's unused! */ static struct device *tty_get_device(struct tty_struct *tty) { dev_t devt = tty_devnum(tty); return class_find_device_by_devt(&tty_class, devt); } /** * alloc_tty_struct - allocate a new tty * @driver: driver which will handle the returned tty * @idx: minor of the tty * * This subroutine allocates and initializes a tty structure. * * Locking: none - @tty in question is not exposed at this point */ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) { struct tty_struct *tty; tty = kzalloc(sizeof(*tty), GFP_KERNEL_ACCOUNT); if (!tty) return NULL; kref_init(&tty->kref); if (tty_ldisc_init(tty)) { kfree(tty); return NULL; } tty->ctrl.session = NULL; tty->ctrl.pgrp = NULL; mutex_init(&tty->legacy_mutex); mutex_init(&tty->throttle_mutex); init_rwsem(&tty->termios_rwsem); mutex_init(&tty->winsize_mutex); init_ldsem(&tty->ldisc_sem); init_waitqueue_head(&tty->write_wait); init_waitqueue_head(&tty->read_wait); INIT_WORK(&tty->hangup_work, do_tty_hangup); mutex_init(&tty->atomic_write_lock); spin_lock_init(&tty->ctrl.lock); spin_lock_init(&tty->flow.lock); spin_lock_init(&tty->files_lock); INIT_LIST_HEAD(&tty->tty_files); INIT_WORK(&tty->SAK_work, do_SAK_work); tty->driver = driver; tty->ops = driver->ops; tty->index = idx; tty_line_name(driver, idx, tty->name); tty->dev = tty_get_device(tty); return tty; } /** * tty_put_char - write one character to a tty * @tty: tty * @ch: character to write * * Write one byte to the @tty using the provided @tty->ops->put_char() method * if present. * * Note: the specific put_char operation in the driver layer may go * away soon. Don't call it directly, use this method * * Return: the number of characters successfully output. */ int tty_put_char(struct tty_struct *tty, u8 ch) { if (tty->ops->put_char) return tty->ops->put_char(tty, ch); return tty->ops->write(tty, &ch, 1); } EXPORT_SYMBOL_GPL(tty_put_char); static int tty_cdev_add(struct tty_driver *driver, dev_t dev, unsigned int index, unsigned int count) { int err; /* init here, since reused cdevs cause crashes */ driver->cdevs[index] = cdev_alloc(); if (!driver->cdevs[index]) return -ENOMEM; driver->cdevs[index]->ops = &tty_fops; driver->cdevs[index]->owner = driver->owner; err = cdev_add(driver->cdevs[index], dev, count); if (err) kobject_put(&driver->cdevs[index]->kobj); return err; } /** * tty_register_device - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to NULL safely. * * This call is required to be made to register an individual tty device * if the tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If * that bit is not set, this function should not be called by a tty * driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *device) { return tty_register_device_attr(driver, index, device, NULL, NULL); } EXPORT_SYMBOL(tty_register_device); static void tty_device_create_release(struct device *dev) { dev_dbg(dev, "releasing...\n"); kfree(dev); } /** * tty_register_device_attr - register a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * @device: a struct device that is associated with this tty device. * This field is optional, if there is no known struct device * for this tty device it can be set to %NULL safely. * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * This call is required to be made to register an individual tty device if the * tty driver's flags have the %TTY_DRIVER_DYNAMIC_DEV bit set. If that bit is * not set, this function should not be called by a tty driver. * * Locking: ?? * * Return: A pointer to the struct device for this tty device (or * ERR_PTR(-EFOO) on error). */ struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { char name[64]; dev_t devt = MKDEV(driver->major, driver->minor_start) + index; struct ktermios *tp; struct device *dev; int retval; if (index >= driver->num) { pr_err("%s: Attempt to register invalid tty line number (%d)\n", driver->name, index); return ERR_PTR(-EINVAL); } if (driver->type == TTY_DRIVER_TYPE_PTY) pty_line_name(driver, index, name); else tty_line_name(driver, index, name); dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); dev->devt = devt; dev->class = &tty_class; dev->parent = device; dev->release = tty_device_create_release; dev_set_name(dev, "%s", name); dev->groups = attr_grp; dev_set_drvdata(dev, drvdata); dev_set_uevent_suppress(dev, 1); retval = device_register(dev); if (retval) goto err_put; if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { /* * Free any saved termios data so that the termios state is * reset when reusing a minor number. */ tp = driver->termios[index]; if (tp) { driver->termios[index] = NULL; kfree(tp); } retval = tty_cdev_add(driver, devt, index, 1); if (retval) goto err_del; } dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); return dev; err_del: device_del(dev); err_put: put_device(dev); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(tty_register_device_attr); /** * tty_unregister_device - unregister a tty device * @driver: the tty driver that describes the tty device * @index: the index in the tty driver for this tty device * * If a tty device is registered with a call to tty_register_device() then * this function must be called when the tty device is gone. * * Locking: ?? */ void tty_unregister_device(struct tty_driver *driver, unsigned index) { device_destroy(&tty_class, MKDEV(driver->major, driver->minor_start) + index); if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { cdev_del(driver->cdevs[index]); driver->cdevs[index] = NULL; } } EXPORT_SYMBOL(tty_unregister_device); /** * __tty_alloc_driver - allocate tty driver * @lines: count of lines this driver can handle at most * @owner: module which is responsible for this driver * @flags: some of %TTY_DRIVER_ flags, will be set in driver->flags * * This should not be called directly, some of the provided macros should be * used instead. Use IS_ERR() and friends on @retval. */ struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags) { struct tty_driver *driver; unsigned int cdevs = 1; int err; if (!lines || (flags & TTY_DRIVER_UNNUMBERED_NODE && lines > 1)) return ERR_PTR(-EINVAL); driver = kzalloc(sizeof(*driver), GFP_KERNEL); if (!driver) return ERR_PTR(-ENOMEM); kref_init(&driver->kref); driver->num = lines; driver->owner = owner; driver->flags = flags; if (!(flags & TTY_DRIVER_DEVPTS_MEM)) { driver->ttys = kcalloc(lines, sizeof(*driver->ttys), GFP_KERNEL); driver->termios = kcalloc(lines, sizeof(*driver->termios), GFP_KERNEL); if (!driver->ttys || !driver->termios) { err = -ENOMEM; goto err_free_all; } } if (!(flags & TTY_DRIVER_DYNAMIC_ALLOC)) { driver->ports = kcalloc(lines, sizeof(*driver->ports), GFP_KERNEL); if (!driver->ports) { err = -ENOMEM; goto err_free_all; } cdevs = lines; } driver->cdevs = kcalloc(cdevs, sizeof(*driver->cdevs), GFP_KERNEL); if (!driver->cdevs) { err = -ENOMEM; goto err_free_all; } return driver; err_free_all: kfree(driver->ports); kfree(driver->ttys); kfree(driver->termios); kfree(driver->cdevs); kfree(driver); return ERR_PTR(err); } EXPORT_SYMBOL(__tty_alloc_driver); static void destruct_tty_driver(struct kref *kref) { struct tty_driver *driver = container_of(kref, struct tty_driver, kref); int i; struct ktermios *tp; if (driver->flags & TTY_DRIVER_INSTALLED) { for (i = 0; i < driver->num; i++) { tp = driver->termios[i]; if (tp) { driver->termios[i] = NULL; kfree(tp); } if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) tty_unregister_device(driver, i); } proc_tty_unregister_driver(driver); if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) cdev_del(driver->cdevs[0]); } kfree(driver->cdevs); kfree(driver->ports); kfree(driver->termios); kfree(driver->ttys); kfree(driver); } /** * tty_driver_kref_put - drop a reference to a tty driver * @driver: driver of which to drop the reference * * The final put will destroy and free up the driver. */ void tty_driver_kref_put(struct tty_driver *driver) { kref_put(&driver->kref, destruct_tty_driver); } EXPORT_SYMBOL(tty_driver_kref_put); /** * tty_register_driver - register a tty driver * @driver: driver to register * * Called by a tty driver to register itself. */ int tty_register_driver(struct tty_driver *driver) { int error; int i; dev_t dev; struct device *d; if (!driver->major) { error = alloc_chrdev_region(&dev, driver->minor_start, driver->num, driver->name); if (!error) { driver->major = MAJOR(dev); driver->minor_start = MINOR(dev); } } else { dev = MKDEV(driver->major, driver->minor_start); error = register_chrdev_region(dev, driver->num, driver->name); } if (error < 0) goto err; if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) { error = tty_cdev_add(driver, dev, 0, driver->num); if (error) goto err_unreg_char; } mutex_lock(&tty_mutex); list_add(&driver->tty_drivers, &tty_drivers); mutex_unlock(&tty_mutex); if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) { for (i = 0; i < driver->num; i++) { d = tty_register_device(driver, i, NULL); if (IS_ERR(d)) { error = PTR_ERR(d); goto err_unreg_devs; } } } proc_tty_register_driver(driver); driver->flags |= TTY_DRIVER_INSTALLED; return 0; err_unreg_devs: for (i--; i >= 0; i--) tty_unregister_device(driver, i); mutex_lock(&tty_mutex); list_del(&driver->tty_drivers); mutex_unlock(&tty_mutex); err_unreg_char: unregister_chrdev_region(dev, driver->num); err: return error; } EXPORT_SYMBOL(tty_register_driver); /** * tty_unregister_driver - unregister a tty driver * @driver: driver to unregister * * Called by a tty driver to unregister itself. */ void tty_unregister_driver(struct tty_driver *driver) { unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), driver->num); mutex_lock(&tty_mutex); list_del(&driver->tty_drivers); mutex_unlock(&tty_mutex); } EXPORT_SYMBOL(tty_unregister_driver); dev_t tty_devnum(struct tty_struct *tty) { return MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; } EXPORT_SYMBOL(tty_devnum); void tty_default_fops(struct file_operations *fops) { *fops = tty_fops; } static char *tty_devnode(const struct device *dev, umode_t *mode) { if (!mode) return NULL; if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) || dev->devt == MKDEV(TTYAUX_MAJOR, 2)) *mode = 0666; return NULL; } const struct class tty_class = { .name = "tty", .devnode = tty_devnode, }; static int __init tty_class_init(void) { return class_register(&tty_class); } postcore_initcall(tty_class_init); /* 3/2004 jmc: why do these devices exist? */ static struct cdev tty_cdev, console_cdev; static ssize_t show_cons_active(struct device *dev, struct device_attribute *attr, char *buf) { struct console *cs[16]; int i = 0; struct console *c; ssize_t count = 0; /* * Hold the console_list_lock to guarantee that no consoles are * unregistered until all console processing is complete. * This also allows safe traversal of the console list and * race-free reading of @flags. */ console_list_lock(); for_each_console(c) { if (!c->device) continue; if (!(c->flags & CON_NBCON) && !c->write) continue; if ((c->flags & CON_ENABLED) == 0) continue; cs[i++] = c; if (i >= ARRAY_SIZE(cs)) break; } /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); while (i--) { int index = cs[i]->index; struct tty_driver *drv = cs[i]->device(cs[i], &index); /* don't resolve tty0 as some programs depend on it */ if (drv && (cs[i]->index > 0 || drv->major != TTY_MAJOR)) count += tty_line_name(drv, index, buf + count); else count += sprintf(buf + count, "%s%d", cs[i]->name, cs[i]->index); count += sprintf(buf + count, "%c", i ? ' ':'\n'); } console_unlock(); console_list_unlock(); return count; } static DEVICE_ATTR(active, S_IRUGO, show_cons_active, NULL); static struct attribute *cons_dev_attrs[] = { &dev_attr_active.attr, NULL }; ATTRIBUTE_GROUPS(cons_dev); static struct device *consdev; void console_sysfs_notify(void) { if (consdev) sysfs_notify(&consdev->kobj, NULL, "active"); } static struct ctl_table tty_table[] = { { .procname = "legacy_tiocsti", .data = &tty_legacy_tiocsti, .maxlen = sizeof(tty_legacy_tiocsti), .mode = 0644, .proc_handler = proc_dobool, }, { .procname = "ldisc_autoload", .data = &tty_ldisc_autoload, .maxlen = sizeof(tty_ldisc_autoload), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; /* * Ok, now we can initialize the rest of the tty devices and can count * on memory allocations, interrupts etc.. */ int __init tty_init(void) { register_sysctl_init("dev/tty", tty_table); cdev_init(&tty_cdev, &tty_fops); if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0) panic("Couldn't register /dev/tty driver\n"); device_create(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty"); cdev_init(&console_cdev, &console_fops); if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0) panic("Couldn't register /dev/console driver\n"); consdev = device_create_with_groups(&tty_class, NULL, MKDEV(TTYAUX_MAJOR, 1), NULL, cons_dev_groups, "console"); if (IS_ERR(consdev)) consdev = NULL; #ifdef CONFIG_VT vty_init(&console_fops); #endif return 0; } |
28 28 80 78 395 395 185 24 24 24 392 2 397 239 236 277 6 22 137 138 138 136 138 443 440 443 444 441 1 187 187 61 186 186 79 80 80 78 61 19 2 2 80 78 76 4 32 32 32 29 14 10 2 1 2 11 1 28 11 4 1 11 10 6 5 102 100 2 100 103 4 4 4 4 3 3 2 1 11 1 1 1 8 8 7 5 1 3 1 4 1 14 14 1 10 19 1 1 1 1 16 4 2 4 16 16 6 10 16 16 14 2 2 14 10 6 15 1 16 12 2 22 24 1 17 16 13 11 1 1 2 2 2 13 3 11 2 1 2 48 46 1 33 15 25 2 43 44 1 42 1 2 1 17 43 2 26 5 10 14 14 13 1 3 1 8 3 7 1 11 11 1 1 1 1 1 5 4 1 3 3 1 3 1 1 1 1 10 10 10 9 120 121 121 121 120 20 109 8 115 10 4 10 1 10 121 31 3 16 9 1 1 3 878 877 878 536 440 8 1 1 660 1 181 3 214 3 198 27 26 101 187 16 16 111 111 1 110 108 1 6 109 111 103 15 3 108 108 1 2 106 6 111 4 1 3 3 4 11 1 13 1 10 6 6 4 6 23 23 4 19 1 1 1 1 1 5 5 3 5 23 23 105 104 105 839 836 889 887 182 43 8 8 1 1 5 1 1 5 5 1 1 3 2 548 547 544 546 547 564 2 1 200 481 473 474 479 471 476 8 549 550 549 1 9 7 14 10 4 1 1 1 1 1 1 3 2 1 3 13 11 1 1 12 12 12 6 12 1 2 2 12 460 458 202 460 458 202 8 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 IP device support routines. * * Derived from the IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr * lists. * Cyrus Durgin: updated for kmod * Matthias Andree: in devinet_ioctl, compare label and * address (4.4BSD alias style support), * fall back to comparing just the label * if no match found. */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/kmod.h> #include <linux/netconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/addrconf.h> #define IPV6ONLY_FLAGS \ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; static struct ipv4_devconf ipv4_devconf_dflt = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; #define IPV4_DEVCONF_DFLT(net, attr) \ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFA_PROTO] = { .type = NLA_U8 }, }; struct inet_fill_args { u32 portid; u32 seq; int event; unsigned int flags; int netnsid; int ifindex; }; #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); return hash_32(val, IN4_ADDR_HSIZE_SHIFT); } static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); ASSERT_RTNL(); hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); hlist_del_init_rcu(&ifa->addr_lst); } /** * __ip_dev_find - find the first device with a given source address. * @net: the net namespace * @addr: the source address * @devref: if true, take a reference on the found device * * If a caller uses devref=false, it should be protected by RCU, or RTNL */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); ifa = inet_lookup_ifaddr_rcu(net, addr); if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; /* Fallback to FIB local table so that communication * over loopback subnets work. */ local = fib_get_table(net, RT_TABLE_LOCAL); if (local && !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); } else { result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL(__ip_dev_find); /* called under RCU lock */ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) { u32 hash = inet_addr_hash(net, addr); struct in_ifaddr *ifa; hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) if (ifa->ifa_local == addr) return ifa; return NULL; } static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else static int devinet_sysctl_register(struct in_device *idev) { return 0; } static void devinet_sysctl_unregister(struct in_device *idev) { } #endif /* Locks all the inet devices. */ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { struct in_ifaddr *ifa; ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT); if (!ifa) return NULL; in_dev_hold(in_dev); ifa->ifa_dev = in_dev; INIT_HLIST_NODE(&ifa->addr_lst); return ifa; } static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); in_dev_put(ifa->ifa_dev); kfree(ifa); } static void inet_free_ifa(struct in_ifaddr *ifa) { /* Our reference to ifa->ifa_dev must be freed ASAP * to release the reference to the netdev the same way. * in_dev_put() -> in_dev_finish_destroy() -> netdev_put() */ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa); } static void in_dev_free_rcu(struct rcu_head *head) { struct in_device *idev = container_of(head, struct in_device, rcu_head); kfree(rcu_dereference_protected(idev->mc_hash, 1)); kfree(idev); } void in_dev_finish_destroy(struct in_device *idev) { struct net_device *dev = idev->dev; WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) pr_err("Freeing alive in_device %p\n", idev); else call_rcu(&idev->rcu_head, in_dev_free_rcu); } EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; int err = -ENOMEM; ASSERT_RTNL(); in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL); if (!in_dev) goto out; memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) dev_disable_lro(dev); /* Reference in_dev->dev */ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); if (dev != blackhole_netdev) { err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; } ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); } /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); out: return in_dev ?: ERR_PTR(err); out_kfree: kfree(in_dev); in_dev = NULL; goto out; } static void inetdev_destroy(struct in_device *in_dev) { struct net_device *dev; struct in_ifaddr *ifa; ASSERT_RTNL(); dev = in_dev->dev; in_dev->dead = 1; ip_mc_destroy_dev(in_dev); while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } RCU_INIT_POINTER(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); in_dev_put(in_dev); } static int __init inet_blackhole_dev_init(void) { int err = 0; rtnl_lock(); if (!inetdev_init(blackhole_netdev)) err = -ENOMEM; rtnl_unlock(); return err; } late_initcall(inet_blackhole_dev_init); int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { const struct in_ifaddr *ifa; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); last_prim = ifap; if (in_dev->dead) goto no_promotions; /* 1. Deleting primary ifaddr forces deletion all secondaries * unless alias promotion is set **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; prev_prom = ifa; continue; } if (!do_promote) { inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); } else { promote = ifa; break; } } } /* On promotion all secondaries from subnet are changing * the primary IP, we must remove all their routes silently * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); } no_promotions: /* 2. Unlink it */ *ifap = ifa1->ifa_next; inet_hash_remove(ifa1); /* 3. Announce address deletion */ /* Send message first, then call notifier. At first sight, FIB update triggered by notifier will refer to already deleted ifaddr, that could confuse netlink listeners. It is not true: look, gated sees that route deleted and if it still thinks that ifaddr is valid, it will try to restore deleted routes... Grr. So that, this order is correct. */ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { struct in_ifaddr *next_sec; next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { struct in_ifaddr *last_sec; rcu_assign_pointer(prev_prom->ifa_next, next_sec); last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); for (ifa = next_sec; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; fib_add_ifaddr(ifa); } } if (destroy) inet_free_ifa(ifa1); } static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; struct net *net = dev_net(in_dev->dev); struct in_validator_info ivi; struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; /* Don't set IPv6 only flags to IPv4 addresses */ ifa->ifa_flags &= ~IPV6ONLY_FLAGS; ifap = &in_dev->ifa_list; ifa1 = rtnl_dereference(*ifap); while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { if (ifa1->ifa_local == ifa->ifa_local) { inet_free_ifa(ifa); return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value"); inet_free_ifa(ifa); return -EINVAL; } ifa->ifa_flags |= IFA_F_SECONDARY; } ifap = &ifa1->ifa_next; ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validtors to weigh * in now, before changes are committed. The rntl lock is serializing * access here, so the state should not change between a validator call * and a final notify on commit. This isn't invoked on promotion under * the assumption that validators are checking the address itself, and * not the flags. */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); if (ret) { inet_free_ifa(ifa); return ret; } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) ifap = last_primary; rcu_assign_pointer(ifa->ifa_next, *ifap); rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); return 0; } static int inet_insert_ifa(struct in_ifaddr *ifa) { if (!ifa->ifa_local) { inet_free_ifa(ifa); return 0; } return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); } /* Caller must hold RCU or RTNL : * We dont take a reference on found in_device */ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) in_dev = rcu_dereference_rtnl(dev->ip_ptr); rcu_read_unlock(); return in_dev; } EXPORT_SYMBOL(inetdev_by_index); /* Called only from RTNL semaphored context. No locks. */ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { struct in_ifaddr *ifa; ASSERT_RTNL(); in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; } return NULL; } static int ip_mc_autojoin_config(struct net *net, bool join, const struct in_ifaddr *ifa) { #if defined(CONFIG_IP_MULTICAST) struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ifa->ifa_address, .imr_ifindex = ifa->ifa_dev->dev->ifindex, }; struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; ASSERT_RTNL_NET(net); lock_sock(sk); if (join) ret = ip_mc_join_group(sk, &mreq); else ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); return ret; #else return -EOPNOTSUPP; #endif } static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; struct in_ifaddr *ifa; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) goto out; ifm = nlmsg_data(nlh); rtnl_net_lock(net); in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; goto unlock; } for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) continue; if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) continue; if (tb[IFA_ADDRESS] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa))) continue; if (ipv4_is_multicast(ifa->ifa_address)) ip_mc_autojoin_config(net, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); goto unlock; } NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; unlock: rtnl_net_unlock(net); out: return err; } static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; struct hlist_node *n; struct net *net; int i; net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); for (i = 0; i < IN4_ADDR_HSIZE; i++) { struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; bool change_needed = false; rcu_read_lock(); hlist_for_each_entry_rcu(ifa, head, addr_lst) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; u32 flags; flags = READ_ONCE(ifa->ifa_flags); if (flags & IFA_F_PERMANENT) continue; preferred_lft = READ_ONCE(ifa->ifa_preferred_lft); valid_lft = READ_ONCE(ifa->ifa_valid_lft); tstamp = READ_ONCE(ifa->ifa_tstamp); /* We try to batch several events at once. */ age = (now - tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (valid_lft != INFINITY_LIFE_TIME && age >= valid_lft) { change_needed = true; } else if (preferred_lft == INFINITY_LIFE_TIME) { continue; } else if (age >= preferred_lft) { if (time_before(tstamp + valid_lft * HZ, next)) next = tstamp + valid_lft * HZ; if (!(flags & IFA_F_DEPRECATED)) change_needed = true; } else if (time_before(tstamp + preferred_lft * HZ, next)) { next = tstamp + preferred_lft * HZ; } } rcu_read_unlock(); if (!change_needed) continue; rtnl_net_lock(net); hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; if (ifa->ifa_flags & IFA_F_PERMANENT) continue; /* We try to batch several events at once. */ age = (now - ifa->ifa_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { struct in_ifaddr __rcu **ifap; struct in_ifaddr *tmp; ifap = &ifa->ifa_dev->ifa_list; tmp = rtnl_net_dereference(net, *ifap); while (tmp) { if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } ifap = &tmp->ifa_next; tmp = rtnl_net_dereference(net, *ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_preferred_lft && !(ifa->ifa_flags & IFA_F_DEPRECATED)) { ifa->ifa_flags |= IFA_F_DEPRECATED; rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } rtnl_net_unlock(net); } next_sec = round_jiffies_up(next); next_sched = next; /* If rounded timeout is accurate enough, accept it. */ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) next_sched = next_sec; now = jiffies; /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, __u32 prefered_lft) { unsigned long timeout; u32 flags; flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) WRITE_ONCE(ifa->ifa_valid_lft, timeout); else flags |= IFA_F_PERMANENT; timeout = addrconf_timeout_fixup(prefered_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) flags |= IFA_F_DEPRECATED; WRITE_ONCE(ifa->ifa_preferred_lft, timeout); } WRITE_ONCE(ifa->ifa_flags, flags); WRITE_ONCE(ifa->ifa_tstamp, jiffies); if (!ifa->ifa_cstamp) WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack, __u32 *valid_lft, __u32 *prefered_lft) { struct ifaddrmsg *ifm = nlmsg_data(nlh); int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; if (ifm->ifa_prefixlen > 32) { NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); return -EINVAL; } if (!tb[IFA_LOCAL]) { NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); return -EINVAL; } if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); return -EINVAL; } *valid_lft = ci->ifa_valid; *prefered_lft = ci->ifa_prefered; } return 0; } static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifaddrmsg *ifm = nlmsg_data(nlh); struct in_device *in_dev; struct net_device *dev; struct in_ifaddr *ifa; int err; dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; if (!dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); goto errout; } in_dev = __in_dev_get_rtnl_net(dev); err = -ENOBUFS; if (!in_dev) goto errout; ifa = inet_alloc_ifa(in_dev); if (!ifa) /* * A potential indev allocation can be left alive, it stays * assigned to its device and is destroy with it. */ goto errout; ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); if (tb[IFA_PROTO]) ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); return ifa; errout: return ERR_PTR(err); } static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) return ifa1; } return NULL; } static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { __u32 prefered_lft = INFINITY_LIFE_TIME; __u32 valid_lft = INFINITY_LIFE_TIME; struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa_existing; struct nlattr *tb[IFA_MAX + 1]; struct in_ifaddr *ifa; int ret; ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); if (ret < 0) return ret; if (!nla_get_in_addr(tb[IFA_LOCAL])) return 0; rtnl_net_lock(net); ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) { ret = PTR_ERR(ifa); goto unlock; } ifa_existing = find_matching_ifa(net, ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); goto unlock; } } ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { u32 new_metric = ifa->ifa_rt_priority; u8 new_proto = ifa->ifa_proto; inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); ret = -EEXIST; goto unlock; } ifa = ifa_existing; if (ifa->ifa_rt_priority != new_metric) { fib_modify_prefix_metric(ifa, new_metric); ifa->ifa_rt_priority = new_metric; } ifa->ifa_proto = new_proto; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } unlock: rtnl_net_unlock(net); return ret; } /* * Determine a default network mask, based on the IP address. */ static int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) rc = 0; else { __u32 haddr = ntohl(addr); if (IN_CLASSA(haddr)) rc = 8; else if (IN_CLASSB(haddr)) rc = 16; else if (IN_CLASSC(haddr)) rc = 24; else if (IN_CLASSE(haddr)) rc = 32; } return rc; } int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; int ret = -EFAULT; int tryaddrmatch = 0; ifr->ifr_name[IFNAMSIZ - 1] = 0; /* save original address for comparison */ memcpy(&sin_orig, sin, sizeof(*sin)); colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; dev_load(net, ifr->ifr_name); switch (cmd) { case SIOCGIFADDR: /* Get interface address */ case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination address */ case SIOCGIFNETMASK: /* Get the netmask for the interface */ /* Note that these ioctls will not sleep, so that we do not impose a lock. One day we will be forced to put shlock here (I mean SMP) */ tryaddrmatch = (sin_orig.sin_family == AF_INET); memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; break; case SIOCSIFFLAGS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) goto out; break; default: ret = -EINVAL; goto out; } rtnl_net_lock(net); ret = -ENODEV; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) goto done; if (colon) *colon = ':'; in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. */ for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_local) { break; /* found */ } } } /* we didn't get a match, maybe the application is 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; } } ret = -EADDRNOTAVAIL; if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { case SIOCGIFADDR: /* Get interface address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_local; break; case SIOCGIFBRDADDR: /* Get the broadcast address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_broadcast; break; case SIOCGIFDSTADDR: /* Get the destination address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_address; break; case SIOCGIFNETMASK: /* Get the netmask for the interface */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_mask; break; case SIOCSIFFLAGS: if (colon) { ret = -EADDRNOTAVAIL; if (!ifa) break; ret = 0; if (!(ifr->ifr_flags & IFF_UP)) inet_del_ifa(in_dev, ifap, 1); break; } /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */ ASSERT_RTNL(); ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; if (!ifa) { ret = -ENOBUFS; if (!in_dev) break; ifa = inet_alloc_ifa(in_dev); if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { ret = 0; if (ifa->ifa_local == sin->sin_addr.s_addr) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; ifa->ifa_scope = 0; } ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; if (!(dev->flags & IFF_POINTOPOINT)) { ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); if ((dev->flags & IFF_BROADCAST) && ifa->ifa_prefixlen < 31) ifa->ifa_broadcast = ifa->ifa_address | ~ifa->ifa_mask; } else { ifa->ifa_prefixlen = 32; ifa->ifa_mask = inet_make_mask(32); } set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ret = inet_set_ifa(dev, ifa); break; case SIOCSIFBRDADDR: /* Set the broadcast address */ ret = 0; if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = sin->sin_addr.s_addr; inet_insert_ifa(ifa); } break; case SIOCSIFDSTADDR: /* Set the destination address */ ret = 0; if (ifa->ifa_address == sin->sin_addr.s_addr) break; ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; ret = 0; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_address = sin->sin_addr.s_addr; inet_insert_ifa(ifa); break; case SIOCSIFNETMASK: /* Set the netmask for the interface */ /* * The mask we set must be legal. */ ret = -EINVAL; if (bad_mask(sin->sin_addr.s_addr, 0)) break; ret = 0; if (ifa->ifa_mask != sin->sin_addr.s_addr) { __be32 old_mask = ifa->ifa_mask; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_mask = sin->sin_addr.s_addr; ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); /* See if current broadcast address matches * with current netmask, then recalculate * the broadcast address. Otherwise it's a * funny address, so don't touch it since * the user seems to know what (s)he's doing... */ if ((dev->flags & IFF_BROADCAST) && (ifa->ifa_prefixlen < 31) && (ifa->ifa_broadcast == (ifa->ifa_local|~old_mask))) { ifa->ifa_broadcast = (ifa->ifa_local | ~sin->sin_addr.s_addr); } inet_insert_ifa(ifa); } break; } done: rtnl_net_unlock(net); out: return ret; } int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; if (WARN_ON(size > sizeof(struct ifreq))) goto out; if (!in_dev) goto out; in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) { if (!buf) { done += size; continue; } if (len < size) break; memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, ifa->ifa_label); (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; if (copy_to_user(buf + done, &ifr, size)) { done = -EFAULT; break; } len -= size; done += size; } out: return done; } static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { const struct in_ifaddr *ifa; __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net = dev_net(dev); int master_idx; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; break; } if (!addr) addr = ifa->ifa_local; } if (addr) goto out_unlock; no_in_dev: master_idx = l3mdev_master_ifindex_rcu(dev); /* For VRFs, the VRF device takes the place of the loopback device, * with addresses on it being preferred. Note in such cases the * loopback device will be among the devices that fail the master_idx * equality check in the loop below. */ if (master_idx && (dev = dev_get_by_index_rcu(net, master_idx)) && (in_dev = __in_dev_get_rcu(dev))) { addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. */ for_each_netdev_rcu(net, dev) { if (l3mdev_master_ifindex_rcu(dev) != master_idx) continue; in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } out_unlock: rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { unsigned char localnet_scope = RT_SCOPE_HOST; const struct in_ifaddr *ifa; __be32 addr = 0; int same = 0; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { unsigned char min_scope = min(ifa->ifa_scope, localnet_scope); if (!addr && (local == ifa->ifa_local || !local) && min_scope <= scope) { addr = ifa->ifa_local; if (same) break; } if (!same) { same = (!local || inet_ifa_match(local, ifa)) && (!dst || inet_ifa_match(dst, ifa)); if (same && addr) { if (local || !dst) break; /* Is the selected addr into dst subnet? */ if (inet_ifa_match(addr, ifa)) break; /* No, then can we use new local src? */ if (min_scope <= scope) { addr = ifa->ifa_local; break; } /* search for large dst subnet for addr */ same = 0; } } } return same ? addr : 0; } /* * Confirm that local IP address exists using wildcards: * - net: netns to check, cannot be NULL * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; if (in_dev) return confirm_addr_indev(in_dev, dst, local, scope); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); if (in_dev) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) break; } } rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_confirm_addr); /* * Device notifier */ int register_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_chain, nb); } EXPORT_SYMBOL(register_inetaddr_notifier); int unregister_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_notifier); int register_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(register_inetaddr_validator_notifier); int unregister_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_validator_notifier); /* Rename ifa_labels for a device name change. Make some effort to preserve * existing alias numbering and to create unique labels if possible. */ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { struct in_ifaddr *ifa; int named = 0; in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) goto skip; dot = strchr(old, ':'); if (!dot) { sprintf(old, ":%d", named); dot = old; } if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, dev->dev_addr, NULL); } } /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); if (IS_ERR(in_dev)) return notifier_from_errno(PTR_ERR(in_dev)); if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } } else if (event == NETDEV_CHANGEMTU) { /* Re-enabling IP */ if (inetdev_valid_mtu(dev->mtu)) in_dev = inetdev_init(dev); } goto out; } switch (event) { case NETDEV_REGISTER: pr_debug("%s: bug\n", __func__); RCU_INIT_POINTER(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa = inet_alloc_ifa(in_dev); if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; ifa->ifa_mask = inet_make_mask(8); ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } ip_mc_up(in_dev); fallthrough; case NETDEV_CHANGEADDR: if (!IN_DEV_ARP_NOTIFY(in_dev)) break; fallthrough; case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); break; case NETDEV_PRE_TYPE_CHANGE: ip_mc_unmap(in_dev); break; case NETDEV_POST_TYPE_CHANGE: ip_mc_remap(in_dev); break; case NETDEV_CHANGEMTU: if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ fallthrough; case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; case NETDEV_CHANGENAME: /* Do not notify about label change, this event is * not interesting to applications using netlink. */ inetdev_changename(dev, in_dev); devinet_sysctl_unregister(in_dev); devinet_sysctl_register(in_dev); break; } out: return NOTIFY_DONE; } static struct notifier_block ip_netdev_notifier = { .notifier_call = inetdev_event, }; static size_t inet_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, unsigned long tstamp, u32 preferred, u32 valid) { struct ifa_cacheinfo ci; ci.cstamp = cstamp_delta(cstamp); ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); } static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa, struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; unsigned long tstamp; u32 preferred, valid; u32 flags; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; flags = READ_ONCE(ifa->ifa_flags); /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits. * The 32bit value is given in IFA_FLAGS attribute. */ ifm->ifa_flags = (__u8)flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; if (args->netnsid >= 0 && nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto nla_put_failure; tstamp = READ_ONCE(ifa->ifa_tstamp); if (!(flags & IFA_F_PERMANENT)) { preferred = READ_ONCE(ifa->ifa_preferred_lft); valid = READ_ONCE(ifa->ifa_valid_lft); if (preferred != INFINITY_LIFE_TIME) { long tval = (jiffies - tstamp) / HZ; if (preferred > tval) preferred -= tval; else preferred = 0; if (valid != INFINITY_LIFE_TIME) { if (valid > tval) valid -= tval; else valid = 0; } } } else { preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } if ((ifa->ifa_address && nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_broadcast && nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || (ifa->ifa_proto && nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp, preferred, valid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct inet_fill_args *fillargs, struct net **tgt_net, struct sock *sk, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[IFA_MAX+1]; struct ifaddrmsg *ifm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); return -EINVAL; } fillargs->ifindex = ifm->ifa_index; if (fillargs->ifindex) { cb->answer_flags |= NLM_F_DUMP_FILTERED; fillargs->flags |= NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= IFA_MAX; ++i) { if (!tb[i]) continue; if (i == IFA_TARGET_NETNSID) { struct net *net; fillargs->netnsid = nla_get_s32(tb[i]); net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { fillargs->netnsid = -1; NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); return PTR_ERR(net); } *tgt_net = net; } else { NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; int err; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } /* Combine dev_addr_genid and dev_base_seq to detect changes. */ static u32 inet_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv4.dev_addr_genid) + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. */ if (!res) res = 0x80000000; return res; } static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, .event = RTM_NEWADDR, .flags = NLM_F_MULTI, .netnsid = -1, }; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct { unsigned long ifindex; int ip_idx; } *ctx = (void *)cb->ctx; struct in_device *in_dev; struct net_device *dev; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) goto done; if (fillargs.ifindex) { dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; goto done; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto done; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); goto done; } } cb->seq = inet_base_seq(tgt_net); for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); if (err < 0) goto done; } done: if (fillargs.netnsid >= 0) put_net(tgt_net); rcu_read_unlock(); return err; } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { struct inet_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .event = event, .flags = 0, .netnsid = -1, }; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; net = dev_net(ifa->ifa_dev->dev); skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); if (!skb) goto errout; err = inet_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } static size_t inet_get_link_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (!in_dev) return 0; return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; int i; if (!in_dev) return -ENODATA; nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); if (!nla) return -EMSGSIZE; for (i = 0; i < IPV4_DEVCONF_MAX; i++) ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]); return 0; } static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { [IFLA_INET_CONF] = { .type = NLA_NESTED }, }; static int inet_validate_link_af(const struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, inet_af_policy, extack); if (err < 0) return err; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) { int cfgid = nla_type(a); if (nla_len(a) < 4) return -EINVAL; if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX) return -EINVAL; } } return 0; } static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; if (!in_dev) return -EAFNOSUPPORT; if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) return -EINVAL; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a)); } return 0; } static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_BC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; } static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, const struct ipv4_devconf *devconf, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; if (!devconf) goto out; if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_BC_FORWARDING) && nla_put_s32(skb, NETCONFA_BC_FORWARDING, IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_RO(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); } static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; const struct ipv4_devconf *devconf; struct in_device *in_dev = NULL; struct net_device *dev = NULL; struct sk_buff *skb; int ifindex; int err; err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err) return err; if (!tb[NETCONFA_IFINDEX]) return -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: devconf = net->ipv4.devconf_all; break; case NETCONFA_IFINDEX_DEFAULT: devconf = net->ipv4.devconf_dflt; break; default: err = -ENODEV; dev = dev_get_by_index(net, ifindex); if (dev) in_dev = in_dev_get(dev); if (!in_dev) goto errout; devconf = &in_dev->cnf; break; } err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: if (in_dev) in_dev_put(in_dev); dev_put(dev); return err; } static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct { unsigned long ifindex; unsigned int all_default; } *ctx = (void *)cb->ctx; const struct in_device *in_dev; struct net_device *dev; int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); return -EINVAL; } } rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; } if (ctx->all_default == 0) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } if (ctx->all_default == 1) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } done: rcu_read_unlock(); return err; } #ifdef CONFIG_SYSCTL static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; } rcu_read_unlock(); } /* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; int on = IPV4_DEVCONF_ALL(net, FORWARDING); IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) dev_disable_lro(dev); in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } } static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) { if (cnf == net->ipv4.devconf_dflt) return NETCONFA_IFINDEX_DEFAULT; else if (cnf == net->ipv4.devconf_all) return NETCONFA_IFINDEX_ALL; else { struct in_device *idev = container_of(cnf, struct in_device, cnf); return idev->dev->ifindex; } } static int devinet_conf_proc(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); int new_value = *(int *)ctl->data; if (write) { struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; int ifindex; set_bit(i, cnf->state); if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && new_value != old_value) rt_cache_flush(net); if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_RP_FILTER, ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } return ret; } static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; struct net *net = ctl->extra2; int ret; if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_net_trylock(net)) { /* Restore the original values before restarting */ *valp = val; *ppos = pos; return restart_syscall(); } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else { struct ipv4_devconf *cnf = ctl->extra1; struct in_device *idev = container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); } rtnl_net_unlock(net); rt_cache_flush(net); } else inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } return ret; } static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) rt_cache_flush(net); return ret; } #define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ { \ .procname = name, \ .data = ipv4_devconf.data + \ IPV4_DEVCONF_ ## attr - 1, \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ .extra1 = &ipv4_devconf, \ } #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) #define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"), DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"), DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER, "arp_evict_nocarrier"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, "force_igmp_version"), DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, "drop_unicast_in_l2_multicast"), }, }; static int __devinet_sysctl_register(struct net *net, char *dev_name, int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto out; for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; } snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); if (!t->sysctl_header) goto free; p->sysctl = t; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, ifindex, p); return 0; free: kfree(t); out: return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, struct ipv4_devconf *cnf, int ifindex) { struct devinet_sysctl_table *t = cnf->sysctl; if (t) { cnf->sysctl = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int devinet_sysctl_register(struct in_device *idev) { int err; if (!sysctl_dev_name_is_allowed(idev->dev->name)) return -EINVAL; err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); if (err) return err; err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; } static void devinet_sysctl_unregister(struct in_device *idev) { struct net *net = dev_net(idev->dev); __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex); neigh_sysctl_unregister(idev->arp_parms); } static struct ctl_table ctl_forward_entry[] = { { .procname = "ip_forward", .data = &ipv4_devconf.data[ IPV4_DEVCONF_FORWARDING - 1], .maxlen = sizeof(int), .mode = 0644, .proc_handler = devinet_sysctl_forward, .extra1 = &ipv4_devconf, .extra2 = &init_net, }, }; #endif static __net_init int devinet_init_net(struct net *net) { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table *tbl; #endif struct ipv4_devconf *all, *dflt; int err; int i; err = -ENOMEM; net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!net->ipv4.inet_addr_lst) goto err_alloc_hash; all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); if (!dflt) goto err_alloc_dflt; #ifdef CONFIG_SYSCTL tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); if (!tbl) goto err_alloc_ctl; tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; tbl[0].extra1 = all; tbl[0].extra2 = net; #endif if (!net_eq(net, &init_net)) { switch (net_inherit_devconf()) { case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 0: case 1: /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 2: /* use compiled values */ break; } } #ifdef CONFIG_SYSCTL err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; err = __devinet_sysctl_register(net, "default", NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; err = -ENOMEM; forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl, ARRAY_SIZE(ctl_forward_entry)); if (!forw_hdr) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; #endif for (i = 0; i < IN4_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; #ifdef CONFIG_SYSCTL err_reg_ctl: __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT); err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(tbl); err_alloc_ctl: #endif kfree(dflt); err_alloc_dflt: kfree(all); err_alloc_all: kfree(net->ipv4.inet_addr_lst); err_alloc_hash: return err; } static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; #endif cancel_delayed_work_sync(&net->ipv4.addr_chk_work); #ifdef CONFIG_SYSCTL tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, NETCONFA_IFINDEX_DEFAULT); __devinet_sysctl_unregister(net, net->ipv4.devconf_all, NETCONFA_IFINDEX_ALL); kfree(tbl); #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); kfree(net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, }; static struct rtnl_af_ops inet_af_ops __read_mostly = { .family = AF_INET, .fill_link_af = inet_fill_link_af, .get_link_af_size = inet_get_link_af_size, .validate_link_af = inet_validate_link_af, .set_link_af = inet_set_link_af, }; static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; void __init devinet_init(void) { register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); if (rtnl_af_register(&inet_af_ops)) panic("Unable to register inet_af_ops\n"); rtnl_register_many(devinet_rtnl_msg_handlers); } |
380 6 254 75 65 100 25 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * net busy poll support * Copyright(c) 2013 Intel Corporation. * * Author: Eliezer Tamir * * Contact Information: * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> */ #ifndef _LINUX_NET_BUSY_POLL_H #define _LINUX_NET_BUSY_POLL_H #include <linux/netdevice.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> #include <net/xdp.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { return READ_ONCE(sysctl_net_busy_poll); } static inline bool sk_can_busy_loop(const struct sock *sk) { return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); } bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); void napi_suspend_irqs(unsigned int napi_id); void napi_resume_irqs(unsigned int napi_id); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { return 0; } static inline bool sk_can_busy_loop(struct sock *sk) { return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif } /* in poll/select we use the global sysctl_net_ll_poll value */ static inline bool busy_loop_timeout(unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline bool sk_busy_loop_timeout(struct sock *sk, unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline void sk_busy_loop(struct sock *sk, int nonblock) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ if (skb->napi_id < MIN_NAPI_ID) skb->napi_id = napi->napi_id; #endif } /* used in the protocol handler to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id)) WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_update(sk, skb); } /* Variant of sk_mark_napi_id() for passive flow setup, * as sk->sk_napi_id and sk->sk_rx_queue_mapping content * needs to be set. */ static inline void sk_mark_napi_id_set(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_set(sk, skb); } static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) WRITE_ONCE(sk->sk_napi_id, napi_id); #endif } /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, skb->napi_id); #endif } static inline void sk_mark_napi_id_once_xdp(struct sock *sk, const struct xdp_buff *xdp) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); #endif } #endif /* _LINUX_NET_BUSY_POLL_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H #include <linux/bug.h> #include <linux/percpu.h> #include <uapi/asm/debugreg.h> #include <asm/cpufeature.h> #include <asm/msr.h> DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL /* * These special macros can be used to get or set a debugging register */ #define get_debugreg(var, register) \ (var) = native_get_debugreg(register) #define set_debugreg(value, register) \ native_set_debugreg(register, value) #endif static __always_inline unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ switch (regno) { case 0: asm("mov %%db0, %0" :"=r" (val)); break; case 1: asm("mov %%db1, %0" :"=r" (val)); break; case 2: asm("mov %%db2, %0" :"=r" (val)); break; case 3: asm("mov %%db3, %0" :"=r" (val)); break; case 6: asm("mov %%db6, %0" :"=r" (val)); break; case 7: /* * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them * with other code. * * This is needed because a DR7 access can cause a #VC exception * when running under SEV-ES. Taking a #VC exception is not a * safe thing to do just anywhere in the entry code and * re-ordering might place the access into an unsafe location. * * This happened in the NMI handler, where the DR7 read was * re-ordered to happen before the call to sev_es_ist_enter(), * causing stack recursion. */ asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); break; default: BUG(); } return val; } static __always_inline void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: asm("mov %0, %%db0" ::"r" (value)); break; case 1: asm("mov %0, %%db1" ::"r" (value)); break; case 2: asm("mov %0, %%db2" ::"r" (value)); break; case 3: asm("mov %0, %%db3" ::"r" (value)); break; case 6: asm("mov %0, %%db6" ::"r" (value)); break; case 7: /* * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them * with other code. * * While is didn't happen with a DR7 write (see the DR7 read * comment above which explains where it happened), add the * __FORCE_ORDER here too to avoid similar problems in the * future. */ asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); break; default: BUG(); } } static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ set_debugreg(0UL, 7); /* Zero-out the individual HW breakpoint address registers */ set_debugreg(0UL, 0); set_debugreg(0UL, 1); set_debugreg(0UL, 2); set_debugreg(0UL, 3); } static __always_inline bool hw_breakpoint_active(void) { return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } extern void hw_breakpoint_restore(void); static __always_inline unsigned long local_db_save(void) { unsigned long dr7; if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active()) return 0; get_debugreg(dr7, 7); dr7 &= ~0x400; /* architecturally set bit */ if (dr7) set_debugreg(0, 7); /* * Ensure the compiler doesn't lower the above statements into * the critical section; disabling breakpoints late would not * be good. */ barrier(); return dr7; } static __always_inline void local_db_restore(unsigned long dr7) { /* * Ensure the compiler doesn't raise this statement into * the critical section; enabling breakpoints early would * not be good. */ barrier(); if (dr7) set_debugreg(dr7, 7); } #ifdef CONFIG_CPU_SUP_AMD extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr); extern unsigned long amd_get_dr_addr_mask(unsigned int dr); #else static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { } static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) { return 0; } #endif static inline unsigned long get_debugctlmsr(void) { unsigned long debugctlmsr = 0; #ifndef CONFIG_X86_DEBUGCTLMSR if (boot_cpu_data.x86 < 6) return 0; #endif rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); return debugctlmsr; } static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR if (boot_cpu_data.x86 < 6) return; #endif wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } #endif /* _ASM_X86_DEBUGREG_H */ |
3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/virtio.h> #include <linux/spinlock.h> #include <linux/virtio_config.h> #include <linux/virtio_anchor.h> #include <linux/module.h> #include <linux/idr.h> #include <linux/of.h> #include <uapi/linux/virtio_ids.h> /* Unique numbering for virtio devices. */ static DEFINE_IDA(virtio_index_ida); static ssize_t device_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "0x%04x\n", dev->id.device); } static DEVICE_ATTR_RO(device); static ssize_t vendor_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "0x%04x\n", dev->id.vendor); } static DEVICE_ATTR_RO(vendor); static ssize_t status_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "0x%08x\n", dev->config->get_status(dev)); } static DEVICE_ATTR_RO(status); static ssize_t modalias_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); return sysfs_emit(buf, "virtio:d%08Xv%08X\n", dev->id.device, dev->id.vendor); } static DEVICE_ATTR_RO(modalias); static ssize_t features_show(struct device *_d, struct device_attribute *attr, char *buf) { struct virtio_device *dev = dev_to_virtio(_d); unsigned int i; ssize_t len = 0; /* We actually represent this as a bitstring, as it could be * arbitrary length in future. */ for (i = 0; i < sizeof(dev->features)*8; i++) len += sysfs_emit_at(buf, len, "%c", __virtio_test_bit(dev, i) ? '1' : '0'); len += sysfs_emit_at(buf, len, "\n"); return len; } static DEVICE_ATTR_RO(features); static struct attribute *virtio_dev_attrs[] = { &dev_attr_device.attr, &dev_attr_vendor.attr, &dev_attr_status.attr, &dev_attr_modalias.attr, &dev_attr_features.attr, NULL, }; ATTRIBUTE_GROUPS(virtio_dev); static inline int virtio_id_match(const struct virtio_device *dev, const struct virtio_device_id *id) { if (id->device != dev->id.device && id->device != VIRTIO_DEV_ANY_ID) return 0; return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor == dev->id.vendor; } /* This looks through all the IDs a driver claims to support. If any of them * match, we return 1 and the kernel will call virtio_dev_probe(). */ static int virtio_dev_match(struct device *_dv, const struct device_driver *_dr) { unsigned int i; struct virtio_device *dev = dev_to_virtio(_dv); const struct virtio_device_id *ids; ids = drv_to_virtio(_dr)->id_table; for (i = 0; ids[i].device; i++) if (virtio_id_match(dev, &ids[i])) return 1; return 0; } static int virtio_uevent(const struct device *_dv, struct kobj_uevent_env *env) { const struct virtio_device *dev = dev_to_virtio(_dv); return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X", dev->id.device, dev->id.vendor); } void virtio_check_driver_offered_feature(const struct virtio_device *vdev, unsigned int fbit) { unsigned int i; struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); for (i = 0; i < drv->feature_table_size; i++) if (drv->feature_table[i] == fbit) return; if (drv->feature_table_legacy) { for (i = 0; i < drv->feature_table_size_legacy; i++) if (drv->feature_table_legacy[i] == fbit) return; } BUG(); } EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature); static void __virtio_config_changed(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); if (!dev->config_core_enabled || dev->config_driver_disabled) dev->config_change_pending = true; else if (drv && drv->config_changed) { drv->config_changed(dev); dev->config_change_pending = false; } } void virtio_config_changed(struct virtio_device *dev) { unsigned long flags; spin_lock_irqsave(&dev->config_lock, flags); __virtio_config_changed(dev); spin_unlock_irqrestore(&dev->config_lock, flags); } EXPORT_SYMBOL_GPL(virtio_config_changed); /** * virtio_config_driver_disable - disable config change reporting by drivers * @dev: the device to reset * * This is only allowed to be called by a driver and disabling can't * be nested. */ void virtio_config_driver_disable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_driver_disabled = true; spin_unlock_irq(&dev->config_lock); } EXPORT_SYMBOL_GPL(virtio_config_driver_disable); /** * virtio_config_driver_enable - enable config change reporting by drivers * @dev: the device to reset * * This is only allowed to be called by a driver and enabling can't * be nested. */ void virtio_config_driver_enable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_driver_disabled = false; if (dev->config_change_pending) __virtio_config_changed(dev); spin_unlock_irq(&dev->config_lock); } EXPORT_SYMBOL_GPL(virtio_config_driver_enable); static void virtio_config_core_disable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_core_enabled = false; spin_unlock_irq(&dev->config_lock); } static void virtio_config_core_enable(struct virtio_device *dev) { spin_lock_irq(&dev->config_lock); dev->config_core_enabled = true; if (dev->config_change_pending) __virtio_config_changed(dev); spin_unlock_irq(&dev->config_lock); } void virtio_add_status(struct virtio_device *dev, unsigned int status) { might_sleep(); dev->config->set_status(dev, dev->config->get_status(dev) | status); } EXPORT_SYMBOL_GPL(virtio_add_status); /* Do some validation, then set FEATURES_OK */ static int virtio_features_ok(struct virtio_device *dev) { unsigned int status; might_sleep(); if (virtio_check_mem_acc_cb(dev)) { if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) { dev_warn(&dev->dev, "device must provide VIRTIO_F_VERSION_1\n"); return -ENODEV; } if (!virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) { dev_warn(&dev->dev, "device must provide VIRTIO_F_ACCESS_PLATFORM\n"); return -ENODEV; } } if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) return 0; virtio_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); status = dev->config->get_status(dev); if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { dev_err(&dev->dev, "virtio: device refuses features: %x\n", status); return -ENODEV; } return 0; } /** * virtio_reset_device - quiesce device for removal * @dev: the device to reset * * Prevents device from sending interrupts and accessing memory. * * Generally used for cleanup during driver / device removal. * * Once this has been invoked, caller must ensure that * virtqueue_notify / virtqueue_kick are not in progress. * * Note: this guarantees that vq callbacks are not in progress, however caller * is responsible for preventing access from other contexts, such as a system * call/workqueue/bh. Invoking virtio_break_device then flushing any such * contexts is one way to handle that. * */ void virtio_reset_device(struct virtio_device *dev) { #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION /* * The below virtio_synchronize_cbs() guarantees that any * interrupt for this line arriving after * virtio_synchronize_vqs() has completed is guaranteed to see * vq->broken as true. */ virtio_break_device(dev); virtio_synchronize_cbs(dev); #endif dev->config->reset(dev); } EXPORT_SYMBOL_GPL(virtio_reset_device); static int virtio_dev_probe(struct device *_d) { int err, i; struct virtio_device *dev = dev_to_virtio(_d); struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); u64 device_features; u64 driver_features; u64 driver_features_legacy; /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); /* Figure out what features the device supports. */ device_features = dev->config->get_features(dev); /* Figure out what features the driver supports. */ driver_features = 0; for (i = 0; i < drv->feature_table_size; i++) { unsigned int f = drv->feature_table[i]; BUG_ON(f >= 64); driver_features |= (1ULL << f); } /* Some drivers have a separate feature table for virtio v1.0 */ if (drv->feature_table_legacy) { driver_features_legacy = 0; for (i = 0; i < drv->feature_table_size_legacy; i++) { unsigned int f = drv->feature_table_legacy[i]; BUG_ON(f >= 64); driver_features_legacy |= (1ULL << f); } } else { driver_features_legacy = driver_features; } if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else dev->features = driver_features_legacy & device_features; /* When debugging, user may filter some features by hand. */ virtio_debug_device_filter_features(dev); /* Transport features always preserved to pass to finalize_features. */ for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) if (device_features & (1ULL << i)) __virtio_set_bit(dev, i); err = dev->config->finalize_features(dev); if (err) goto err; if (drv->validate) { u64 features = dev->features; err = drv->validate(dev); if (err) goto err; /* Did validation change any features? Then write them again. */ if (features != dev->features) { err = dev->config->finalize_features(dev); if (err) goto err; } } err = virtio_features_ok(dev); if (err) goto err; err = drv->probe(dev); if (err) goto err; /* If probe didn't do it, mark device DRIVER_OK ourselves. */ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) virtio_device_ready(dev); if (drv->scan) drv->scan(dev); virtio_config_core_enable(dev); return 0; err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; } static void virtio_dev_remove(struct device *_d) { struct virtio_device *dev = dev_to_virtio(_d); struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); virtio_config_core_disable(dev); drv->remove(dev); /* Driver should have reset device. */ WARN_ON_ONCE(dev->config->get_status(dev)); /* Acknowledge the device's existence again. */ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); of_node_put(dev->dev.of_node); } static const struct bus_type virtio_bus = { .name = "virtio", .match = virtio_dev_match, .dev_groups = virtio_dev_groups, .uevent = virtio_uevent, .probe = virtio_dev_probe, .remove = virtio_dev_remove, }; int __register_virtio_driver(struct virtio_driver *driver, struct module *owner) { /* Catch this early. */ BUG_ON(driver->feature_table_size && !driver->feature_table); driver->driver.bus = &virtio_bus; driver->driver.owner = owner; return driver_register(&driver->driver); } EXPORT_SYMBOL_GPL(__register_virtio_driver); void unregister_virtio_driver(struct virtio_driver *driver) { driver_unregister(&driver->driver); } EXPORT_SYMBOL_GPL(unregister_virtio_driver); static int virtio_device_of_init(struct virtio_device *dev) { struct device_node *np, *pnode = dev_of_node(dev->dev.parent); char compat[] = "virtio,deviceXXXXXXXX"; int ret, count; if (!pnode) return 0; count = of_get_available_child_count(pnode); if (!count) return 0; /* There can be only 1 child node */ if (WARN_ON(count > 1)) return -EINVAL; np = of_get_next_available_child(pnode, NULL); if (WARN_ON(!np)) return -ENODEV; ret = snprintf(compat, sizeof(compat), "virtio,device%x", dev->id.device); BUG_ON(ret >= sizeof(compat)); /* * On powerpc/pseries virtio devices are PCI devices so PCI * vendor/device ids play the role of the "compatible" property. * Simply don't init of_node in this case. */ if (!of_device_is_compatible(np, compat)) { ret = 0; goto out; } dev->dev.of_node = np; return 0; out: of_node_put(np); return ret; } /** * register_virtio_device - register virtio device * @dev : virtio device to be registered * * On error, the caller must call put_device on &@dev->dev (and not kfree), * as another code path may have obtained a reference to @dev. * * Returns: 0 on suceess, -error on failure */ int register_virtio_device(struct virtio_device *dev) { int err; dev->dev.bus = &virtio_bus; device_initialize(&dev->dev); /* Assign a unique device index and hence name. */ err = ida_alloc(&virtio_index_ida, GFP_KERNEL); if (err < 0) goto out; dev->index = err; err = dev_set_name(&dev->dev, "virtio%u", dev->index); if (err) goto out_ida_remove; err = virtio_device_of_init(dev); if (err) goto out_ida_remove; spin_lock_init(&dev->config_lock); dev->config_core_enabled = false; dev->config_change_pending = false; INIT_LIST_HEAD(&dev->vqs); spin_lock_init(&dev->vqs_list_lock); /* We always start by resetting the device, in case a previous * driver messed it up. This also tests that code path a little. */ virtio_reset_device(dev); /* Acknowledge that we've seen the device. */ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); virtio_debug_device_init(dev); /* * device_add() causes the bus infrastructure to look for a matching * driver. */ err = device_add(&dev->dev); if (err) goto out_of_node_put; return 0; out_of_node_put: of_node_put(dev->dev.of_node); out_ida_remove: ida_free(&virtio_index_ida, dev->index); out: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; } EXPORT_SYMBOL_GPL(register_virtio_device); bool is_virtio_device(struct device *dev) { return dev->bus == &virtio_bus; } EXPORT_SYMBOL_GPL(is_virtio_device); void unregister_virtio_device(struct virtio_device *dev) { int index = dev->index; /* save for after device release */ device_unregister(&dev->dev); virtio_debug_device_exit(dev); ida_free(&virtio_index_ida, index); } EXPORT_SYMBOL_GPL(unregister_virtio_device); #ifdef CONFIG_PM_SLEEP int virtio_device_freeze(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); int ret; virtio_config_core_disable(dev); dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED; if (drv && drv->freeze) { ret = drv->freeze(dev); if (ret) { virtio_config_core_enable(dev); return ret; } } return 0; } EXPORT_SYMBOL_GPL(virtio_device_freeze); int virtio_device_restore(struct virtio_device *dev) { struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); int ret; /* We always start by resetting the device, in case a previous * driver messed it up. */ virtio_reset_device(dev); /* Acknowledge that we've seen the device. */ virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); /* Maybe driver failed before freeze. * Restore the failed status, for debugging. */ if (dev->failed) virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); if (!drv) return 0; /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); ret = dev->config->finalize_features(dev); if (ret) goto err; ret = virtio_features_ok(dev); if (ret) goto err; if (drv->restore) { ret = drv->restore(dev); if (ret) goto err; } /* If restore didn't do it, mark device DRIVER_OK ourselves. */ if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK)) virtio_device_ready(dev); virtio_config_core_enable(dev); return 0; err: virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED); return ret; } EXPORT_SYMBOL_GPL(virtio_device_restore); #endif static int virtio_init(void) { if (bus_register(&virtio_bus) != 0) panic("virtio bus registration failed"); virtio_debug_init(); return 0; } static void __exit virtio_exit(void) { virtio_debug_exit(); bus_unregister(&virtio_bus); ida_destroy(&virtio_index_ida); } core_initcall(virtio_init); module_exit(virtio_exit); MODULE_DESCRIPTION("Virtio core interface"); MODULE_LICENSE("GPL"); |
4 45 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle firewalling core * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * Lennert dedicates this file to Kerstin Wurdinger. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/in_route.h> #include <linux/inetdevice.h> #include <net/route.h> #include "br_private.h" #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { } static void fake_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { } static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; } static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { return NULL; } static unsigned int fake_mtu(const struct dst_entry *dst) { return dst->dev->mtu; } static struct dst_ops fake_dst_ops = { .family = AF_INET, .update_pmtu = fake_update_pmtu, .redirect = fake_redirect, .cow_metrics = fake_cow_metrics, .neigh_lookup = fake_neigh_lookup, .mtu = fake_mtu, }; /* * Initialize bogus route table used to keep netfilter happy. * Currently, we fill in the PMTU entry because netfilter * refragmentation needs it, and the rt_flags entry because * ipt_REJECT needs it. Future netfilter modules might * require us to fill additional fields. */ static const u32 br_dst_default_metrics[RTAX_MAX] = { [RTAX_MTU - 1] = 1500, }; void br_netfilter_rtable_init(struct net_bridge *br) { struct rtable *rt = &br->fake_rtable; rcuref_init(&rt->dst.__rcuref, 1); rt->dst.dev = br->dev; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; } int __init br_nf_core_init(void) { return dst_entries_init(&fake_dst_ops); } void br_nf_core_fini(void) { dst_entries_destroy(&fake_dst_ops); } |
1 2 1 1 4 4 2 2 2 1 2 2 2 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/netfilter/xt_IDLETIMER.c * * Netfilter module to trigger a timer when packet matches. * After timer expires a kevent will be sent. * * Copyright (C) 2004, 2010 Nokia Corporation * Written by Timo Teras <ext-timo.teras@nokia.com> * * Converted to x_tables and reworked for upstream inclusion * by Luciano Coelho <luciano.coelho@nokia.com> * * Contact: Luciano Coelho <luciano.coelho@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/timer.h> #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_IDLETIMER.h> #include <linux/kdev_t.h> #include <linux/kobject.h> #include <linux/workqueue.h> #include <linux/sysfs.h> struct idletimer_tg { struct list_head entry; struct alarm alarm; struct timer_list timer; struct work_struct work; struct kobject *kobj; struct device_attribute attr; unsigned int refcnt; u8 timer_type; }; static LIST_HEAD(idletimer_tg_list); static DEFINE_MUTEX(list_mutex); static struct kobject *idletimer_tg_kobj; static struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) { struct idletimer_tg *entry; list_for_each_entry(entry, &idletimer_tg_list, entry) { if (!strcmp(label, entry->attr.attr.name)) return entry; } return NULL; } static ssize_t idletimer_tg_show(struct device *dev, struct device_attribute *attr, char *buf) { struct idletimer_tg *timer; unsigned long expires = 0; struct timespec64 ktimespec = {}; long time_diff = 0; mutex_lock(&list_mutex); timer = __idletimer_tg_find_by_label(attr->attr.name); if (timer) { if (timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm); ktimespec = ktime_to_timespec64(expires_alarm); time_diff = ktimespec.tv_sec; } else { expires = timer->timer.expires; time_diff = jiffies_to_msecs(expires - jiffies) / 1000; } } mutex_unlock(&list_mutex); if (time_after(expires, jiffies) || ktimespec.tv_sec > 0) return sysfs_emit(buf, "%ld\n", time_diff); return sysfs_emit(buf, "0\n"); } static void idletimer_tg_work(struct work_struct *work) { struct idletimer_tg *timer = container_of(work, struct idletimer_tg, work); sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); } static void idletimer_tg_expired(struct timer_list *t) { struct idletimer_tg *timer = from_timer(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static int idletimer_check_sysfs_name(const char *name, unsigned int size) { int ret; ret = xt_check_proc_name(name, size); if (ret < 0) return ret; if (!strcmp(name, "power") || !strcmp(name, "subsystem") || !strcmp(name, "uevent")) return -EINVAL; return 0; } static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } list_add(&info->timer->entry, &idletimer_tg_list); timer_setup(&info->timer->timer, idletimer_tg_expired, 0); info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) { int ret; info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } /* notify userspace */ kobject_uevent(idletimer_tg_kobj,KOBJ_ADD); list_add(&info->timer->entry, &idletimer_tg_list); pr_debug("timer type value is %u", info->timer_type); info->timer->timer_type = info->timer_type; info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout; alarm_init(&info->timer->alarm, ALARM_BOOTTIME, idletimer_tg_alarmproc); info->timer->alarm.data = info->timer; tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); } return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); return XT_CONTINUE; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); } return XT_CONTINUE; } static int idletimer_tg_helper(struct idletimer_tg_info *info) { if (info->timeout == 0) { pr_debug("timeout value is zero\n"); return -EINVAL; } if (info->timeout >= INT_MAX / 1000) { pr_debug("timeout value is too big\n"); return -EINVAL; } if (info->label[0] == '\0' || strnlen(info->label, MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { pr_debug("label is empty or not nul-terminated\n"); return -EINVAL; } return 0; } static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) { struct idletimer_tg_info *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); ret = idletimer_tg_helper(info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) { struct idletimer_tg_info_v1 *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); if (info->send_nl_msg) return -EOPNOTSUPP; ret = idletimer_tg_helper((struct idletimer_tg_info *)info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } if (info->timer_type > XT_IDLETIMER_ALARM) { pr_debug("invalid value for timer type\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { if (info->timer->timer_type != info->timer_type) { pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); mutex_unlock(&list_mutex); return -EINVAL; } info->timer->refcnt++; if (info->timer_type & XT_IDLETIMER_ALARM) { /* calculate remaining expiry time */ ktime_t tout = alarm_expires_remaining(&info->timer->alarm); struct timespec64 ktimespec = ktime_to_timespec64(tout); if (ktimespec.tv_sec > 0) { pr_debug("time_expiry_remaining %lld\n", ktimespec.tv_sec); alarm_start_relative(&info->timer->alarm, tout); } } else { mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create_v1(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); timer_shutdown_sync(&info->timer->timer); cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { alarm_cancel(&info->timer->alarm); } else { timer_shutdown_sync(&info->timer->timer); } cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static struct xt_target idletimer_tg[] __read_mostly = { { .name = "IDLETIMER", .family = NFPROTO_IPV4, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV4, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "IDLETIMER", .family = NFPROTO_IPV6, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV6, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #endif }; static struct class *idletimer_tg_class; static struct device *idletimer_tg_device; static int __init idletimer_tg_init(void) { int err; idletimer_tg_class = class_create("xt_idletimer"); err = PTR_ERR(idletimer_tg_class); if (IS_ERR(idletimer_tg_class)) { pr_debug("couldn't register device class\n"); goto out; } idletimer_tg_device = device_create(idletimer_tg_class, NULL, MKDEV(0, 0), NULL, "timers"); err = PTR_ERR(idletimer_tg_device); if (IS_ERR(idletimer_tg_device)) { pr_debug("couldn't register system device\n"); goto out_class; } idletimer_tg_kobj = &idletimer_tg_device->kobj; err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; } return 0; out_dev: device_destroy(idletimer_tg_class, MKDEV(0, 0)); out_class: class_destroy(idletimer_tg_class); out: return err; } static void __exit idletimer_tg_exit(void) { xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); device_destroy(idletimer_tg_class, MKDEV(0, 0)); class_destroy(idletimer_tg_class); } module_init(idletimer_tg_init); module_exit(idletimer_tg_exit); MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); MODULE_DESCRIPTION("Xtables: idle time monitor"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("ipt_IDLETIMER"); MODULE_ALIAS("ip6t_IDLETIMER"); |
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 | // SPDX-License-Identifier: GPL-2.0 /* * Multipath support for RPC * * Copyright (c) 2015, 2016, Primary Data, Inc. All rights reserved. * * Trond Myklebust <trond.myklebust@primarydata.com> * */ #include <linux/atomic.h> #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtmultipath.h> #include "sysfs.h" typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps, const struct rpc_xprt *cur); static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular; static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin; static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall; static const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline; static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt) { if (unlikely(xprt_get(xprt) == NULL)) return; list_add_tail_rcu(&xprt->xprt_switch, &xps->xps_xprt_list); smp_wmb(); if (xps->xps_nxprts == 0) xps->xps_net = xprt->xprt_net; xps->xps_nxprts++; xps->xps_nactive++; } /** * rpc_xprt_switch_add_xprt - Add a new rpc_xprt to an rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt * * Adds xprt to the end of the list of struct rpc_xprt in xps. */ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt) { if (xprt == NULL) return; spin_lock(&xps->xps_lock); if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); rpc_sysfs_xprt_setup(xps, xprt, GFP_KERNEL); } static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, bool offline) { if (unlikely(xprt == NULL)) return; if (!test_bit(XPRT_OFFLINE, &xprt->state) && offline) xps->xps_nactive--; xps->xps_nxprts--; if (xps->xps_nxprts == 0) xps->xps_net = NULL; smp_wmb(); list_del_rcu(&xprt->xprt_switch); } /** * rpc_xprt_switch_remove_xprt - Removes an rpc_xprt from a rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt * @offline: indicates if the xprt that's being removed is in an offline state * * Removes xprt from the list of struct rpc_xprt in xps. */ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, bool offline) { spin_lock(&xps->xps_lock); xprt_switch_remove_xprt_locked(xps, xprt, offline); spin_unlock(&xps->xps_lock); xprt_put(xprt); } static DEFINE_IDA(rpc_xprtswitch_ids); void xprt_multipath_cleanup_ids(void) { ida_destroy(&rpc_xprtswitch_ids); } static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags) { int id; id = ida_alloc(&rpc_xprtswitch_ids, gfp_flags); if (id < 0) return id; xps->xps_id = id; return 0; } static void xprt_switch_free_id(struct rpc_xprt_switch *xps) { ida_free(&rpc_xprtswitch_ids, xps->xps_id); } /** * xprt_switch_alloc - Allocate a new struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt * @gfp_flags: allocation flags * * On success, returns an initialised struct rpc_xprt_switch, containing * the entry xprt. Returns NULL on failure. */ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_xprt_switch *xps; xps = kmalloc(sizeof(*xps), gfp_flags); if (xps != NULL) { spin_lock_init(&xps->xps_lock); kref_init(&xps->xps_kref); xprt_switch_alloc_id(xps, gfp_flags); xps->xps_nxprts = xps->xps_nactive = 0; atomic_long_set(&xps->xps_queuelen, 0); xps->xps_net = NULL; INIT_LIST_HEAD(&xps->xps_xprt_list); xps->xps_iter_ops = &rpc_xprt_iter_singular; rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags); xprt_switch_add_xprt_locked(xps, xprt); xps->xps_nunique_destaddr_xprts = 1; rpc_sysfs_xprt_setup(xps, xprt, gfp_flags); } return xps; } static void xprt_switch_free_entries(struct rpc_xprt_switch *xps) { spin_lock(&xps->xps_lock); while (!list_empty(&xps->xps_xprt_list)) { struct rpc_xprt *xprt; xprt = list_first_entry(&xps->xps_xprt_list, struct rpc_xprt, xprt_switch); xprt_switch_remove_xprt_locked(xps, xprt, true); spin_unlock(&xps->xps_lock); xprt_put(xprt); spin_lock(&xps->xps_lock); } spin_unlock(&xps->xps_lock); } static void xprt_switch_free(struct kref *kref) { struct rpc_xprt_switch *xps = container_of(kref, struct rpc_xprt_switch, xps_kref); xprt_switch_free_entries(xps); rpc_sysfs_xprt_switch_destroy(xps); xprt_switch_free_id(xps); kfree_rcu(xps, xps_rcu); } /** * xprt_switch_get - Return a reference to a rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * * Returns a reference to xps unless the refcount is already zero. */ struct rpc_xprt_switch *xprt_switch_get(struct rpc_xprt_switch *xps) { if (xps != NULL && kref_get_unless_zero(&xps->xps_kref)) return xps; return NULL; } /** * xprt_switch_put - Release a reference to a rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * * Release the reference to xps, and free it once the refcount is zero. */ void xprt_switch_put(struct rpc_xprt_switch *xps) { if (xps != NULL) kref_put(&xps->xps_kref, xprt_switch_free); } /** * rpc_xprt_switch_set_roundrobin - Set a round-robin policy on rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * * Sets a round-robin default policy for iterators acting on xps. */ void rpc_xprt_switch_set_roundrobin(struct rpc_xprt_switch *xps) { if (READ_ONCE(xps->xps_iter_ops) != &rpc_xprt_iter_roundrobin) WRITE_ONCE(xps->xps_iter_ops, &rpc_xprt_iter_roundrobin); } static const struct rpc_xprt_iter_ops *xprt_iter_ops(const struct rpc_xprt_iter *xpi) { if (xpi->xpi_ops != NULL) return xpi->xpi_ops; return rcu_dereference(xpi->xpi_xpswitch)->xps_iter_ops; } static void xprt_iter_no_rewind(struct rpc_xprt_iter *xpi) { } static void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi) { WRITE_ONCE(xpi->xpi_cursor, NULL); } static bool xprt_is_active(const struct rpc_xprt *xprt) { return (kref_read(&xprt->kref) != 0 && !test_bit(XPRT_OFFLINE, &xprt->state)); } static struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head) { struct rpc_xprt *pos; list_for_each_entry_rcu(pos, head, xprt_switch) { if (xprt_is_active(pos)) return pos; } return NULL; } static struct rpc_xprt *xprt_switch_find_first_entry_offline(struct list_head *head) { struct rpc_xprt *pos; list_for_each_entry_rcu(pos, head, xprt_switch) { if (!xprt_is_active(pos)) return pos; } return NULL; } static struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); if (xps == NULL) return NULL; return xprt_switch_find_first_entry(&xps->xps_xprt_list); } static struct rpc_xprt *_xprt_switch_find_current_entry(struct list_head *head, const struct rpc_xprt *cur, bool find_active) { struct rpc_xprt *pos; bool found = false; list_for_each_entry_rcu(pos, head, xprt_switch) { if (cur == pos) found = true; if (found && ((find_active && xprt_is_active(pos)) || (!find_active && !xprt_is_active(pos)))) return pos; } return NULL; } static struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head, const struct rpc_xprt *cur) { return _xprt_switch_find_current_entry(head, cur, true); } static struct rpc_xprt * _xprt_iter_current_entry(struct rpc_xprt_iter *xpi, struct rpc_xprt *first_entry(struct list_head *head), struct rpc_xprt *current_entry(struct list_head *head, const struct rpc_xprt *cur)) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); struct list_head *head; if (xps == NULL) return NULL; head = &xps->xps_xprt_list; if (xpi->xpi_cursor == NULL || xps->xps_nxprts < 2) return first_entry(head); return current_entry(head, xpi->xpi_cursor); } static struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) { return _xprt_iter_current_entry(xpi, xprt_switch_find_first_entry, xprt_switch_find_current_entry); } static struct rpc_xprt *xprt_switch_find_current_entry_offline(struct list_head *head, const struct rpc_xprt *cur) { return _xprt_switch_find_current_entry(head, cur, false); } static struct rpc_xprt *xprt_iter_current_entry_offline(struct rpc_xprt_iter *xpi) { return _xprt_iter_current_entry(xpi, xprt_switch_find_first_entry_offline, xprt_switch_find_current_entry_offline); } static bool __rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, const struct sockaddr *sap) { struct list_head *head; struct rpc_xprt *pos; if (xps == NULL || sap == NULL) return false; head = &xps->xps_xprt_list; list_for_each_entry_rcu(pos, head, xprt_switch) { if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) { pr_info("RPC: addr %s already in xprt switch\n", pos->address_strings[RPC_DISPLAY_ADDR]); return true; } } return false; } bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, const struct sockaddr *sap) { bool res; rcu_read_lock(); res = __rpc_xprt_switch_has_addr(xps, sap); rcu_read_unlock(); return res; } static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, const struct rpc_xprt *cur, bool check_active) { struct rpc_xprt *pos, *prev = NULL; bool found = false; list_for_each_entry_rcu(pos, head, xprt_switch) { if (cur == prev) found = true; /* for request to return active transports return only * active, for request to return offline transports * return only offline */ if (found && ((check_active && xprt_is_active(pos)) || (!check_active && !xprt_is_active(pos)))) return pos; prev = pos; } return NULL; } static struct rpc_xprt *xprt_switch_set_next_cursor(struct rpc_xprt_switch *xps, struct rpc_xprt **cursor, xprt_switch_find_xprt_t find_next) { struct rpc_xprt *pos, *old; old = smp_load_acquire(cursor); pos = find_next(xps, old); smp_store_release(cursor, pos); return pos; } static struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi, xprt_switch_find_xprt_t find_next) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); if (xps == NULL) return NULL; return xprt_switch_set_next_cursor(xps, &xpi->xpi_cursor, find_next); } static struct rpc_xprt *__xprt_switch_find_next_entry_roundrobin(struct list_head *head, const struct rpc_xprt *cur) { struct rpc_xprt *ret; ret = xprt_switch_find_next_entry(head, cur, true); if (ret != NULL) return ret; return xprt_switch_find_first_entry(head); } static struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct rpc_xprt_switch *xps, const struct rpc_xprt *cur) { struct list_head *head = &xps->xps_xprt_list; struct rpc_xprt *xprt; unsigned int nactive; for (;;) { unsigned long xprt_queuelen, xps_queuelen; xprt = __xprt_switch_find_next_entry_roundrobin(head, cur); if (!xprt) break; xprt_queuelen = atomic_long_read(&xprt->queuelen); xps_queuelen = atomic_long_read(&xps->xps_queuelen); nactive = READ_ONCE(xps->xps_nactive); /* Exit loop if xprt_queuelen <= average queue length */ if (xprt_queuelen * nactive <= xps_queuelen) break; cur = xprt; } return xprt; } static struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi) { return xprt_iter_next_entry_multiple(xpi, xprt_switch_find_next_entry_roundrobin); } static struct rpc_xprt *xprt_switch_find_next_entry_all(struct rpc_xprt_switch *xps, const struct rpc_xprt *cur) { return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, true); } static struct rpc_xprt *xprt_switch_find_next_entry_offline(struct rpc_xprt_switch *xps, const struct rpc_xprt *cur) { return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, false); } static struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi) { return xprt_iter_next_entry_multiple(xpi, xprt_switch_find_next_entry_all); } static struct rpc_xprt *xprt_iter_next_entry_offline(struct rpc_xprt_iter *xpi) { return xprt_iter_next_entry_multiple(xpi, xprt_switch_find_next_entry_offline); } /* * xprt_iter_rewind - Resets the xprt iterator * @xpi: pointer to rpc_xprt_iter * * Resets xpi to ensure that it points to the first entry in the list * of transports. */ void xprt_iter_rewind(struct rpc_xprt_iter *xpi) { rcu_read_lock(); xprt_iter_ops(xpi)->xpi_rewind(xpi); rcu_read_unlock(); } static void __xprt_iter_init(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps, const struct rpc_xprt_iter_ops *ops) { rcu_assign_pointer(xpi->xpi_xpswitch, xprt_switch_get(xps)); xpi->xpi_cursor = NULL; xpi->xpi_ops = ops; } /** * xprt_iter_init - Initialise an xprt iterator * @xpi: pointer to rpc_xprt_iter * @xps: pointer to rpc_xprt_switch * * Initialises the iterator to use the default iterator ops * as set in xps. This function is mainly intended for internal * use in the rpc_client. */ void xprt_iter_init(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps) { __xprt_iter_init(xpi, xps, NULL); } /** * xprt_iter_init_listall - Initialise an xprt iterator * @xpi: pointer to rpc_xprt_iter * @xps: pointer to rpc_xprt_switch * * Initialises the iterator to iterate once through the entire list * of entries in xps. */ void xprt_iter_init_listall(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps) { __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listall); } void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps) { __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listoffline); } /** * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch * @xpi: pointer to rpc_xprt_iter * @newswitch: pointer to a new rpc_xprt_switch or NULL * * Swaps out the existing xpi->xpi_xpswitch with a new value. */ struct rpc_xprt_switch *xprt_iter_xchg_switch(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *newswitch) { struct rpc_xprt_switch __rcu *oldswitch; /* Atomically swap out the old xpswitch */ oldswitch = xchg(&xpi->xpi_xpswitch, RCU_INITIALIZER(newswitch)); if (newswitch != NULL) xprt_iter_rewind(xpi); return rcu_dereference_protected(oldswitch, true); } /** * xprt_iter_destroy - Destroys the xprt iterator * @xpi: pointer to rpc_xprt_iter */ void xprt_iter_destroy(struct rpc_xprt_iter *xpi) { xprt_switch_put(xprt_iter_xchg_switch(xpi, NULL)); } /** * xprt_iter_xprt - Returns the rpc_xprt pointed to by the cursor * @xpi: pointer to rpc_xprt_iter * * Returns a pointer to the struct rpc_xprt that is currently * pointed to by the cursor. * Caller must be holding rcu_read_lock(). */ struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi) { WARN_ON_ONCE(!rcu_read_lock_held()); return xprt_iter_ops(xpi)->xpi_xprt(xpi); } static struct rpc_xprt *xprt_iter_get_helper(struct rpc_xprt_iter *xpi, struct rpc_xprt *(*fn)(struct rpc_xprt_iter *)) { struct rpc_xprt *ret; do { ret = fn(xpi); if (ret == NULL) break; ret = xprt_get(ret); } while (ret == NULL); return ret; } /** * xprt_iter_get_xprt - Returns the rpc_xprt pointed to by the cursor * @xpi: pointer to rpc_xprt_iter * * Returns a reference to the struct rpc_xprt that is currently * pointed to by the cursor. */ struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi) { struct rpc_xprt *xprt; rcu_read_lock(); xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_xprt); rcu_read_unlock(); return xprt; } /** * xprt_iter_get_next - Returns the next rpc_xprt following the cursor * @xpi: pointer to rpc_xprt_iter * * Returns a reference to the struct rpc_xprt that immediately follows the * entry pointed to by the cursor. */ struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi) { struct rpc_xprt *xprt; rcu_read_lock(); xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_next); rcu_read_unlock(); return xprt; } /* Policy for always returning the first entry in the rpc_xprt_switch */ static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular = { .xpi_rewind = xprt_iter_no_rewind, .xpi_xprt = xprt_iter_first_entry, .xpi_next = xprt_iter_first_entry, }; /* Policy for round-robin iteration of entries in the rpc_xprt_switch */ static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin = { .xpi_rewind = xprt_iter_default_rewind, .xpi_xprt = xprt_iter_current_entry, .xpi_next = xprt_iter_next_entry_roundrobin, }; /* Policy for once-through iteration of entries in the rpc_xprt_switch */ static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall = { .xpi_rewind = xprt_iter_default_rewind, .xpi_xprt = xprt_iter_current_entry, .xpi_next = xprt_iter_next_entry_all, }; static const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline = { .xpi_rewind = xprt_iter_default_rewind, .xpi_xprt = xprt_iter_current_entry_offline, .xpi_next = xprt_iter_next_entry_offline, }; |
1 1 25 24 1 15 15 24 7 5 2 4 6 7 7 6 2 35 35 3 3 3 26 8 7 8 1 2 4 4 4 4 4 14 13 7 8 139 64 76 17 76 78 124 26 140 78 35 42 140 4 77 25 90 161 163 163 162 164 161 164 161 27 162 128 124 124 72 35 77 140 1 1 25 25 26 26 26 1 25 74 6 143 10 1 1 1 1 8 126 128 128 23 23 8 23 78 24 78 123 35 3 26 1 7 4 26 8 10 5 7 7 38 26 89 90 26 26 164 44 164 163 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * * This file is part of the SCTP kernel implementation * * These functions work with the state functions in sctp_sm_statefuns.c * to implement that state operations. These functions implement the * steps which require modifying existing data structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@austin.ibm.com> * Hui Huang <hui.huang@nokia.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/skbuff.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/ip.h> #include <linux/gfp.h> #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp); static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp); /******************************************************************** * Helper functions ********************************************************************/ /* A helper function for delayed processing of INET ECN CE bit. */ static void sctp_do_ecn_ce_work(struct sctp_association *asoc, __u32 lowest_tsn) { /* Save the TSN away for comparison when we receive CWR */ asoc->last_ecne_tsn = lowest_tsn; asoc->need_ecne = 1; } /* Helper function for delayed processing of SCTP ECNE chunk. */ /* RFC 2960 Appendix A * * RFC 2481 details a specific bit for a sender to send in * the header of its next outbound TCP segment to indicate to * its peer that it has reduced its congestion window. This * is termed the CWR bit. For SCTP the same indication is made * by including the CWR chunk. This chunk contains one data * element, i.e. the TSN number that was sent in the ECNE chunk. * This element represents the lowest TSN number in the datagram * that was originally marked with the CE bit. */ static struct sctp_chunk *sctp_do_ecn_ecne_work(struct sctp_association *asoc, __u32 lowest_tsn, struct sctp_chunk *chunk) { struct sctp_chunk *repl; /* Our previously transmitted packet ran into some congestion * so we should take action by reducing cwnd and ssthresh * and then ACK our peer that we we've done so by * sending a CWR. */ /* First, try to determine if we want to actually lower * our cwnd variables. Only lower them if the ECNE looks more * recent than the last response. */ if (TSN_lt(asoc->last_cwr_tsn, lowest_tsn)) { struct sctp_transport *transport; /* Find which transport's congestion variables * need to be adjusted. */ transport = sctp_assoc_lookup_tsn(asoc, lowest_tsn); /* Update the congestion variables. */ if (transport) sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_ECNE); asoc->last_cwr_tsn = lowest_tsn; } /* Always try to quiet the other end. In case of lost CWR, * resend last_cwr_tsn. */ repl = sctp_make_cwr(asoc, asoc->last_cwr_tsn, chunk); /* If we run out of memory, it will look like a lost CWR. We'll * get back in sync eventually. */ return repl; } /* Helper function to do delayed processing of ECN CWR chunk. */ static void sctp_do_ecn_cwr_work(struct sctp_association *asoc, __u32 lowest_tsn) { /* Turn off ECNE getting auto-prepended to every outgoing * packet */ asoc->need_ecne = 0; } /* Generate SACK if necessary. We call this at the end of a packet. */ static int sctp_gen_sack(struct sctp_association *asoc, int force, struct sctp_cmd_seq *commands) { struct sctp_transport *trans = asoc->peer.last_data_from; __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; int error = 0; if (force || (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) || (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE))) asoc->peer.sack_needed = 1; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); /* From 12.2 Parameters necessary per association (i.e. the TCB): * * Ack State : This flag indicates if the next received packet * : is to be responded to with a SACK. ... * : When DATA chunks are out of order, SACK's * : are not delayed (see Section 6). * * [This is actually not mentioned in Section 6, but we * implement it here anyway. --piggy] */ if (max_tsn_seen != ctsn) asoc->peer.sack_needed = 1; /* From 6.2 Acknowledgement on Reception of DATA Chunks: * * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, * an acknowledgement SHOULD be generated for at least every * second packet (not every second DATA chunk) received, and * SHOULD be generated within 200 ms of the arrival of any * unacknowledged DATA chunk. ... */ if (!asoc->peer.sack_needed) { asoc->peer.sack_cnt++; /* Set the SACK delay timeout based on the * SACK delay for the last transport * data was received from, or the default * for the association. */ if (trans) { /* We will need a SACK for the next packet. */ if (asoc->peer.sack_cnt >= trans->sackfreq - 1) asoc->peer.sack_needed = 1; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = trans->sackdelay; } else { /* We will need a SACK for the next packet. */ if (asoc->peer.sack_cnt >= asoc->sackfreq - 1) asoc->peer.sack_needed = 1; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; } /* Restart the SACK timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } else { __u32 old_a_rwnd = asoc->a_rwnd; asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); if (!sack) { asoc->a_rwnd = old_a_rwnd; goto nomem; } asoc->peer.sack_needed = 0; asoc->peer.sack_cnt = 0; sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(sack)); /* Stop the SACK timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } return error; nomem: error = -ENOMEM; return error; } /* When the T3-RTX timer expires, it calls this function to create the * relevant state machine event. */ void sctp_generate_t3_rtx_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, T3_rtx_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error; /* Check whether a task is in the sock. */ bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->T3_rtx_timer, jiffies + (HZ/20))) sctp_transport_hold(transport); goto out_unlock; } /* Run through the state machine. */ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* This is a sa interface for producing timeout events. It works * for timeouts which use the association as their parameter. */ static void sctp_generate_timeout_event(struct sctp_association *asoc, enum sctp_event_timeout timeout_type) { struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy: timer %d\n", __func__, timeout_type); /* Try again later. */ if (!mod_timer(&asoc->timers[timeout_type], jiffies + (HZ/20))) sctp_association_hold(asoc); goto out_unlock; } /* Is this association really dead and just waiting around for * the timer to let go of the reference? */ if (asoc->base.dead) goto out_unlock; /* Run through the state machine. */ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(timeout_type), asoc->state, asoc->ep, asoc, (void *)timeout_type, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_association_put(asoc); } static void sctp_generate_t1_cookie_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_COOKIE]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE); } static void sctp_generate_t1_init_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_INIT]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT); } static void sctp_generate_t2_shutdown_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN); } static void sctp_generate_t4_rto_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T4_RTO]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO); } static void sctp_generate_t5_shutdown_guard_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD); } /* sctp_generate_t5_shutdown_guard_event() */ static void sctp_generate_autoclose_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE); } /* Generate a heart beat event. If the sock is busy, reschedule. Make * sure that the transport is still valid. */ void sctp_generate_heartbeat_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, hb_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); u32 elapsed, timeout; int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->hb_timer, jiffies + (HZ/20))) sctp_transport_hold(transport); goto out_unlock; } /* Check if we should still send the heartbeat or reschedule */ elapsed = jiffies - transport->last_time_sent; timeout = sctp_transport_timeout(transport); if (elapsed < timeout) { elapsed = timeout - elapsed; if (!mod_timer(&transport->hb_timer, jiffies + elapsed)) sctp_transport_hold(transport); goto out_unlock; } error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Handle the timeout of the ICMP protocol unreachable timer. Trigger * the correct state machine transition that will close the association. */ void sctp_generate_proto_unreach_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, proto_unreach_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->proto_unreach_timer, jiffies + (HZ/20))) sctp_transport_hold(transport); goto out_unlock; } /* Is this structure just waiting around for us to actually * get destroyed? */ if (asoc->base.dead) goto out_unlock; sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Handle the timeout of the RE-CONFIG timer. */ void sctp_generate_reconf_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, reconf_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20))) sctp_transport_hold(transport); goto out_unlock; } /* This happens when the response arrives after the timer is triggered. */ if (!asoc->strreset_chunk) goto out_unlock; error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Handle the timeout of the probe timer. */ void sctp_generate_probe_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, probe_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->probe_timer, jiffies + (HZ / 20))) sctp_transport_hold(transport); goto out_unlock; } error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_PROBE), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Inject a SACK Timeout event into the state machine. */ static void sctp_generate_sack_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_SACK]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK); } sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { [SCTP_EVENT_TIMEOUT_NONE] = NULL, [SCTP_EVENT_TIMEOUT_T1_COOKIE] = sctp_generate_t1_cookie_event, [SCTP_EVENT_TIMEOUT_T1_INIT] = sctp_generate_t1_init_event, [SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = sctp_generate_t2_shutdown_event, [SCTP_EVENT_TIMEOUT_T3_RTX] = NULL, [SCTP_EVENT_TIMEOUT_T4_RTO] = sctp_generate_t4_rto_event, [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] = sctp_generate_t5_shutdown_guard_event, [SCTP_EVENT_TIMEOUT_HEARTBEAT] = NULL, [SCTP_EVENT_TIMEOUT_RECONF] = NULL, [SCTP_EVENT_TIMEOUT_SACK] = sctp_generate_sack_event, [SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sctp_generate_autoclose_event, }; /* RFC 2960 8.2 Path Failure Detection * * When its peer endpoint is multi-homed, an endpoint should keep a * error counter for each of the destination transport addresses of the * peer endpoint. * * Each time the T3-rtx timer expires on any address, or when a * HEARTBEAT sent to an idle address is not acknowledged within a RTO, * the error counter of that destination address will be incremented. * When the value in the error counter exceeds the protocol parameter * 'Path.Max.Retrans' of that destination address, the endpoint should * mark the destination transport address as inactive, and a * notification SHOULD be sent to the upper layer. * */ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, struct sctp_association *asoc, struct sctp_transport *transport, int is_hb) { /* The check for association's overall error counter exceeding the * threshold is done in the state function. */ /* We are here due to a timer expiration. If the timer was * not a HEARTBEAT, then normal error tracking is done. * If the timer was a heartbeat, we only increment error counts * when we already have an outstanding HEARTBEAT that has not * been acknowledged. * Additionally, some tranport states inhibit error increments. */ if (!is_hb) { asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE) transport->error_count++; } else if (transport->hb_sent) { if (transport->state != SCTP_UNCONFIRMED) asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE) transport->error_count++; } /* If the transport error count is greater than the pf_retrans * threshold, and less than pathmaxrtx, and if the current state * is SCTP_ACTIVE, then mark this transport as Partially Failed, * see SCTP Quick Failover Draft, section 5.1 */ if (asoc->base.net->sctp.pf_enable && transport->state == SCTP_ACTIVE && transport->error_count < transport->pathmaxrxt && transport->error_count > transport->pf_retrans) { sctp_assoc_control_transport(asoc, transport, SCTP_TRANSPORT_PF, 0); /* Update the hb timer to resend a heartbeat every rto */ sctp_transport_reset_hb_timer(transport); } if (transport->state != SCTP_INACTIVE && (transport->error_count > transport->pathmaxrxt)) { pr_debug("%s: association:%p transport addr:%pISpc failed\n", __func__, asoc, &transport->ipaddr.sa); sctp_assoc_control_transport(asoc, transport, SCTP_TRANSPORT_DOWN, SCTP_FAILED_THRESHOLD); } if (transport->error_count > transport->ps_retrans && asoc->peer.primary_path == transport && asoc->peer.active_path != transport) sctp_assoc_set_primary(asoc, asoc->peer.active_path); /* E2) For the destination address for which the timer * expires, set RTO <- RTO * 2 ("back off the timer"). The * maximum value discussed in rule C7 above (RTO.max) may be * used to provide an upper bound to this doubling operation. * * Special Case: the first HB doesn't trigger exponential backoff. * The first unacknowledged HB triggers it. We do this with a flag * that indicates that we have an outstanding HB. */ if (!is_hb || transport->hb_sent) { transport->rto = min((transport->rto * 2), transport->asoc->rto_max); sctp_max_rto(asoc, transport); } } /* Worker routine to handle INIT command failure. */ static void sctp_cmd_init_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, unsigned int error) { struct sctp_ulpevent *event; event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_CANT_STR_ASSOC, (__u16)error, 0, 0, NULL, GFP_ATOMIC); if (event) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(event)); sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_CLOSED)); /* SEND_FAILED sent later when cleaning up the association. */ asoc->outqueue.error = error; sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); } /* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, enum sctp_event_type event_type, union sctp_subtype subtype, struct sctp_chunk *chunk, unsigned int error) { struct sctp_ulpevent *event; struct sctp_chunk *abort; /* Cancel any partial delivery in progress. */ asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC); if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT) event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, (__u16)error, 0, 0, chunk, GFP_ATOMIC); else event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, (__u16)error, 0, 0, NULL, GFP_ATOMIC); if (event) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(event)); if (asoc->overall_error_count >= asoc->max_retrans) { abort = sctp_make_violation_max_retrans(asoc, chunk); if (abort) sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); } sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_CLOSED)); /* SEND_FAILED sent later when cleaning up the association. */ asoc->outqueue.error = error; sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); } /* Process an init chunk (may be real INIT/INIT-ACK or an embedded INIT * inside the cookie. In reality, this is only used for INIT-ACK processing * since all other cases use "temporary" associations and can do all * their work in statefuns directly. */ static int sctp_cmd_process_init(struct sctp_cmd_seq *commands, struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_init_chunk *peer_init, gfp_t gfp) { int error; /* We only process the init as a sideeffect in a single * case. This is when we process the INIT-ACK. If we * fail during INIT processing (due to malloc problems), * just return the error and stop processing the stack. */ if (!sctp_process_init(asoc, chunk, sctp_source(chunk), peer_init, gfp)) error = -ENOMEM; else error = 0; return error; } /* Helper function to break out starting up of heartbeat timers. */ static void sctp_cmd_hb_timers_start(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; /* Start a heartbeat timer for each transport on the association. * hold a reference on the transport to make sure none of * the needed data structures go away. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) sctp_transport_reset_hb_timer(t); } static void sctp_cmd_hb_timers_stop(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; /* Stop all heartbeat timers. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (del_timer(&t->hb_timer)) sctp_transport_put(t); } } /* Helper function to stop any pending T3-RTX timers */ static void sctp_cmd_t3_rtx_timers_stop(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (del_timer(&t->T3_rtx_timer)) sctp_transport_put(t); } } /* Helper function to handle the reception of an HEARTBEAT ACK. */ static void sctp_cmd_transport_on(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_transport *t, struct sctp_chunk *chunk) { struct sctp_sender_hb_info *hbinfo; int was_unconfirmed = 0; /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the * HEARTBEAT should clear the error counter of the destination * transport address to which the HEARTBEAT was sent. */ t->error_count = 0; /* * Although RFC4960 specifies that the overall error count must * be cleared when a HEARTBEAT ACK is received, we make an * exception while in SHUTDOWN PENDING. If the peer keeps its * window shut forever, we may never be able to transmit our * outstanding data and rely on the retransmission limit be reached * to shutdown the association. */ if (t->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) t->asoc->overall_error_count = 0; /* Clear the hb_sent flag to signal that we had a good * acknowledgement. */ t->hb_sent = 0; /* Mark the destination transport address as active if it is not so * marked. */ if ((t->state == SCTP_INACTIVE) || (t->state == SCTP_UNCONFIRMED)) { was_unconfirmed = 1; sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, SCTP_HEARTBEAT_SUCCESS); } if (t->state == SCTP_PF) sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, SCTP_HEARTBEAT_SUCCESS); /* HB-ACK was received for a the proper HB. Consider this * forward progress. */ if (t->dst) sctp_transport_dst_confirm(t); /* The receiver of the HEARTBEAT ACK should also perform an * RTT measurement for that destination transport address * using the time value carried in the HEARTBEAT ACK chunk. * If the transport's rto_pending variable has been cleared, * it was most likely due to a retransmit. However, we want * to re-enable it to properly update the rto. */ if (t->rto_pending == 0) t->rto_pending = 1; hbinfo = (struct sctp_sender_hb_info *)chunk->skb->data; sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at)); /* Update the heartbeat timer. */ sctp_transport_reset_hb_timer(t); if (was_unconfirmed && asoc->peer.transport_count == 1) sctp_transport_immediate_rtx(t); } /* Helper function to process the process SACK command. */ static int sctp_cmd_process_sack(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { int err = 0; if (sctp_outq_sack(&asoc->outqueue, chunk)) { /* There are no more TSNs awaiting SACK. */ err = sctp_do_sm(asoc->base.net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN), asoc->state, asoc->ep, asoc, NULL, GFP_ATOMIC); } return err; } /* Helper function to set the timeout value for T2-SHUTDOWN timer and to set * the transport for a shutdown chunk. */ static void sctp_cmd_setup_t2(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { struct sctp_transport *t; if (chunk->transport) t = chunk->transport; else { t = sctp_assoc_choose_alter_transport(asoc, asoc->shutdown_last_sent_to); chunk->transport = t; } asoc->shutdown_last_sent_to = t; asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto; } /* Helper function to change the state of an association. */ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, enum sctp_state state) { struct sock *sk = asoc->base.sk; asoc->state = state; pr_debug("%s: asoc:%p[%s]\n", __func__, asoc, sctp_state_tbl[state]); if (sctp_style(sk, TCP)) { /* Change the sk->sk_state of a TCP-style socket that has * successfully completed a connect() call. */ if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED)) inet_sk_set_state(sk, SCTP_SS_ESTABLISHED); /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */ if (sctp_state(asoc, SHUTDOWN_RECEIVED) && sctp_sstate(sk, ESTABLISHED)) { inet_sk_set_state(sk, SCTP_SS_CLOSING); sk->sk_shutdown |= RCV_SHUTDOWN; } } if (sctp_state(asoc, COOKIE_WAIT)) { /* Reset init timeouts since they may have been * increased due to timer expirations. */ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; } if (sctp_state(asoc, ESTABLISHED)) { kfree(asoc->peer.cookie); asoc->peer.cookie = NULL; } if (sctp_state(asoc, ESTABLISHED) || sctp_state(asoc, CLOSED) || sctp_state(asoc, SHUTDOWN_RECEIVED)) { /* Wake up any processes waiting in the asoc's wait queue in * sctp_wait_for_connect() or sctp_wait_for_sndbuf(). */ if (waitqueue_active(&asoc->wait)) wake_up_interruptible(&asoc->wait); /* Wake up any processes waiting in the sk's sleep queue of * a TCP-style or UDP-style peeled-off socket in * sctp_wait_for_accept() or sctp_wait_for_packet(). * For a UDP-style socket, the waiters are woken up by the * notifications. */ if (!sctp_style(sk, UDP)) sk->sk_state_change(sk); } if (sctp_state(asoc, SHUTDOWN_PENDING) && !sctp_outq_is_empty(&asoc->outqueue)) sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC); } /* Helper function to delete an association. */ static void sctp_cmd_delete_tcb(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; /* If it is a non-temporary association belonging to a TCP-style * listening socket that is not closed, do not free it so that accept() * can pick it up later. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING) && (!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK)) return; sctp_association_free(asoc); } /* * ADDIP Section 4.1 ASCONF Chunk Procedures * A4) Start a T-4 RTO timer, using the RTO value of the selected * destination address (we use active path instead of primary path just * because primary path may be inactive. */ static void sctp_cmd_setup_t4(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { struct sctp_transport *t; t = sctp_assoc_choose_alter_transport(asoc, chunk->transport); asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = t->rto; chunk->transport = t; } /* Process an incoming Operation Error Chunk. */ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { struct sctp_errhdr *err_hdr; struct sctp_ulpevent *ev; while (chunk->chunk_end > chunk->skb->data) { err_hdr = (struct sctp_errhdr *)(chunk->skb->data); ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, GFP_ATOMIC); if (!ev) return; asoc->stream.si->enqueue_event(&asoc->ulpq, ev); switch (err_hdr->cause) { case SCTP_ERROR_UNKNOWN_CHUNK: { struct sctp_chunkhdr *unk_chunk_hdr; unk_chunk_hdr = (struct sctp_chunkhdr *)(err_hdr + 1); switch (unk_chunk_hdr->type) { /* ADDIP 4.1 A9) If the peer responds to an ASCONF with * an ERROR chunk reporting that it did not recognized * the ASCONF chunk type, the sender of the ASCONF MUST * NOT send any further ASCONF chunks and MUST stop its * T-4 timer. */ case SCTP_CID_ASCONF: if (asoc->peer.asconf_capable == 0) break; asoc->peer.asconf_capable = 0; sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); break; default: break; } break; } default: break; } } } /* Helper function to remove the association non-primary peer * transports. */ static void sctp_cmd_del_non_primary(struct sctp_association *asoc) { struct sctp_transport *t; struct list_head *temp; struct list_head *pos; list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { t = list_entry(pos, struct sctp_transport, transports); if (!sctp_cmp_addr_exact(&t->ipaddr, &asoc->peer.primary_addr)) { sctp_assoc_rm_peer(asoc, t); } } } /* Helper function to set sk_err on a 1-1 style socket. */ static void sctp_cmd_set_sk_err(struct sctp_association *asoc, int error) { struct sock *sk = asoc->base.sk; if (!sctp_style(sk, UDP)) sk->sk_err = error; } /* Helper function to generate an association change event */ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands, struct sctp_association *asoc, u8 state) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_assoc_change(asoc, 0, state, 0, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, NULL, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } static void sctp_cmd_peer_no_auth(struct sctp_cmd_seq *commands, struct sctp_association *asoc) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } /* Helper function to generate an adaptation indication event */ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands, struct sctp_association *asoc) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } static void sctp_cmd_t1_timer_update(struct sctp_association *asoc, enum sctp_event_timeout timer, char *name) { struct sctp_transport *t; t = asoc->init_last_sent_to; asoc->init_err_counter++; if (t->init_sent_count > (asoc->init_cycle + 1)) { asoc->timeouts[timer] *= 2; if (asoc->timeouts[timer] > asoc->max_init_timeo) { asoc->timeouts[timer] = asoc->max_init_timeo; } asoc->init_cycle++; pr_debug("%s: T1[%s] timeout adjustment init_err_counter:%d" " cycle:%d timeout:%ld\n", __func__, name, asoc->init_err_counter, asoc->init_cycle, asoc->timeouts[timer]); } } /* Send the whole message, chunk by chunk, to the outqueue. * This way the whole message is queued up and bundling if * encouraged for small fragments. */ static void sctp_cmd_send_msg(struct sctp_association *asoc, struct sctp_datamsg *msg, gfp_t gfp) { struct sctp_chunk *chunk; list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_outq_tail(&asoc->outqueue, chunk, gfp); asoc->outqueue.sched->enqueue(&asoc->outqueue, msg); } /* These three macros allow us to pull the debugging code out of the * main flow of sctp_do_sm() to keep attention focused on the real * functionality there. */ #define debug_pre_sfn() \ pr_debug("%s[pre-fn]: ep:%p, %s, %s, asoc:%p[%s], %s\n", __func__, \ ep, sctp_evttype_tbl[event_type], (*debug_fn)(subtype), \ asoc, sctp_state_tbl[state], state_fn->name) #define debug_post_sfn() \ pr_debug("%s[post-fn]: asoc:%p, status:%s\n", __func__, asoc, \ sctp_status_tbl[status]) #define debug_post_sfx() \ pr_debug("%s[post-sfx]: error:%d, asoc:%p[%s]\n", __func__, error, \ asoc, sctp_state_tbl[(asoc && sctp_id2assoc(ep->base.sk, \ sctp_assoc2id(asoc))) ? asoc->state : SCTP_STATE_CLOSED]) /* * This is the master state machine processing function. * * If you want to understand all of lksctp, this is a * good place to start. */ int sctp_do_sm(struct net *net, enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp) { typedef const char *(printfn_t)(union sctp_subtype); static printfn_t *table[] = { NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname, }; printfn_t *debug_fn __attribute__ ((unused)) = table[event_type]; const struct sctp_sm_table_entry *state_fn; struct sctp_cmd_seq commands; enum sctp_disposition status; int error = 0; /* Look up the state function, run it, and then process the * side effects. These three steps are the heart of lksctp. */ state_fn = sctp_sm_lookup_event(net, event_type, state, subtype); sctp_init_cmd_seq(&commands); debug_pre_sfn(); status = state_fn->fn(net, ep, asoc, subtype, event_arg, &commands); debug_post_sfn(); error = sctp_side_effects(event_type, subtype, state, ep, &asoc, event_arg, status, &commands, gfp); debug_post_sfx(); return error; } /***************************************************************** * This the master state function side effect processing function. *****************************************************************/ static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp) { int error; /* FIXME - Most of the dispositions left today would be categorized * as "exceptional" dispositions. For those dispositions, it * may not be proper to run through any of the commands at all. * For example, the command interpreter might be run only with * disposition SCTP_DISPOSITION_CONSUME. */ if (0 != (error = sctp_cmd_interpreter(event_type, subtype, state, ep, *asoc, event_arg, status, commands, gfp))) goto bail; switch (status) { case SCTP_DISPOSITION_DISCARD: pr_debug("%s: ignored sctp protocol event - state:%d, " "event_type:%d, event_id:%d\n", __func__, state, event_type, subtype.chunk); break; case SCTP_DISPOSITION_NOMEM: /* We ran out of memory, so we need to discard this * packet. */ /* BUG--we should now recover some memory, probably by * reneging... */ error = -ENOMEM; break; case SCTP_DISPOSITION_DELETE_TCB: case SCTP_DISPOSITION_ABORT: /* This should now be a command. */ *asoc = NULL; break; case SCTP_DISPOSITION_CONSUME: /* * We should no longer have much work to do here as the * real work has been done as explicit commands above. */ break; case SCTP_DISPOSITION_VIOLATION: net_err_ratelimited("protocol violation state %d chunkid %d\n", state, subtype.chunk); break; case SCTP_DISPOSITION_NOT_IMPL: pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n", state, event_type, subtype.chunk); break; case SCTP_DISPOSITION_BUG: pr_err("bug in state %d, event_type %d, event_id %d\n", state, event_type, subtype.chunk); BUG(); break; default: pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n", status, state, event_type, subtype.chunk); error = status; if (error >= 0) error = -EINVAL; WARN_ON_ONCE(1); break; } bail: return error; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This is the side-effect interpreter. */ static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp) { struct sctp_sock *sp = sctp_sk(ep->base.sk); struct sctp_chunk *chunk = NULL, *new_obj; struct sctp_packet *packet; struct sctp_sackhdr sackh; struct timer_list *timer; struct sctp_transport *t; unsigned long timeout; struct sctp_cmd *cmd; int local_cork = 0; int error = 0; int force; if (SCTP_EVENT_T_TIMEOUT != event_type) chunk = event_arg; /* Note: This whole file is a huge candidate for rework. * For example, each command could either have its own handler, so * the loop would look like: * while (cmds) * cmd->handle(x, y, z) * --jgrimm */ while (NULL != (cmd = sctp_next_cmd(commands))) { switch (cmd->verb) { case SCTP_CMD_NOP: /* Do nothing. */ break; case SCTP_CMD_NEW_ASOC: /* Register a new association. */ if (local_cork) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } /* Register with the endpoint. */ asoc = cmd->obj.asoc; BUG_ON(asoc->peer.primary_path == NULL); sctp_endpoint_add_asoc(ep, asoc); break; case SCTP_CMD_PURGE_OUTQUEUE: sctp_outq_teardown(&asoc->outqueue); break; case SCTP_CMD_DELETE_TCB: if (local_cork) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } /* Delete the current association. */ sctp_cmd_delete_tcb(commands, asoc); asoc = NULL; break; case SCTP_CMD_NEW_STATE: /* Enter a new state. */ sctp_cmd_new_state(commands, asoc, cmd->obj.state); break; case SCTP_CMD_REPORT_TSN: /* Record the arrival of a TSN. */ error = sctp_tsnmap_mark(&asoc->peer.tsn_map, cmd->obj.u32, NULL); break; case SCTP_CMD_REPORT_FWDTSN: asoc->stream.si->report_ftsn(&asoc->ulpq, cmd->obj.u32); break; case SCTP_CMD_PROCESS_FWDTSN: asoc->stream.si->handle_ftsn(&asoc->ulpq, cmd->obj.chunk); break; case SCTP_CMD_GEN_SACK: /* Generate a Selective ACK. * The argument tells us whether to just count * the packet and MAYBE generate a SACK, or * force a SACK out. */ force = cmd->obj.i32; error = sctp_gen_sack(asoc, force, commands); break; case SCTP_CMD_PROCESS_SACK: /* Process an inbound SACK. */ error = sctp_cmd_process_sack(commands, asoc, cmd->obj.chunk); break; case SCTP_CMD_GEN_INIT_ACK: /* Generate an INIT ACK chunk. */ new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC, 0); if (!new_obj) { error = -ENOMEM; break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; case SCTP_CMD_PEER_INIT: /* Process a unified INIT from the peer. * Note: Only used during INIT-ACK processing. If * there is an error just return to the outter * layer which will bail. */ error = sctp_cmd_process_init(commands, asoc, chunk, cmd->obj.init, gfp); break; case SCTP_CMD_GEN_COOKIE_ECHO: /* Generate a COOKIE ECHO chunk. */ new_obj = sctp_make_cookie_echo(asoc, chunk); if (!new_obj) { if (cmd->obj.chunk) sctp_chunk_free(cmd->obj.chunk); error = -ENOMEM; break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); /* If there is an ERROR chunk to be sent along with * the COOKIE_ECHO, send it, too. */ if (cmd->obj.chunk) sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(cmd->obj.chunk)); if (new_obj->transport) { new_obj->transport->init_sent_count++; asoc->init_last_sent_to = new_obj->transport; } /* FIXME - Eventually come up with a cleaner way to * enabling COOKIE-ECHO + DATA bundling during * multihoming stale cookie scenarios, the following * command plays with asoc->peer.retran_path to * avoid the problem of sending the COOKIE-ECHO and * DATA in different paths, which could result * in the association being ABORTed if the DATA chunk * is processed first by the server. Checking the * init error counter simply causes this command * to be executed only during failed attempts of * association establishment. */ if ((asoc->peer.retran_path != asoc->peer.primary_path) && (asoc->init_err_counter > 0)) { sctp_add_cmd_sf(commands, SCTP_CMD_FORCE_PRIM_RETRAN, SCTP_NULL()); } break; case SCTP_CMD_GEN_SHUTDOWN: /* Generate SHUTDOWN when in SHUTDOWN_SENT state. * Reset error counts. */ asoc->overall_error_count = 0; /* Generate a SHUTDOWN chunk. */ new_obj = sctp_make_shutdown(asoc, chunk); if (!new_obj) { error = -ENOMEM; break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; case SCTP_CMD_CHUNK_ULP: /* Send a chunk to the sockets layer. */ pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n", __func__, cmd->obj.chunk, &asoc->ulpq); asoc->stream.si->ulpevent_data(&asoc->ulpq, cmd->obj.chunk, GFP_ATOMIC); break; case SCTP_CMD_EVENT_ULP: /* Send a notification to the sockets layer. */ pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n", __func__, cmd->obj.ulpevent, &asoc->ulpq); asoc->stream.si->enqueue_event(&asoc->ulpq, cmd->obj.ulpevent); break; case SCTP_CMD_REPLY: /* If an caller has not already corked, do cork. */ if (!asoc->outqueue.cork) { sctp_outq_cork(&asoc->outqueue); local_cork = 1; } /* Send a chunk to our peer. */ sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp); break; case SCTP_CMD_SEND_PKT: /* Send a full packet to our peer. */ packet = cmd->obj.packet; sctp_packet_transmit(packet, gfp); sctp_ootb_pkt_free(packet); break; case SCTP_CMD_T1_RETRAN: /* Mark a transport for retransmission. */ sctp_retransmit(&asoc->outqueue, cmd->obj.transport, SCTP_RTXR_T1_RTX); break; case SCTP_CMD_RETRAN: /* Mark a transport for retransmission. */ sctp_retransmit(&asoc->outqueue, cmd->obj.transport, SCTP_RTXR_T3_RTX); break; case SCTP_CMD_ECN_CE: /* Do delayed CE processing. */ sctp_do_ecn_ce_work(asoc, cmd->obj.u32); break; case SCTP_CMD_ECN_ECNE: /* Do delayed ECNE processing. */ new_obj = sctp_do_ecn_ecne_work(asoc, cmd->obj.u32, chunk); if (new_obj) sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; case SCTP_CMD_ECN_CWR: /* Do delayed CWR processing. */ sctp_do_ecn_cwr_work(asoc, cmd->obj.u32); break; case SCTP_CMD_SETUP_T2: sctp_cmd_setup_t2(commands, asoc, cmd->obj.chunk); break; case SCTP_CMD_TIMER_START_ONCE: timer = &asoc->timers[cmd->obj.to]; if (timer_pending(timer)) break; fallthrough; case SCTP_CMD_TIMER_START: timer = &asoc->timers[cmd->obj.to]; timeout = asoc->timeouts[cmd->obj.to]; BUG_ON(!timeout); /* * SCTP has a hard time with timer starts. Because we process * timer starts as side effects, it can be hard to tell if we * have already started a timer or not, which leads to BUG * halts when we call add_timer. So here, instead of just starting * a timer, if the timer is already started, and just mod * the timer with the shorter of the two expiration times */ if (!timer_pending(timer)) sctp_association_hold(asoc); timer_reduce(timer, jiffies + timeout); break; case SCTP_CMD_TIMER_RESTART: timer = &asoc->timers[cmd->obj.to]; timeout = asoc->timeouts[cmd->obj.to]; if (!mod_timer(timer, jiffies + timeout)) sctp_association_hold(asoc); break; case SCTP_CMD_TIMER_STOP: timer = &asoc->timers[cmd->obj.to]; if (del_timer(timer)) sctp_association_put(asoc); break; case SCTP_CMD_INIT_CHOOSE_TRANSPORT: chunk = cmd->obj.chunk; t = sctp_assoc_choose_alter_transport(asoc, asoc->init_last_sent_to); asoc->init_last_sent_to = t; chunk->transport = t; t->init_sent_count++; /* Set the new transport as primary */ sctp_assoc_set_primary(asoc, t); break; case SCTP_CMD_INIT_RESTART: /* Do the needed accounting and updates * associated with restarting an initialization * timer. Only multiply the timeout by two if * all transports have been tried at the current * timeout. */ sctp_cmd_t1_timer_update(asoc, SCTP_EVENT_TIMEOUT_T1_INIT, "INIT"); sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); break; case SCTP_CMD_COOKIEECHO_RESTART: /* Do the needed accounting and updates * associated with restarting an initialization * timer. Only multiply the timeout by two if * all transports have been tried at the current * timeout. */ sctp_cmd_t1_timer_update(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE, "COOKIE"); /* If we've sent any data bundled with * COOKIE-ECHO we need to resend. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { sctp_retransmit_mark(&asoc->outqueue, t, SCTP_RTXR_T1_RTX); } sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); break; case SCTP_CMD_INIT_FAILED: sctp_cmd_init_failed(commands, asoc, cmd->obj.u16); break; case SCTP_CMD_ASSOC_FAILED: sctp_cmd_assoc_failed(commands, asoc, event_type, subtype, chunk, cmd->obj.u16); break; case SCTP_CMD_INIT_COUNTER_INC: asoc->init_err_counter++; break; case SCTP_CMD_INIT_COUNTER_RESET: asoc->init_err_counter = 0; asoc->init_cycle = 0; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { t->init_sent_count = 0; } break; case SCTP_CMD_REPORT_DUP: sctp_tsnmap_mark_dup(&asoc->peer.tsn_map, cmd->obj.u32); break; case SCTP_CMD_REPORT_BAD_TAG: pr_debug("%s: vtag mismatch!\n", __func__); break; case SCTP_CMD_STRIKE: /* Mark one strike against a transport. */ sctp_do_8_2_transport_strike(commands, asoc, cmd->obj.transport, 0); break; case SCTP_CMD_TRANSPORT_IDLE: t = cmd->obj.transport; sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE); break; case SCTP_CMD_TRANSPORT_HB_SENT: t = cmd->obj.transport; sctp_do_8_2_transport_strike(commands, asoc, t, 1); t->hb_sent = 1; break; case SCTP_CMD_TRANSPORT_ON: t = cmd->obj.transport; sctp_cmd_transport_on(commands, asoc, t, chunk); break; case SCTP_CMD_HB_TIMERS_START: sctp_cmd_hb_timers_start(commands, asoc); break; case SCTP_CMD_HB_TIMER_UPDATE: t = cmd->obj.transport; sctp_transport_reset_hb_timer(t); break; case SCTP_CMD_HB_TIMERS_STOP: sctp_cmd_hb_timers_stop(commands, asoc); break; case SCTP_CMD_PROBE_TIMER_UPDATE: t = cmd->obj.transport; sctp_transport_reset_probe_timer(t); break; case SCTP_CMD_REPORT_ERROR: error = cmd->obj.error; break; case SCTP_CMD_PROCESS_CTSN: /* Dummy up a SACK for processing. */ sackh.cum_tsn_ack = cmd->obj.be32; sackh.a_rwnd = htonl(asoc->peer.rwnd + asoc->outqueue.outstanding_bytes); sackh.num_gap_ack_blocks = 0; sackh.num_dup_tsns = 0; chunk->subh.sack_hdr = &sackh; sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk)); break; case SCTP_CMD_DISCARD_PACKET: /* We need to discard the whole packet. * Uncork the queue since there might be * responses pending */ chunk->pdiscard = 1; if (asoc) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } break; case SCTP_CMD_RTO_PENDING: t = cmd->obj.transport; t->rto_pending = 1; break; case SCTP_CMD_PART_DELIVER: asoc->stream.si->start_pd(&asoc->ulpq, GFP_ATOMIC); break; case SCTP_CMD_RENEGE: asoc->stream.si->renege_events(&asoc->ulpq, cmd->obj.chunk, GFP_ATOMIC); break; case SCTP_CMD_SETUP_T4: sctp_cmd_setup_t4(commands, asoc, cmd->obj.chunk); break; case SCTP_CMD_PROCESS_OPERR: sctp_cmd_process_operr(commands, asoc, chunk); break; case SCTP_CMD_CLEAR_INIT_TAG: asoc->peer.i.init_tag = 0; break; case SCTP_CMD_DEL_NON_PRIMARY: sctp_cmd_del_non_primary(asoc); break; case SCTP_CMD_T3_RTX_TIMERS_STOP: sctp_cmd_t3_rtx_timers_stop(commands, asoc); break; case SCTP_CMD_FORCE_PRIM_RETRAN: t = asoc->peer.retran_path; asoc->peer.retran_path = asoc->peer.primary_path; sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; asoc->peer.retran_path = t; break; case SCTP_CMD_SET_SK_ERR: sctp_cmd_set_sk_err(asoc, cmd->obj.error); break; case SCTP_CMD_ASSOC_CHANGE: sctp_cmd_assoc_change(commands, asoc, cmd->obj.u8); break; case SCTP_CMD_ADAPTATION_IND: sctp_cmd_adaptation_ind(commands, asoc); break; case SCTP_CMD_PEER_NO_AUTH: sctp_cmd_peer_no_auth(commands, asoc); break; case SCTP_CMD_ASSOC_SHKEY: error = sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC); break; case SCTP_CMD_UPDATE_INITTAG: asoc->peer.i.init_tag = cmd->obj.u32; break; case SCTP_CMD_SEND_MSG: if (!asoc->outqueue.cork) { sctp_outq_cork(&asoc->outqueue); local_cork = 1; } sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); break; case SCTP_CMD_PURGE_ASCONF_QUEUE: sctp_asconf_queue_teardown(asoc); break; case SCTP_CMD_SET_ASOC: if (asoc && local_cork) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } asoc = cmd->obj.asoc; break; default: pr_warn("Impossible command: %u\n", cmd->verb); break; } if (error) { cmd = sctp_next_cmd(commands); while (cmd) { if (cmd->verb == SCTP_CMD_REPLY) sctp_chunk_free(cmd->obj.chunk); cmd = sctp_next_cmd(commands); } break; } } /* If this is in response to a received chunk, wait until * we are done with the packet to open the queue so that we don't * send multiple packets in response to a single request. */ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { if (chunk->end_of_packet || chunk->singleton) sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) sctp_outq_uncork(&asoc->outqueue, gfp); if (sp->data_ready_signalled) sp->data_ready_signalled = 0; return error; } |
151 151 143 144 80 5 116 143 130 42 1 73 119 121 108 3 121 151 150 106 105 108 48 48 48 12 8 149 10 7 142 2 150 17 5 149 151 3 149 26 25 2 150 148 150 150 26 150 150 14 149 2 2 2 2 2 2 2 121 121 10 24 17 120 122 2 122 26 119 122 2 2 2 2 119 9 56 90 149 149 150 151 122 121 121 121 1 1 3 42 105 121 26 121 26 150 81 81 10 26 26 8 3 9 2 150 141 62 7 14 14 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * * This file is part of the SCTP kernel implementation * * These functions handle output processing. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@austin.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> #include <linux/time.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/init.h> #include <linux/slab.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/icmp.h> #include <net/net_namespace.h> #include <linux/socket.h> /* for sa_family_t */ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> /* Forward declarations for private helpers. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len); static void sctp_packet_reset(struct sctp_packet *packet) { /* sctp_packet_transmit() relies on this to reset size to the * current overhead after sending packets. */ packet->size = packet->overhead; packet->has_cookie_echo = 0; packet->has_sack = 0; packet->has_data = 0; packet->has_auth = 0; packet->ipfragok = 0; packet->auth = NULL; } /* Config a packet. * This appears to be a followup set of initializations. */ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_sock *sp = NULL; struct sock *sk; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; /* do the following jobs only once for a flush schedule */ if (!sctp_packet_empty(packet)) return; /* set packet max_size with pathmtu, then calculate overhead */ packet->max_size = tp->pathmtu; if (asoc) { sk = asoc->base.sk; sp = sctp_sk(sk); } packet->overhead = sctp_mtu_payload(sp, 0, 0); packet->size = packet->overhead; if (!asoc) return; /* update dst or transport pathmtu if in need */ if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); } else if (!sctp_transport_pl_enabled(tp) && asoc->param_flags & SPP_PMTUD_ENABLE) { if (!sctp_transport_pmtu_check(tp)) sctp_assoc_sync_pmtu(asoc); } if (asoc->pmtu_pending) { if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); asoc->pmtu_pending = 0; } /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ if (ecn_capable) { struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } if (!tp->dst) return; /* set packet max_size with gso_max_size if gso is enabled*/ rcu_read_lock(); if (__sk_dst_get(sk) != tp->dst) { dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } packet->max_size = sk_can_gso(sk) ? min(READ_ONCE(tp->dst->dev->gso_max_size), GSO_LEGACY_MAX_SIZE) : asoc->pathmtu; rcu_read_unlock(); } /* Initialize the packet structure. */ void sctp_packet_init(struct sctp_packet *packet, struct sctp_transport *transport, __u16 sport, __u16 dport) { pr_debug("%s: packet:%p transport:%p\n", __func__, packet, transport); packet->transport = transport; packet->source_port = sport; packet->destination_port = dport; INIT_LIST_HEAD(&packet->chunk_list); /* The overhead will be calculated by sctp_packet_config() */ packet->overhead = 0; sctp_packet_reset(packet); packet->vtag = 0; } /* Free a packet. */ void sctp_packet_free(struct sctp_packet *packet) { struct sctp_chunk *chunk, *tmp; pr_debug("%s: packet:%p\n", __func__, packet); list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } } /* This routine tries to append the chunk to the offered packet. If adding * the chunk causes the packet to exceed the path MTU and COOKIE_ECHO chunk * is not present in the packet, it transmits the input packet. * Data can be bundled with a packet containing a COOKIE_ECHO chunk as long * as it can fit in the packet, but any more data that does not fit in this * packet can be sent only after receiving the COOKIE_ACK. */ enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk, int one_packet, gfp_t gfp) { enum sctp_xmit retval; pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: if (!packet->has_cookie_echo) { int error = 0; error = sctp_packet_transmit(packet, gfp); if (error < 0) chunk->skb->sk->sk_err = -error; /* If we have an empty packet, then we can NOT ever * return PMTU_FULL. */ if (!one_packet) retval = sctp_packet_append_chunk(packet, chunk); } break; case SCTP_XMIT_RWND_FULL: case SCTP_XMIT_OK: case SCTP_XMIT_DELAY: break; } return retval; } /* Try to bundle a pad chunk into a packet with a heartbeat chunk for PLPMTUTD probe */ static enum sctp_xmit sctp_packet_bundle_pad(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_transport *t = pkt->transport; struct sctp_chunk *pad; int overhead = 0; if (!chunk->pmtu_probe) return SCTP_XMIT_OK; /* calculate the Padding Data size for the pad chunk */ overhead += sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); overhead += sizeof(struct sctp_sender_hb_info) + sizeof(struct sctp_pad_chunk); pad = sctp_make_pad(t->asoc, t->pl.probe_size - overhead); if (!pad) return SCTP_XMIT_DELAY; list_add_tail(&pad->list, &pkt->chunk_list); pkt->size += SCTP_PAD4(ntohs(pad->chunk_hdr->length)); chunk->transport = t; return SCTP_XMIT_OK; } /* Try to bundle an auth chunk into the packet. */ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_association *asoc = pkt->transport->asoc; enum sctp_xmit retval = SCTP_XMIT_OK; struct sctp_chunk *auth; /* if we don't have an association, we can't do authentication */ if (!asoc) return retval; /* See if this is an auth chunk we are bundling or if * auth is already bundled. */ if (chunk->chunk_hdr->type == SCTP_CID_AUTH || pkt->has_auth) return retval; /* if the peer did not request this chunk to be authenticated, * don't do it */ if (!chunk->auth) return retval; auth = sctp_make_auth(asoc, chunk->shkey->key_id); if (!auth) return retval; auth->shkey = chunk->shkey; sctp_auth_shkey_hold(auth->shkey); retval = __sctp_packet_append_chunk(pkt, auth); if (retval != SCTP_XMIT_OK) sctp_chunk_free(auth); return retval; } /* Try to bundle a SACK with the packet. */ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; /* If sending DATA and haven't aleady bundled a SACK, try to * bundle one in to the packet. */ if (sctp_chunk_is_data(chunk) && !pkt->has_sack && !pkt->has_cookie_echo) { struct sctp_association *asoc; struct timer_list *timer; asoc = pkt->transport->asoc; timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; /* If the SACK timer is running, we have a pending SACK */ if (timer_pending(timer)) { struct sctp_chunk *sack; if (pkt->transport->sack_generation != pkt->transport->asoc->peer.sack_generation) return retval; asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); if (sack) { retval = __sctp_packet_append_chunk(pkt, sack); if (retval != SCTP_XMIT_OK) { sctp_chunk_free(sack); goto out; } SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; if (del_timer(timer)) sctp_association_put(asoc); } } } out: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); enum sctp_xmit retval = SCTP_XMIT_OK; /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); if (retval != SCTP_XMIT_OK) goto finish; /* We believe that this chunk is OK to add to the packet */ switch (chunk->chunk_hdr->type) { case SCTP_CID_DATA: case SCTP_CID_I_DATA: /* Account for the data being in the packet */ sctp_packet_append_data(packet, chunk); /* Disallow SACK bundling after DATA. */ packet->has_sack = 1; /* Disallow AUTH bundling after DATA */ packet->has_auth = 1; /* Let it be knows that packet has DATA in it */ packet->has_data = 1; /* timestamp the chunk for rtx purposes */ chunk->sent_at = jiffies; /* Mainly used for prsctp RTX policy */ chunk->sent_count++; break; case SCTP_CID_COOKIE_ECHO: packet->has_cookie_echo = 1; break; case SCTP_CID_SACK: packet->has_sack = 1; if (chunk->asoc) chunk->asoc->stats.osacks++; break; case SCTP_CID_AUTH: packet->has_auth = 1; packet->auth = chunk; break; } /* It is OK to send this chunk. */ list_add_tail(&chunk->list, &packet->chunk_list); packet->size += chunk_len; chunk->transport = packet->transport; finish: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. */ enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk); /* Data chunks are special. Before seeing what else we can * bundle into this packet, check to see if we are allowed to * send this DATA. */ if (sctp_chunk_is_data(chunk)) { retval = sctp_packet_can_append_data(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; } /* Try to bundle AUTH chunk */ retval = sctp_packet_bundle_auth(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; /* Try to bundle SACK chunk */ retval = sctp_packet_bundle_sack(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = __sctp_packet_append_chunk(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = sctp_packet_bundle_pad(packet, chunk); finish: return retval; } static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) { if (SCTP_OUTPUT_CB(head)->last == head) skb_shinfo(head)->frag_list = skb; else SCTP_OUTPUT_CB(head)->last->next = skb; SCTP_OUTPUT_CB(head)->last = skb; head->truesize += skb->truesize; head->data_len += skb->len; head->len += skb->len; refcount_add(skb->truesize, &head->sk->sk_wmem_alloc); __skb_header_release(skb); } static int sctp_packet_pack(struct sctp_packet *packet, struct sk_buff *head, int gso, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_auth_chunk *auth = NULL; struct sctp_chunk *chunk, *tmp; int pkt_count = 0, pkt_size; struct sock *sk = head->sk; struct sk_buff *nskb; int auth_len = 0; if (gso) { skb_shinfo(head)->gso_type = sk->sk_gso_type; SCTP_OUTPUT_CB(head)->last = head; } else { nskb = head; pkt_size = packet->size; goto merge; } do { /* calculate the pkt_size and alloc nskb */ pkt_size = packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padded = SCTP_PAD4(chunk->skb->len); if (chunk == packet->auth) auth_len = padded; else if (auth_len + padded + packet->overhead > tp->pathmtu) return 0; else if (pkt_size + padded > tp->pathmtu) break; pkt_size += padded; } nskb = alloc_skb(pkt_size + MAX_HEADER, gfp); if (!nskb) return 0; skb_reserve(nskb, packet->overhead + MAX_HEADER); merge: /* merge chunks into nskb and append nskb into head list */ pkt_size -= packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padding; list_del_init(&chunk->list); if (sctp_chunk_is_data(chunk)) { if (!sctp_chunk_retransmitted(chunk) && !tp->rto_pending) { chunk->rtt_in_progress = 1; tp->rto_pending = 1; } } padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) skb_put_zero(chunk->skb, padding); if (chunk == packet->auth) auth = (struct sctp_auth_chunk *) skb_tail_pointer(nskb); skb_put_data(nskb, chunk->skb->data, chunk->skb->len); pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), chunk->has_tsn ? "TSN" : "No TSN", chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, ntohs(chunk->chunk_hdr->length), chunk->skb->len, chunk->rtt_in_progress); pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); if (!pkt_size) break; } if (auth) { sctp_auth_calculate_hmac(tp->asoc, nskb, auth, packet->auth->shkey, gfp); /* free auth if no more chunks, or add it back */ if (list_empty(&packet->chunk_list)) sctp_chunk_free(packet->auth); else list_add(&packet->auth->list, &packet->chunk_list); } if (gso) sctp_packet_gso_append(head, nskb); pkt_count++; } while (!list_empty(&packet->chunk_list)); if (gso) { memset(head->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); skb_shinfo(head)->gso_segs = pkt_count; skb_shinfo(head)->gso_size = GSO_BY_FRAGS; goto chksum; } if (sctp_checksum_disable) return 1; if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) || dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) { struct sctphdr *sh = (struct sctphdr *)skb_transport_header(head); sh->checksum = sctp_compute_cksum(head, 0); } else { chksum: head->ip_summed = CHECKSUM_PARTIAL; head->csum_not_inet = 1; head->csum_start = skb_transport_header(head) - head->head; head->csum_offset = offsetof(struct sctphdr, checksum); } return pkt_count; } /* All packets are sent to the network through this function from * sctp_outq_tail(). * * The return value is always 0 for now. */ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; struct sk_buff *head; struct sctphdr *sh; struct sock *sk; pr_debug("%s: packet:%p\n", __func__, packet); if (list_empty(&packet->chunk_list)) return 0; chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) { if (tp->pl.state == SCTP_PL_ERROR) { /* do IP fragmentation if in Error state */ packet->ipfragok = 1; } else { if (!sk_can_gso(sk)) { /* check gso */ pr_err_once("Trying to GSO but underlying device doesn't support it."); goto out; } gso = 1; } } /* alloc head skb */ head = alloc_skb((gso ? packet->overhead : packet->size) + MAX_HEADER, gfp); if (!head) goto out; skb_reserve(head, packet->overhead + MAX_HEADER); skb_set_owner_w(head, sk); /* set sctp header */ sh = skb_push(head, sizeof(struct sctphdr)); skb_reset_transport_header(head); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); sh->vtag = htonl(packet->vtag); sh->checksum = 0; /* drop packet if no dst */ if (!tp->dst) { IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(head); goto out; } /* pack up chunks */ pkt_count = sctp_packet_pack(packet, head, gso, gfp); if (!pkt_count) { kfree_skb(head); goto out; } pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); /* start autoclose timer */ if (packet->has_data && sctp_state(asoc, ESTABLISHED) && asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { struct timer_list *timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; unsigned long timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; if (!mod_timer(timer, jiffies + timeout)) sctp_association_hold(asoc); } /* sctp xmit */ tp->af_specific->ecn_capable(sk); if (asoc) { asoc->stats.opackets += pkt_count; if (asoc->peer.last_sent_to != tp) asoc->peer.last_sent_to = tp; } head->ignore_df = packet->ipfragok; if (tp->dst_pending_confirm) skb_set_dst_pending_confirm(head, 1); /* neighbour should be confirmed on successful transmission or * positive error */ if (tp->af_specific->sctp_xmit(head, tp) >= 0 && tp->dst_pending_confirm) tp->dst_pending_confirm = 0; out: list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); if (!sctp_chunk_is_data(chunk)) sctp_chunk_free(chunk); } sctp_packet_reset(packet); return 0; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This private function check to see if a chunk can be added */ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { size_t datasize, rwnd, inflight, flight_size; struct sctp_transport *transport = packet->transport; struct sctp_association *asoc = transport->asoc; struct sctp_outq *q = &asoc->outqueue; /* RFC 2960 6.1 Transmission of DATA Chunks * * A) At any given time, the data sender MUST NOT transmit new data to * any destination transport address if its peer's rwnd indicates * that the peer has no buffer space (i.e. rwnd is 0, see Section * 6.2.1). However, regardless of the value of rwnd (including if it * is 0), the data sender can always have one DATA chunk in flight to * the receiver if allowed by cwnd (see rule B below). This rule * allows the sender to probe for a change in rwnd that the sender * missed due to the SACK having been lost in transit from the data * receiver to the data sender. */ rwnd = asoc->peer.rwnd; inflight = q->outstanding_bytes; flight_size = transport->flight_size; datasize = sctp_data_size(chunk); if (datasize > rwnd && inflight > 0) /* We have (at least) one data chunk in flight, * so we can't fall back to rule 6.1 B). */ return SCTP_XMIT_RWND_FULL; /* RFC 2960 6.1 Transmission of DATA Chunks * * B) At any given time, the sender MUST NOT transmit new data * to a given transport address if it has cwnd or more bytes * of data outstanding to that transport address. */ /* RFC 7.2.4 & the Implementers Guide 2.8. * * 3) ... * When a Fast Retransmit is being performed the sender SHOULD * ignore the value of cwnd and SHOULD NOT delay retransmission. */ if (chunk->fast_retransmit != SCTP_NEED_FRTX && flight_size >= transport->cwnd) return SCTP_XMIT_RWND_FULL; /* Nagle's algorithm to solve small-packet problem: * Inhibit the sending of new chunks when new outgoing data arrives * if any previously transmitted data on the connection remains * unacknowledged. */ if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) && !asoc->force_delay) /* Nothing unacked */ return SCTP_XMIT_OK; if (!sctp_packet_empty(packet)) /* Append to packet */ return SCTP_XMIT_OK; if (!sctp_state(asoc, ESTABLISHED)) return SCTP_XMIT_OK; /* Check whether this chunk and all the rest of pending data will fit * or delay in hopes of bundling a full sized packet. */ if (chunk->skb->len + q->out_qlen > transport->pathmtu - packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4) /* Enough data queued to fill a packet */ return SCTP_XMIT_OK; /* Don't delay large message writes that may have been fragmented */ if (!chunk->msg->can_delay) return SCTP_XMIT_OK; /* Defer until all data acked or packet full */ return SCTP_XMIT_DELAY; } /* This private function does management things when adding DATA chunk */ static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { struct sctp_transport *transport = packet->transport; size_t datasize = sctp_data_size(chunk); struct sctp_association *asoc = transport->asoc; u32 rwnd = asoc->peer.rwnd; /* Keep track of how many bytes are in flight over this transport. */ transport->flight_size += datasize; /* Keep track of how many bytes are in flight to the receiver. */ asoc->outqueue.outstanding_bytes += datasize; /* Update our view of the receiver's rwnd. */ if (datasize < rwnd) rwnd -= datasize; else rwnd = 0; asoc->peer.rwnd = rwnd; sctp_chunk_assign_tsn(chunk); asoc->stream.si->assign_number(chunk); } static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { enum sctp_xmit retval = SCTP_XMIT_OK; size_t psize, pmtu, maxsize; /* Don't bundle in this packet if this chunk's auth key doesn't * match other chunks already enqueued on this packet. Also, * don't bundle the chunk with auth key if other chunks in this * packet don't have auth key. */ if ((packet->auth && chunk->shkey != packet->auth->shkey) || (!packet->auth && chunk->shkey && chunk->chunk_hdr->type != SCTP_CID_AUTH)) return SCTP_XMIT_PMTU_FULL; psize = packet->size; if (packet->transport->asoc) pmtu = packet->transport->asoc->pathmtu; else pmtu = packet->transport->pathmtu; /* Decide if we need to fragment or resubmit later. */ if (psize + chunk_len > pmtu) { /* It's OK to fragment at IP level if any one of the following * is true: * 1. The packet is empty (meaning this chunk is greater * the MTU) * 2. The packet doesn't have any data in it yet and data * requires authentication. */ if (sctp_packet_empty(packet) || (!packet->has_data && chunk->auth)) { /* We no longer do re-fragmentation. * Just fragment at the IP layer, if we * actually hit this condition */ packet->ipfragok = 1; goto out; } /* Similarly, if this chunk was built before a PMTU * reduction, we have to fragment it at IP level now. So * if the packet already contains something, we need to * flush. */ maxsize = pmtu - packet->overhead; if (packet->auth) maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; /* It is also okay to fragment if the chunk we are * adding is a control chunk, but only if current packet * is not a GSO one otherwise it causes fragmentation of * a large frame. So in this case we allow the * fragmentation by forcing it to be in a new packet. */ if (!sctp_chunk_is_data(chunk) && packet->has_data) retval = SCTP_XMIT_PMTU_FULL; if (psize + chunk_len > packet->max_size) /* Hit GSO/PMTU limit, gotta flush */ retval = SCTP_XMIT_PMTU_FULL; if (!packet->transport->burst_limited && psize + chunk_len > (packet->transport->cwnd >> 1)) /* Do not allow a single GSO packet to use more * than half of cwnd. */ retval = SCTP_XMIT_PMTU_FULL; if (packet->transport->burst_limited && psize + chunk_len > (packet->transport->burst_limited >> 1)) /* Do not allow a single GSO packet to use more * than half of original cwnd. */ retval = SCTP_XMIT_PMTU_FULL; /* Otherwise it will fit in the GSO packet */ } out: return retval; } |
2 2 2 2 1 1 6 4 2 1 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007-2008 BalaBit IT Ltd. * Author: Krisztian Kovacs */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <net/tcp.h> #include <net/udp.h> #include <net/icmp.h> #include <net/sock.h> #include <net/inet_sock.h> #include <net/netfilter/nf_socket.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif static int extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, __be32 *raddr, __be32 *laddr, __be16 *rport, __be16 *lport) { unsigned int outside_hdrlen = ip_hdrlen(skb); struct iphdr *inside_iph, _inside_iph; struct icmphdr *icmph, _icmph; __be16 *ports, _ports[2]; icmph = skb_header_pointer(skb, outside_hdrlen, sizeof(_icmph), &_icmph); if (icmph == NULL) return 1; if (!icmp_is_err(icmph->type)) return 1; inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(struct icmphdr), sizeof(_inside_iph), &_inside_iph); if (inside_iph == NULL) return 1; if (inside_iph->protocol != IPPROTO_TCP && inside_iph->protocol != IPPROTO_UDP) return 1; ports = skb_header_pointer(skb, outside_hdrlen + sizeof(struct icmphdr) + (inside_iph->ihl << 2), sizeof(_ports), &_ports); if (ports == NULL) return 1; /* the inside IP packet is the one quoted from our side, thus * its saddr is the local address */ *protocol = inside_iph->protocol; *laddr = inside_iph->saddr; *lport = ports[0]; *raddr = inside_iph->daddr; *rport = ports[1]; return 0; } static struct sock * nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, const struct net_device *in) { switch (protocol) { case IPPROTO_TCP: return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); } return NULL; } struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, const struct net_device *indev) { __be32 daddr, saddr; __be16 dport, sport; const struct iphdr *iph = ip_hdr(skb); struct sk_buff *data_skb = NULL; u8 protocol; #if IS_ENABLED(CONFIG_NF_CONNTRACK) enum ip_conntrack_info ctinfo; struct nf_conn const *ct; #endif int doff = 0; if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { struct tcphdr _hdr; struct udphdr *hp; hp = skb_header_pointer(skb, ip_hdrlen(skb), iph->protocol == IPPROTO_UDP ? sizeof(*hp) : sizeof(_hdr), &_hdr); if (hp == NULL) return NULL; protocol = iph->protocol; saddr = iph->saddr; sport = hp->source; daddr = iph->daddr; dport = hp->dest; data_skb = (struct sk_buff *)skb; doff = iph->protocol == IPPROTO_TCP ? ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) : ip_hdrlen(skb) + sizeof(*hp); } else if (iph->protocol == IPPROTO_ICMP) { if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, &sport, &dport)) return NULL; } else { return NULL; } #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Do the lookup with the original socket address in * case this is a reply packet of an established * SNAT-ted connection. */ ct = nf_ct_get(skb, &ctinfo); if (ct && ((iph->protocol != IPPROTO_ICMP && ctinfo == IP_CT_ESTABLISHED_REPLY) || (iph->protocol == IPPROTO_ICMP && ctinfo == IP_CT_RELATED_REPLY)) && (ct->status & IPS_SRC_NAT_DONE)) { daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; dport = (iph->protocol == IPPROTO_TCP) ? ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; } #endif return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr, daddr, sport, dport, indev); } EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); MODULE_DESCRIPTION("Netfilter IPv4 socket lookup infrastructure"); |
677 683 1 1 377 379 673 674 673 696 696 675 387 681 679 382 384 381 379 384 341 675 677 675 673 677 372 339 678 683 676 665 11 379 379 376 380 380 378 678 5 1 257 451 675 474 469 5 36 36 386 383 386 383 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 | // SPDX-License-Identifier: GPL-2.0 /* * KFENCE guarded object allocator and fault handling. * * Copyright (C) 2020, Google LLC. */ #define pr_fmt(fmt) "kfence: " fmt #include <linux/atomic.h> #include <linux/bug.h> #include <linux/debugfs.h> #include <linux/hash.h> #include <linux/irq_work.h> #include <linux/jhash.h> #include <linux/kcsan-checks.h> #include <linux/kfence.h> #include <linux/kmemleak.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/log2.h> #include <linux/memblock.h> #include <linux/moduleparam.h> #include <linux/notifier.h> #include <linux/panic_notifier.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/sched/clock.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> #include <asm/kfence.h> #include "kfence.h" /* Disables KFENCE on the first warning assuming an irrecoverable error. */ #define KFENCE_WARN_ON(cond) \ ({ \ const bool __cond = WARN_ON(cond); \ if (unlikely(__cond)) { \ WRITE_ONCE(kfence_enabled, false); \ disabled_by_warn = true; \ } \ __cond; \ }) /* === Data ================================================================= */ static bool kfence_enabled __read_mostly; static bool disabled_by_warn __read_mostly; unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */ #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif #define MODULE_PARAM_PREFIX "kfence." static int kfence_enable_late(void); static int param_set_sample_interval(const char *val, const struct kernel_param *kp) { unsigned long num; int ret = kstrtoul(val, 0, &num); if (ret < 0) return ret; /* Using 0 to indicate KFENCE is disabled. */ if (!num && READ_ONCE(kfence_enabled)) { pr_info("disabled\n"); WRITE_ONCE(kfence_enabled, false); } *((unsigned long *)kp->arg) = num; if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) return disabled_by_warn ? -EINVAL : kfence_enable_late(); return 0; } static int param_get_sample_interval(char *buffer, const struct kernel_param *kp) { if (!READ_ONCE(kfence_enabled)) return sprintf(buffer, "0\n"); return param_get_ulong(buffer, kp); } static const struct kernel_param_ops sample_interval_param_ops = { .set = param_set_sample_interval, .get = param_get_sample_interval, }; module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); /* Pool usage% threshold when currently covered allocations are skipped. */ static unsigned long kfence_skip_covered_thresh __read_mostly = 75; module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); /* Allocation burst count: number of excess KFENCE allocations per sample. */ static unsigned int kfence_burst __read_mostly; module_param_named(burst, kfence_burst, uint, 0644); /* If true, use a deferrable timer. */ static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE); module_param_named(deferrable, kfence_deferrable, bool, 0444); /* If true, check all canary bytes on panic. */ static bool kfence_check_on_panic __read_mostly; module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444); /* The pool of pages used for guard pages and objects. */ char *__kfence_pool __read_mostly; EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ /* * Per-object metadata, with one-to-one mapping of object metadata to * backing pages (in __kfence_pool). */ static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); struct kfence_metadata *kfence_metadata __read_mostly; /* * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache(). * So introduce kfence_metadata_init to initialize metadata, and then make * kfence_metadata visible after initialization is successful. This prevents * potential UAF or access to uninitialized metadata. */ static struct kfence_metadata *kfence_metadata_init __read_mostly; /* Freelist with available objects. */ static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ /* * The static key to set up a KFENCE allocation; or if static keys are not used * to gate allocations, to avoid a load and compare if KFENCE is disabled. */ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); /* Gates the allocation, ensuring only one succeeds in a given period. */ atomic_t kfence_allocation_gate = ATOMIC_INIT(1); /* * A Counting Bloom filter of allocation coverage: limits currently covered * allocations of the same source filling up the pool. * * Assuming a range of 15%-85% unique allocations in the pool at any point in * time, the below parameters provide a probablity of 0.02-0.33 for false * positive hits respectively: * * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM */ #define ALLOC_COVERED_HNUM 2 #define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2) #define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER) #define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER) #define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1) static atomic_t alloc_covered[ALLOC_COVERED_SIZE]; /* Stack depth used to determine uniqueness of an allocation. */ #define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8) /* * Randomness for stack hashes, making the same collisions across reboots and * different machines less likely. */ static u32 stack_hash_seed __ro_after_init; /* Statistics counters for debugfs. */ enum kfence_counter_id { KFENCE_COUNTER_ALLOCATED, KFENCE_COUNTER_ALLOCS, KFENCE_COUNTER_FREES, KFENCE_COUNTER_ZOMBIES, KFENCE_COUNTER_BUGS, KFENCE_COUNTER_SKIP_INCOMPAT, KFENCE_COUNTER_SKIP_CAPACITY, KFENCE_COUNTER_SKIP_COVERED, KFENCE_COUNTER_COUNT, }; static atomic_long_t counters[KFENCE_COUNTER_COUNT]; static const char *const counter_names[] = { [KFENCE_COUNTER_ALLOCATED] = "currently allocated", [KFENCE_COUNTER_ALLOCS] = "total allocations", [KFENCE_COUNTER_FREES] = "total frees", [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", [KFENCE_COUNTER_BUGS] = "total bugs", [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)", [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)", [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)", }; static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); /* === Internals ============================================================ */ static inline bool should_skip_covered(void) { unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100; return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh; } static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries) { num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH); num_entries = filter_irq_stacks(stack_entries, num_entries); return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed); } /* * Adds (or subtracts) count @val for allocation stack trace hash * @alloc_stack_hash from Counting Bloom filter. */ static void alloc_covered_add(u32 alloc_stack_hash, int val) { int i; for (i = 0; i < ALLOC_COVERED_HNUM; i++) { atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]); alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); } } /* * Returns true if the allocation stack trace hash @alloc_stack_hash is * currently contained (non-zero count) in Counting Bloom filter. */ static bool alloc_covered_contains(u32 alloc_stack_hash) { int i; for (i = 0; i < ALLOC_COVERED_HNUM; i++) { if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK])) return false; alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); } return true; } static bool kfence_protect(unsigned long addr) { return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); } static bool kfence_unprotect(unsigned long addr) { return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); } static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) { unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; unsigned long pageaddr = (unsigned long)&__kfence_pool[offset]; /* The checks do not affect performance; only called from slow-paths. */ /* Only call with a pointer into kfence_metadata. */ if (KFENCE_WARN_ON(meta < kfence_metadata || meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS)) return 0; /* * This metadata object only ever maps to 1 page; verify that the stored * address is in the expected range. */ if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr)) return 0; return pageaddr; } static inline bool kfence_obj_allocated(const struct kfence_metadata *meta) { enum kfence_object_state state = READ_ONCE(meta->state); return state == KFENCE_OBJECT_ALLOCATED || state == KFENCE_OBJECT_RCU_FREEING; } /* * Update the object's metadata state, including updating the alloc/free stacks * depending on the state transition. */ static noinline void metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next, unsigned long *stack_entries, size_t num_stack_entries) { struct kfence_track *track = next == KFENCE_OBJECT_ALLOCATED ? &meta->alloc_track : &meta->free_track; lockdep_assert_held(&meta->lock); /* Stack has been saved when calling rcu, skip. */ if (READ_ONCE(meta->state) == KFENCE_OBJECT_RCU_FREEING) goto out; if (stack_entries) { memcpy(track->stack_entries, stack_entries, num_stack_entries * sizeof(stack_entries[0])); } else { /* * Skip over 1 (this) functions; noinline ensures we do not * accidentally skip over the caller by never inlining. */ num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); } track->num_stack_entries = num_stack_entries; track->pid = task_pid_nr(current); track->cpu = raw_smp_processor_id(); track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ out: /* * Pairs with READ_ONCE() in * kfence_shutdown_cache(), * kfence_handle_page_fault(). */ WRITE_ONCE(meta->state, next); } #ifdef CONFIG_KMSAN #define check_canary_attributes noinline __no_kmsan_checks #else #define check_canary_attributes inline #endif /* Check canary byte at @addr. */ static check_canary_attributes bool check_canary_byte(u8 *addr) { struct kfence_metadata *meta; unsigned long flags; if (likely(*addr == KFENCE_CANARY_PATTERN_U8(addr))) return true; atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); meta = addr_to_metadata((unsigned long)addr); raw_spin_lock_irqsave(&meta->lock, flags); kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); raw_spin_unlock_irqrestore(&meta->lock, flags); return false; } static inline void set_canary(const struct kfence_metadata *meta) { const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); unsigned long addr = pageaddr; /* * The canary may be written to part of the object memory, but it does * not affect it. The user should initialize the object before using it. */ for (; addr < meta->addr; addr += sizeof(u64)) *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64; addr = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64)); for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) *((u64 *)addr) = KFENCE_CANARY_PATTERN_U64; } static check_canary_attributes void check_canary(const struct kfence_metadata *meta) { const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); unsigned long addr = pageaddr; /* * We'll iterate over each canary byte per-side until a corrupted byte * is found. However, we'll still iterate over the canary bytes to the * right of the object even if there was an error in the canary bytes to * the left of the object. Specifically, if check_canary_byte() * generates an error, showing both sides might give more clues as to * what the error is about when displaying which bytes were corrupted. */ /* Apply to left of object. */ for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) { if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) break; } /* * If the canary is corrupted in a certain 64 bytes, or the canary * memory cannot be completely covered by multiple consecutive 64 bytes, * it needs to be checked one by one. */ for (; addr < meta->addr; addr++) { if (unlikely(!check_canary_byte((u8 *)addr))) break; } /* Apply to right of object. */ for (addr = meta->addr + meta->size; addr % sizeof(u64) != 0; addr++) { if (unlikely(!check_canary_byte((u8 *)addr))) return; } for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) { if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) { for (; addr - pageaddr < PAGE_SIZE; addr++) { if (!check_canary_byte((u8 *)addr)) return; } } } } static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, unsigned long *stack_entries, size_t num_stack_entries, u32 alloc_stack_hash) { struct kfence_metadata *meta = NULL; unsigned long flags; struct slab *slab; void *addr; const bool random_right_allocate = get_random_u32_below(2); const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS && !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS); /* Try to obtain a free object. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); if (!list_empty(&kfence_freelist)) { meta = list_entry(kfence_freelist.next, struct kfence_metadata, list); list_del_init(&meta->list); } raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); if (!meta) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]); return NULL; } if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { /* * This is extremely unlikely -- we are reporting on a * use-after-free, which locked meta->lock, and the reporting * code via printk calls kmalloc() which ends up in * kfence_alloc() and tries to grab the same object that we're * reporting on. While it has never been observed, lockdep does * report that there is a possibility of deadlock. Fix it by * using trylock and bailing out gracefully. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); /* Put the object back on the freelist. */ list_add_tail(&meta->list, &kfence_freelist); raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); return NULL; } meta->addr = metadata_to_pageaddr(meta); /* Unprotect if we're reusing this page. */ if (meta->state == KFENCE_OBJECT_FREED) kfence_unprotect(meta->addr); /* * Note: for allocations made before RNG initialization, will always * return zero. We still benefit from enabling KFENCE as early as * possible, even when the RNG is not yet available, as this will allow * KFENCE to detect bugs due to earlier allocations. The only downside * is that the out-of-bounds accesses detected are deterministic for * such allocations. */ if (random_right_allocate) { /* Allocate on the "right" side, re-calculate address. */ meta->addr += PAGE_SIZE - size; meta->addr = ALIGN_DOWN(meta->addr, cache->align); } addr = (void *)meta->addr; /* Update remaining metadata. */ metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries); /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */ WRITE_ONCE(meta->cache, cache); meta->size = size; meta->alloc_stack_hash = alloc_stack_hash; raw_spin_unlock_irqrestore(&meta->lock, flags); alloc_covered_add(alloc_stack_hash, 1); /* Set required slab fields. */ slab = virt_to_slab((void *)meta->addr); slab->slab_cache = cache; slab->objects = 1; /* Memory initialization. */ set_canary(meta); /* * We check slab_want_init_on_alloc() ourselves, rather than letting * SL*B do the initialization, as otherwise we might overwrite KFENCE's * redzone. */ if (unlikely(slab_want_init_on_alloc(gfp, cache))) memzero_explicit(addr, size); if (cache->ctor) cache->ctor(addr); if (random_fault) kfence_protect(meta->addr); /* Random "faults" by protecting the object. */ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]); return addr; } static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie) { struct kcsan_scoped_access assert_page_exclusive; unsigned long flags; bool init; raw_spin_lock_irqsave(&meta->lock, flags); if (!kfence_obj_allocated(meta) || meta->addr != (unsigned long)addr) { /* Invalid or double-free, bail out. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_INVALID_FREE); raw_spin_unlock_irqrestore(&meta->lock, flags); return; } /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */ kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE, KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT, &assert_page_exclusive); if (CONFIG_KFENCE_STRESS_TEST_FAULTS) kfence_unprotect((unsigned long)addr); /* To check canary bytes. */ /* Restore page protection if there was an OOB access. */ if (meta->unprotected_page) { memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); kfence_protect(meta->unprotected_page); meta->unprotected_page = 0; } /* Mark the object as freed. */ metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0); init = slab_want_init_on_free(meta->cache); raw_spin_unlock_irqrestore(&meta->lock, flags); alloc_covered_add(meta->alloc_stack_hash, -1); /* Check canary bytes for memory corruption. */ check_canary(meta); /* * Clear memory if init-on-free is set. While we protect the page, the * data is still there, and after a use-after-free is detected, we * unprotect the page, so the data is still accessible. */ if (!zombie && unlikely(init)) memzero_explicit(addr, meta->size); /* Protect to detect use-after-frees. */ kfence_protect((unsigned long)addr); kcsan_end_scoped_access(&assert_page_exclusive); if (!zombie) { /* Add it to the tail of the freelist for reuse. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); KFENCE_WARN_ON(!list_empty(&meta->list)); list_add_tail(&meta->list, &kfence_freelist); raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]); atomic_long_inc(&counters[KFENCE_COUNTER_FREES]); } else { /* See kfence_shutdown_cache(). */ atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]); } } static void rcu_guarded_free(struct rcu_head *h) { struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head); kfence_guarded_free((void *)meta->addr, meta, false); } /* * Initialization of the KFENCE pool after its allocation. * Returns 0 on success; otherwise returns the address up to * which partial initialization succeeded. */ static unsigned long kfence_init_pool(void) { unsigned long addr; struct page *pages; int i; if (!arch_kfence_init_pool()) return (unsigned long)__kfence_pool; addr = (unsigned long)__kfence_pool; pages = virt_to_page(__kfence_pool); /* * Set up object pages: they must have PG_slab set, to avoid freeing * these as real pages. * * We also want to avoid inserting kfence_free() in the kfree() * fast-path in SLUB, and therefore need to ensure kfree() correctly * enters __slab_free() slow-path. */ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { struct slab *slab = page_slab(nth_page(pages, i)); if (!i || (i % 2)) continue; __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | MEMCG_DATA_OBJEXTS; #endif } /* * Protect the first 2 pages. The first page is mostly unnecessary, and * merely serves as an extended guard page. However, adding one * additional page in the beginning gives us an even number of pages, * which simplifies the mapping of address to metadata index. */ for (i = 0; i < 2; i++) { if (unlikely(!kfence_protect(addr))) return addr; addr += PAGE_SIZE; } for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { struct kfence_metadata *meta = &kfence_metadata_init[i]; /* Initialize metadata. */ INIT_LIST_HEAD(&meta->list); raw_spin_lock_init(&meta->lock); meta->state = KFENCE_OBJECT_UNUSED; meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */ list_add_tail(&meta->list, &kfence_freelist); /* Protect the right redzone. */ if (unlikely(!kfence_protect(addr + PAGE_SIZE))) goto reset_slab; addr += 2 * PAGE_SIZE; } /* * Make kfence_metadata visible only when initialization is successful. * Otherwise, if the initialization fails and kfence_metadata is freed, * it may cause UAF in kfence_shutdown_cache(). */ smp_store_release(&kfence_metadata, kfence_metadata_init); return 0; reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { struct slab *slab = page_slab(nth_page(pages, i)); if (!i || (i % 2)) continue; #ifdef CONFIG_MEMCG slab->obj_exts = 0; #endif __folio_clear_slab(slab_folio(slab)); } return addr; } static bool __init kfence_init_pool_early(void) { unsigned long addr; if (!__kfence_pool) return false; addr = kfence_init_pool(); if (!addr) { /* * The pool is live and will never be deallocated from this point on. * Ignore the pool object from the kmemleak phys object tree, as it would * otherwise overlap with allocations returned by kfence_alloc(), which * are registered with kmemleak through the slab post-alloc hook. */ kmemleak_ignore_phys(__pa(__kfence_pool)); return true; } /* * Only release unprotected pages, and do not try to go back and change * page attributes due to risk of failing to do so as well. If changing * page attributes for some pages fails, it is very likely that it also * fails for the first page, and therefore expect addr==__kfence_pool in * most failure cases. */ memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE); kfence_metadata_init = NULL; return false; } /* === DebugFS Interface ==================================================== */ static int stats_show(struct seq_file *seq, void *v) { int i; seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled)); for (i = 0; i < KFENCE_COUNTER_COUNT; i++) seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); return 0; } DEFINE_SHOW_ATTRIBUTE(stats); /* * debugfs seq_file operations for /sys/kernel/debug/kfence/objects. * start_object() and next_object() return the object index + 1, because NULL is used * to stop iteration. */ static void *start_object(struct seq_file *seq, loff_t *pos) { if (*pos < CONFIG_KFENCE_NUM_OBJECTS) return (void *)((long)*pos + 1); return NULL; } static void stop_object(struct seq_file *seq, void *v) { } static void *next_object(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; if (*pos < CONFIG_KFENCE_NUM_OBJECTS) return (void *)((long)*pos + 1); return NULL; } static int show_object(struct seq_file *seq, void *v) { struct kfence_metadata *meta = &kfence_metadata[(long)v - 1]; unsigned long flags; raw_spin_lock_irqsave(&meta->lock, flags); kfence_print_object(seq, meta); raw_spin_unlock_irqrestore(&meta->lock, flags); seq_puts(seq, "---------------------------------\n"); return 0; } static const struct seq_operations objects_sops = { .start = start_object, .next = next_object, .stop = stop_object, .show = show_object, }; DEFINE_SEQ_ATTRIBUTE(objects); static int kfence_debugfs_init(void) { struct dentry *kfence_dir; if (!READ_ONCE(kfence_enabled)) return 0; kfence_dir = debugfs_create_dir("kfence", NULL); debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); return 0; } late_initcall(kfence_debugfs_init); /* === Panic Notifier ====================================================== */ static void kfence_check_all_canary(void) { int i; for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { struct kfence_metadata *meta = &kfence_metadata[i]; if (kfence_obj_allocated(meta)) check_canary(meta); } } static int kfence_check_canary_callback(struct notifier_block *nb, unsigned long reason, void *arg) { kfence_check_all_canary(); return NOTIFY_OK; } static struct notifier_block kfence_check_canary_notifier = { .notifier_call = kfence_check_canary_callback, }; /* === Allocation Gate Timer ================================================ */ static struct delayed_work kfence_timer; #ifdef CONFIG_KFENCE_STATIC_KEYS /* Wait queue to wake up allocation-gate timer task. */ static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); static void wake_up_kfence_timer(struct irq_work *work) { wake_up(&allocation_wait); } static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); #endif /* * Set up delayed work, which will enable and disable the static key. We need to * use a work queue (rather than a simple timer), since enabling and disabling a * static key cannot be done from an interrupt. * * Note: Toggling a static branch currently causes IPIs, and here we'll end up * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with * more aggressive sampling intervals), we could get away with a variant that * avoids IPIs, at the cost of not immediately capturing allocations if the * instructions remain cached. */ static void toggle_allocation_gate(struct work_struct *work) { if (!READ_ONCE(kfence_enabled)) return; atomic_set(&kfence_allocation_gate, -kfence_burst); #ifdef CONFIG_KFENCE_STATIC_KEYS /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate) > 0); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif queue_delayed_work(system_unbound_wq, &kfence_timer, msecs_to_jiffies(kfence_sample_interval)); } /* === Public interface ===================================================== */ void __init kfence_alloc_pool_and_metadata(void) { if (!kfence_sample_interval) return; /* * If the pool has already been initialized by arch, there is no need to * re-allocate the memory pool. */ if (!__kfence_pool) __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); if (!__kfence_pool) { pr_err("failed to allocate pool\n"); return; } /* The memory allocated by memblock has been zeroed out. */ kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE); if (!kfence_metadata_init) { pr_err("failed to allocate metadata\n"); memblock_free(__kfence_pool, KFENCE_POOL_SIZE); __kfence_pool = NULL; } } static void kfence_init_enable(void) { if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS)) static_branch_enable(&kfence_allocation_key); if (kfence_deferrable) INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate); else INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate); if (kfence_check_on_panic) atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier); WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, (void *)(__kfence_pool + KFENCE_POOL_SIZE)); } void __init kfence_init(void) { stack_hash_seed = get_random_u32(); /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */ if (!kfence_sample_interval) return; if (!kfence_init_pool_early()) { pr_err("%s failed\n", __func__); return; } kfence_init_enable(); } static int kfence_init_late(void) { const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE; const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE; unsigned long addr = (unsigned long)__kfence_pool; unsigned long free_size = KFENCE_POOL_SIZE; int err = -ENOMEM; #ifdef CONFIG_CONTIG_ALLOC struct page *pages; pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node, NULL); if (!pages) return -ENOMEM; __kfence_pool = page_to_virt(pages); pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node, NULL); if (pages) kfence_metadata_init = page_to_virt(pages); #else if (nr_pages_pool > MAX_ORDER_NR_PAGES || nr_pages_meta > MAX_ORDER_NR_PAGES) { pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n"); return -EINVAL; } __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL); if (!__kfence_pool) return -ENOMEM; kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL); #endif if (!kfence_metadata_init) goto free_pool; memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE); addr = kfence_init_pool(); if (!addr) { kfence_init_enable(); kfence_debugfs_init(); return 0; } pr_err("%s failed\n", __func__); free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); err = -EBUSY; #ifdef CONFIG_CONTIG_ALLOC free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)), nr_pages_meta); free_pool: free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE); #else free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE); free_pool: free_pages_exact((void *)addr, free_size); #endif kfence_metadata_init = NULL; __kfence_pool = NULL; return err; } static int kfence_enable_late(void) { if (!__kfence_pool) return kfence_init_late(); WRITE_ONCE(kfence_enabled, true); queue_delayed_work(system_unbound_wq, &kfence_timer, 0); pr_info("re-enabled\n"); return 0; } void kfence_shutdown_cache(struct kmem_cache *s) { unsigned long flags; struct kfence_metadata *meta; int i; /* Pairs with release in kfence_init_pool(). */ if (!smp_load_acquire(&kfence_metadata)) return; for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { bool in_use; meta = &kfence_metadata[i]; /* * If we observe some inconsistent cache and state pair where we * should have returned false here, cache destruction is racing * with either kmem_cache_alloc() or kmem_cache_free(). Taking * the lock will not help, as different critical section * serialization will have the same outcome. */ if (READ_ONCE(meta->cache) != s || !kfence_obj_allocated(meta)) continue; raw_spin_lock_irqsave(&meta->lock, flags); in_use = meta->cache == s && kfence_obj_allocated(meta); raw_spin_unlock_irqrestore(&meta->lock, flags); if (in_use) { /* * This cache still has allocations, and we should not * release them back into the freelist so they can still * safely be used and retain the kernel's default * behaviour of keeping the allocations alive (leak the * cache); however, they effectively become "zombie * allocations" as the KFENCE objects are the only ones * still in use and the owning cache is being destroyed. * * We mark them freed, so that any subsequent use shows * more useful error messages that will include stack * traces of the user of the object, the original * allocation, and caller to shutdown_cache(). */ kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true); } } for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { meta = &kfence_metadata[i]; /* See above. */ if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED) continue; raw_spin_lock_irqsave(&meta->lock, flags); if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED) meta->cache = NULL; raw_spin_unlock_irqrestore(&meta->lock, flags); } } void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { unsigned long stack_entries[KFENCE_STACK_DEPTH]; size_t num_stack_entries; u32 alloc_stack_hash; int allocation_gate; /* * Perform size check before switching kfence_allocation_gate, so that * we don't disable KFENCE without making an allocation. */ if (size > PAGE_SIZE) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; } /* * Skip allocations from non-default zones, including DMA. We cannot * guarantee that pages in the KFENCE pool will have the requested * properties (e.g. reside in DMAable memory). */ if ((flags & GFP_ZONEMASK) || (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); return NULL; } /* * Skip allocations for this slab, if KFENCE has been disabled for * this slab. */ if (s->flags & SLAB_SKIP_KFENCE) return NULL; allocation_gate = atomic_inc_return(&kfence_allocation_gate); if (allocation_gate > 1) return NULL; #ifdef CONFIG_KFENCE_STATIC_KEYS /* * waitqueue_active() is fully ordered after the update of * kfence_allocation_gate per atomic_inc_return(). */ if (allocation_gate == 1 && waitqueue_active(&allocation_wait)) { /* * Calling wake_up() here may deadlock when allocations happen * from within timer code. Use an irq_work to defer it. */ irq_work_queue(&wake_up_kfence_timer_work); } #endif if (!READ_ONCE(kfence_enabled)) return NULL; num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0); /* * Do expensive check for coverage of allocation in slow-path after * allocation_gate has already become non-zero, even though it might * mean not making any allocation within a given sample interval. * * This ensures reasonable allocation coverage when the pool is almost * full, including avoiding long-lived allocations of the same source * filling up the pool (e.g. pagecache allocations). */ alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries); if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) { atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]); return NULL; } return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries, alloc_stack_hash); } size_t kfence_ksize(const void *addr) { const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); /* * Read locklessly -- if there is a race with __kfence_alloc(), this is * either a use-after-free or invalid access. */ return meta ? meta->size : 0; } void *kfence_object_start(const void *addr) { const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); /* * Read locklessly -- if there is a race with __kfence_alloc(), this is * either a use-after-free or invalid access. */ return meta ? (void *)meta->addr : NULL; } void __kfence_free(void *addr) { struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); #ifdef CONFIG_MEMCG KFENCE_WARN_ON(meta->obj_exts.objcg); #endif /* * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing * the object, as the object page may be recycled for other-typed * objects once it has been freed. meta->cache may be NULL if the cache * was destroyed. * Save the stack trace here so that reports show where the user freed * the object. */ if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) { unsigned long flags; raw_spin_lock_irqsave(&meta->lock, flags); metadata_update_state(meta, KFENCE_OBJECT_RCU_FREEING, NULL, 0); raw_spin_unlock_irqrestore(&meta->lock, flags); call_rcu(&meta->rcu_head, rcu_guarded_free); } else { kfence_guarded_free(addr, meta, false); } } bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) { const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; struct kfence_metadata *to_report = NULL; enum kfence_error_type error_type; unsigned long flags; if (!is_kfence_address((void *)addr)) return false; if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */ return kfence_unprotect(addr); /* ... unprotect and proceed. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); if (page_index % 2) { /* This is a redzone, report a buffer overflow. */ struct kfence_metadata *meta; int distance = 0; meta = addr_to_metadata(addr - PAGE_SIZE); if (meta && kfence_obj_allocated(meta)) { to_report = meta; /* Data race ok; distance calculation approximate. */ distance = addr - data_race(meta->addr + meta->size); } meta = addr_to_metadata(addr + PAGE_SIZE); if (meta && kfence_obj_allocated(meta)) { /* Data race ok; distance calculation approximate. */ if (!to_report || distance > data_race(meta->addr) - addr) to_report = meta; } if (!to_report) goto out; raw_spin_lock_irqsave(&to_report->lock, flags); to_report->unprotected_page = addr; error_type = KFENCE_ERROR_OOB; /* * If the object was freed before we took the look we can still * report this as an OOB -- the report will simply show the * stacktrace of the free as well. */ } else { to_report = addr_to_metadata(addr); if (!to_report) goto out; raw_spin_lock_irqsave(&to_report->lock, flags); error_type = KFENCE_ERROR_UAF; /* * We may race with __kfence_alloc(), and it is possible that a * freed object may be reallocated. We simply report this as a * use-after-free, with the stack trace showing the place where * the object was re-allocated. */ } out: if (to_report) { kfence_report_error(addr, is_write, regs, to_report, error_type); raw_spin_unlock_irqrestore(&to_report->lock, flags); } else { /* This may be a UAF or OOB access, but we can't be sure. */ kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); } return kfence_unprotect(addr); /* Unprotect and let access proceed. */ } |
9122 9115 9122 9112 9188 9119 9123 9113 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 | // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/memblock.h> #include <linux/page_ext.h> #include <linux/memory.h> #include <linux/vmalloc.h> #include <linux/kmemleak.h> #include <linux/page_owner.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate.h> #include <linux/pgalloc_tag.h> /* * struct page extension * * This is the feature to manage memory for extended data per page. * * Until now, we must modify struct page itself to store extra data per page. * This requires rebuilding the kernel and it is really time consuming process. * And, sometimes, rebuild is impossible due to third party module dependency. * At last, enlarging struct page could cause un-wanted system behaviour change. * * This feature is intended to overcome above mentioned problems. This feature * allocates memory for extended data per page in certain place rather than * the struct page itself. This memory can be accessed by the accessor * functions provided by this code. During the boot process, it checks whether * allocation of huge chunk of memory is needed or not. If not, it avoids * allocating memory at all. With this advantage, we can include this feature * into the kernel in default and can avoid rebuild and solve related problems. * * To help these things to work well, there are two callbacks for clients. One * is the need callback which is mandatory if user wants to avoid useless * memory allocation at boot-time. The other is optional, init callback, which * is used to do proper initialization after memory is allocated. * * The need callback is used to decide whether extended memory allocation is * needed or not. Sometimes users want to deactivate some features in this * boot and extra memory would be unnecessary. In this case, to avoid * allocating huge chunk of memory, each clients represent their need of * extra memory through the need callback. If one of the need callbacks * returns true, it means that someone needs extra memory so that * page extension core should allocates memory for page extension. If * none of need callbacks return true, memory isn't needed at all in this boot * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * * When need callback returns true, page_ext checks if there is a request for * extra memory through size in struct page_ext_operations. If it is non-zero, * extra space is allocated for each page_ext entry and offset is returned to * user through offset in struct page_ext_operations. * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated. In other words, lifetime * of memory for page extension isn't same with memmap for struct page. * Therefore, clients can't store extra data until page extension is * initialized, even if pages are allocated and used freely. This could * cause inadequate state of extra data per page, so, to prevent it, client * can utilize this callback to initialize the state of it correctly. */ #ifdef CONFIG_SPARSEMEM #define PAGE_EXT_INVALID (0x1) #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { return true; } static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, .need_shared_flags = true, }; #endif static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif #ifdef CONFIG_MEM_ALLOC_PROFILING &page_alloc_tagging_ops, #endif #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif }; unsigned long page_ext_size; static unsigned long total_usage; #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG /* * To ensure correct allocation tagging for pages, page_ext should be available * before the first page allocation. Otherwise early task stacks will be * allocated before page_ext initialization and missing tags will be flagged. */ bool early_page_ext __meminitdata = true; #else bool early_page_ext __meminitdata; #endif static int __init setup_early_page_ext(char *str) { early_page_ext = true; return 0; } early_param("early_page_ext", setup_early_page_ext); static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); bool need = false; for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { if (page_ext_ops[i]->need_shared_flags) { page_ext_size = sizeof(struct page_ext); break; } } } for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { page_ext_ops[i]->offset = page_ext_size; page_ext_size += page_ext_ops[i]->size; need = true; } } return need; } static void __init invoke_init_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); for (i = 0; i < entries; i++) { if (page_ext_ops[i]->init) page_ext_ops[i]->init(); } } static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } #ifndef CONFIG_SPARSEMEM void __init page_ext_init_flatmem_late(void) { invoke_init_callbacks(); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { pgdat->node_page_ext = NULL; } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (unlikely(!base)) return NULL; index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) { struct page_ext *base; unsigned long table_size; unsigned long nr_pages; nr_pages = NODE_DATA(nid)->node_spanned_pages; if (!nr_pages) return 0; /* * Need extra space if node range is not aligned with * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm * checks buddy's status, range could be out of exact node range. */ if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; table_size = page_ext_size * nr_pages; base = memblock_alloc_try_nid( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; total_usage += table_size; memmap_boot_pages_add(DIV_ROUND_UP(table_size, PAGE_SIZE)); return 0; } void __init page_ext_init_flatmem(void) { int nid, fail; if (!invoke_need_callbacks()) return; for_each_online_node(nid) { fail = alloc_node_page_ext(nid); if (fail) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); return; fail: pr_crit("allocation of page_ext failed.\n"); panic("Out of memory"); } #else /* CONFIG_SPARSEMEM */ static bool page_ext_invalid(struct page_ext *page_ext) { return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); struct page_ext *page_ext = READ_ONCE(section->page_ext); WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (page_ext_invalid(page_ext)) return NULL; return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) { gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) kmemleak_alloc(addr, size, 1, flags); else addr = vzalloc_node(size, nid); if (addr) memmap_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); return addr; } static int __meminit init_section_page_ext(unsigned long pfn, int nid) { struct mem_section *section; struct page_ext *base; unsigned long table_size; section = __pfn_to_section(pfn); if (section->page_ext) return 0; table_size = page_ext_size * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* * The value stored in section->page_ext is (base - pfn) * and it does not point to the memory block allocated above, * causing kmemleak false positives. */ kmemleak_not_leak(base); if (!base) { pr_err("page ext allocation failure\n"); return -ENOMEM; } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; section->page_ext = (void *)base - page_ext_size * pfn; total_usage += table_size; return 0; } static void free_page_ext(void *addr) { size_t table_size; struct page *page; table_size = page_ext_size * PAGES_PER_SECTION; memmap_pages_add(-1L * (DIV_ROUND_UP(table_size, PAGE_SIZE))); if (is_vmalloc_addr(addr)) { vfree(addr); } else { page = virt_to_page(addr); BUG_ON(PageReserved(page)); kmemleak_free(addr); free_pages_exact(addr, table_size); } } static void __free_page_ext(unsigned long pfn) { struct mem_section *ms; struct page_ext *base; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; base = READ_ONCE(ms->page_ext); /* * page_ext here can be valid while doing the roll back * operation in online_page_ext(). */ if (page_ext_invalid(base)) base = (void *)base - PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, NULL); base = get_entry(base, pfn); free_page_ext(base); } static void __invalidate_page_ext(unsigned long pfn) { struct mem_section *ms; void *val; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; val = (void *)ms->page_ext + PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, unsigned long nr_pages, int nid) { unsigned long start, end, pfn; int fail = 0; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); if (nid == NUMA_NO_NODE) { /* * In this case, "nid" already exists and contains valid memory. * "start_pfn" passed to us is a pfn which is an arg for * online__pages(), and start_pfn should exist. */ nid = pfn_to_nid(start_pfn); VM_BUG_ON(!node_online(nid)); } for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); if (!fail) return 0; /* rollback */ end = pfn - PAGES_PER_SECTION; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return -ENOMEM; } static void __meminit offline_page_ext(unsigned long start_pfn, unsigned long nr_pages) { unsigned long start, end, pfn; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); /* * Freeing of page_ext is done in 3 steps to avoid * use-after-free of it: * 1) Traverse all the sections and mark their page_ext * as invalid. * 2) Wait for all the existing users of page_ext who * started before invalidation to finish. * 3) Free the page_ext. */ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __invalidate_page_ext(pfn); synchronize_rcu(); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); } static int __meminit page_ext_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; int ret = 0; switch (action) { case MEM_GOING_ONLINE: ret = online_page_ext(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_CANCEL_ONLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_GOING_OFFLINE: break; case MEM_ONLINE: case MEM_CANCEL_OFFLINE: break; } return notifier_from_errno(ret); } void __init page_ext_init(void) { unsigned long pfn; int nid; if (!invoke_need_callbacks()) return; for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; start_pfn = node_start_pfn(nid); end_pfn = node_end_pfn(nid); /* * start_pfn and end_pfn may not be aligned to SECTION and the * page->flags of out of node pages are not initialized. So we * scan [start_pfn, the biggest section's pfn < end_pfn) here. */ for (pfn = start_pfn; pfn < end_pfn; pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { if (!pfn_valid(pfn)) continue; /* * Nodes's pfns can be overlapping. * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... */ if (pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; cond_resched(); } } hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; oom: panic("Out of memory"); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { } #endif /** * page_ext_get() - Get the extended information for a page. * @page: The page we're interested in. * * Ensures that the page_ext will remain valid until page_ext_put() * is called. * * Return: NULL if no page_ext exists for this page. * Context: Any context. Caller may not sleep until they have called * page_ext_put(). */ struct page_ext *page_ext_get(const struct page *page) { struct page_ext *page_ext; rcu_read_lock(); page_ext = lookup_page_ext(page); if (!page_ext) { rcu_read_unlock(); return NULL; } return page_ext; } /** * page_ext_put() - Working with page extended information is done. * @page_ext: Page extended information received from page_ext_get(). * * The page extended information of the page may not be valid after this * function is called. * * Return: None. * Context: Any context with corresponding page_ext_get() is called. */ void page_ext_put(struct page_ext *page_ext) { if (unlikely(!page_ext)) return; rcu_read_unlock(); } |
8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/lockd/svc.c * * This is the central lockd service. * * FIXME: Separate the lockd NFS server functionality from the lockd NFS * client functionality. Oh why didn't Sun create two separate * services in the first place? * * Authors: Olaf Kirch (okir@monad.swb.de) * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/moduleparam.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/uio.h> #include <linux/smp.h> #include <linux/mutex.h> #include <linux/freezer.h> #include <linux/inetdevice.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/svc_xprt.h> #include <net/ip.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <linux/lockd/lockd.h> #include <linux/nfs.h> #include "netns.h" #include "procfs.h" #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) static struct svc_program nlmsvc_program; const struct nlmsvc_binding *nlmsvc_ops; EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; static void nlmsvc_request_retry(struct timer_list *tl) { svc_wake_up(nlmsvc_serv); } DEFINE_TIMER(nlmsvc_retry, nlmsvc_request_retry); unsigned int lockd_net_id; /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 */ static unsigned long nlm_grace_period; unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; static int nlm_udpport, nlm_tcpport; /* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ static unsigned int nlm_max_connections = 1024; /* * Constants needed for the sysctl interface. */ static const unsigned long nlm_grace_period_min = 0; static const unsigned long nlm_grace_period_max = 240; static const unsigned long nlm_timeout_min = 3; static const unsigned long nlm_timeout_max = 20; #ifdef CONFIG_SYSCTL static const int nlm_port_min = 0, nlm_port_max = 65535; static struct ctl_table_header * nlm_sysctl_table; #endif static unsigned long get_lockd_grace_period(void) { /* Note: nlm_timeout should always be nonzero */ if (nlm_grace_period) return roundup(nlm_grace_period, nlm_timeout) * HZ; else return nlm_timeout * 5 * HZ; } static void grace_ender(struct work_struct *grace) { struct delayed_work *dwork = to_delayed_work(grace); struct lockd_net *ln = container_of(dwork, struct lockd_net, grace_period_end); locks_end_grace(&ln->lockd_manager); } static void set_grace_period(struct net *net) { unsigned long grace_period = get_lockd_grace_period(); struct lockd_net *ln = net_generic(net, lockd_net_id); locks_start_grace(net, &ln->lockd_manager); cancel_delayed_work_sync(&ln->grace_period_end); schedule_delayed_work(&ln->grace_period_end, grace_period); } /* * This is the lockd kernel thread */ static int lockd(void *vrqstp) { struct svc_rqst *rqstp = vrqstp; struct net *net = &init_net; struct lockd_net *ln = net_generic(net, lockd_net_id); svc_thread_init_status(rqstp, 0); /* try_to_freeze() is called from svc_recv() */ set_freezable(); dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); /* * The main request loop. We don't terminate until the last * NFS mount or NFS daemon has gone away. */ while (!svc_thread_should_stop(rqstp)) { /* update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nlm_max_connections; nlmsvc_retry_blocked(rqstp); svc_recv(rqstp); } if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); dprintk("lockd_down: service stopped\n"); svc_exit_thread(rqstp); return 0; } static int create_lockd_listener(struct svc_serv *serv, const char *name, struct net *net, const int family, const unsigned short port, const struct cred *cred) { struct svc_xprt *xprt; xprt = svc_find_xprt(serv, name, net, family, 0); if (xprt == NULL) return svc_xprt_create(serv, name, net, family, port, SVC_SOCK_DEFAULTS, cred); svc_xprt_put(xprt); return 0; } static int create_lockd_family(struct svc_serv *serv, struct net *net, const int family, const struct cred *cred) { int err; err = create_lockd_listener(serv, "udp", net, family, nlm_udpport, cred); if (err < 0) return err; return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport, cred); } /* * Ensure there are active UDP and TCP listeners for lockd. * * Even if we have only TCP NFS mounts and/or TCP NFSDs, some * local services (such as rpc.statd) still require UDP, and * some NFS servers do not yet support NLM over TCP. * * Returns zero if all listeners are available; otherwise a * negative errno value is returned. */ static int make_socks(struct svc_serv *serv, struct net *net, const struct cred *cred) { static int warned; int err; err = create_lockd_family(serv, net, PF_INET, cred); if (err < 0) goto out_err; err = create_lockd_family(serv, net, PF_INET6, cred); if (err < 0 && err != -EAFNOSUPPORT) goto out_err; warned = 0; return 0; out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); svc_xprt_destroy_all(serv, net); svc_rpcb_cleanup(serv, net); return err; } static int lockd_up_net(struct svc_serv *serv, struct net *net, const struct cred *cred) { struct lockd_net *ln = net_generic(net, lockd_net_id); int error; if (ln->nlmsvc_users++) return 0; error = svc_bind(serv, net); if (error) goto err_bind; error = make_socks(serv, net, cred); if (error < 0) goto err_bind; set_grace_period(net); dprintk("%s: per-net data created; net=%x\n", __func__, net->ns.inum); return 0; err_bind: ln->nlmsvc_users--; return error; } static void lockd_down_net(struct svc_serv *serv, struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); svc_xprt_destroy_all(serv, net); svc_rpcb_cleanup(serv, net); } } else { pr_err("%s: no users! net=%x\n", __func__, net->ns.inum); BUG(); } } static int lockd_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { dprintk("lockd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); sin.sin_family = AF_INET; sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin); } out: return NOTIFY_DONE; } static struct notifier_block lockd_inetaddr_notifier = { .notifier_call = lockd_inetaddr_event, }; #if IS_ENABLED(CONFIG_IPV6) static int lockd_inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; if (event != NETDEV_DOWN) goto out; if (nlmsvc_serv) { dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); sin6.sin6_family = AF_INET6; sin6.sin6_addr = ifa->addr; if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL) sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6); } out: return NOTIFY_DONE; } static struct notifier_block lockd_inet6addr_notifier = { .notifier_call = lockd_inet6addr_event, }; #endif static int lockd_get(void) { struct svc_serv *serv; int error; if (nlmsvc_serv) { nlmsvc_users++; return 0; } /* * Sanity check: if there's no pid, * we should be the first user ... */ if (nlmsvc_users) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return -ENOMEM; } serv->sv_maxconn = nlm_max_connections; error = svc_set_num_threads(serv, NULL, 1); if (error < 0) { svc_destroy(&serv); return error; } nlmsvc_serv = serv; register_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) register_inet6addr_notifier(&lockd_inet6addr_notifier); #endif dprintk("lockd_up: service created\n"); nlmsvc_users++; return 0; } static void lockd_put(void) { if (WARN(nlmsvc_users <= 0, "lockd_down: no users!\n")) return; if (--nlmsvc_users) return; unregister_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif svc_set_num_threads(nlmsvc_serv, NULL, 0); timer_delete_sync(&nlmsvc_retry); svc_destroy(&nlmsvc_serv); dprintk("lockd_down: service destroyed\n"); } /* * Bring up the lockd process if it's not already up. */ int lockd_up(struct net *net, const struct cred *cred) { int error; mutex_lock(&nlmsvc_mutex); error = lockd_get(); if (error) goto err; error = lockd_up_net(nlmsvc_serv, net, cred); if (error < 0) { lockd_put(); goto err; } err: mutex_unlock(&nlmsvc_mutex); return error; } EXPORT_SYMBOL_GPL(lockd_up); /* * Decrement the user count and bring down lockd if we're the last. */ void lockd_down(struct net *net) { mutex_lock(&nlmsvc_mutex); lockd_down_net(nlmsvc_serv, net); lockd_put(); mutex_unlock(&nlmsvc_mutex); } EXPORT_SYMBOL_GPL(lockd_down); #ifdef CONFIG_SYSCTL /* * Sysctl parameters (same as module parameters, different interface). */ static struct ctl_table nlm_sysctls[] = { { .procname = "nlm_grace_period", .data = &nlm_grace_period, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_grace_period_min, .extra2 = (unsigned long *) &nlm_grace_period_max, }, { .procname = "nlm_timeout", .data = &nlm_timeout, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_timeout_min, .extra2 = (unsigned long *) &nlm_timeout_max, }, { .procname = "nlm_udpport", .data = &nlm_udpport, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (int *) &nlm_port_min, .extra2 = (int *) &nlm_port_max, }, { .procname = "nlm_tcpport", .data = &nlm_tcpport, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (int *) &nlm_port_min, .extra2 = (int *) &nlm_port_max, }, { .procname = "nsm_use_hostnames", .data = &nsm_use_hostnames, .maxlen = sizeof(bool), .mode = 0644, .proc_handler = proc_dobool, }, { .procname = "nsm_local_state", .data = &nsm_local_state, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; #endif /* CONFIG_SYSCTL */ /* * Module (and sysfs) parameters. */ #define param_set_min_max(name, type, which_strtol, min, max) \ static int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ char *endp; \ __typeof__(type) num = which_strtol(val, &endp, 0); \ if (endp == val || *endp || num < (min) || num > (max)) \ return -EINVAL; \ *((type *) kp->arg) = num; \ return 0; \ } static inline int is_callback(u32 proc) { return proc == NLMPROC_GRANTED || proc == NLMPROC_GRANTED_MSG || proc == NLMPROC_TEST_RES || proc == NLMPROC_LOCK_RES || proc == NLMPROC_CANCEL_RES || proc == NLMPROC_UNLOCK_RES || proc == NLMPROC_NSM_NOTIFY; } static enum svc_auth_status lockd_authenticate(struct svc_rqst *rqstp) { rqstp->rq_client = NULL; switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: case RPC_AUTH_UNIX: rqstp->rq_auth_stat = rpc_auth_ok; if (rqstp->rq_proc == 0) return SVC_OK; if (is_callback(rqstp->rq_proc)) { /* Leave it to individual procedures to * call nlmsvc_lookup_host(rqstp) */ return SVC_OK; } return svc_set_client(rqstp); } rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } param_set_min_max(port, int, simple_strtol, 0, 65535) param_set_min_max(grace_period, unsigned long, simple_strtoul, nlm_grace_period_min, nlm_grace_period_max) param_set_min_max(timeout, unsigned long, simple_strtoul, nlm_timeout_min, nlm_timeout_max) MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_DESCRIPTION("NFS file locking service version " LOCKD_VERSION "."); MODULE_LICENSE("GPL"); module_param_call(nlm_grace_period, param_set_grace_period, param_get_ulong, &nlm_grace_period, 0644); module_param_call(nlm_timeout, param_set_timeout, param_get_ulong, &nlm_timeout, 0644); module_param_call(nlm_udpport, param_set_port, param_get_int, &nlm_udpport, 0644); module_param_call(nlm_tcpport, param_set_port, param_get_int, &nlm_tcpport, 0644); module_param(nsm_use_hostnames, bool, 0644); module_param(nlm_max_connections, uint, 0644); static int lockd_init_net(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); INIT_LIST_HEAD(&ln->lockd_manager.list); ln->lockd_manager.block_opens = false; INIT_LIST_HEAD(&ln->nsm_handles); return 0; } static void lockd_exit_net(struct net *net) { struct lockd_net *ln = net_generic(net, lockd_net_id); WARN_ONCE(!list_empty(&ln->lockd_manager.list), "net %x %s: lockd_manager.list is not empty\n", net->ns.inum, __func__); WARN_ONCE(!list_empty(&ln->nsm_handles), "net %x %s: nsm_handles list is not empty\n", net->ns.inum, __func__); WARN_ONCE(delayed_work_pending(&ln->grace_period_end), "net %x %s: grace_period_end was not cancelled\n", net->ns.inum, __func__); } static struct pernet_operations lockd_net_ops = { .init = lockd_init_net, .exit = lockd_exit_net, .id = &lockd_net_id, .size = sizeof(struct lockd_net), }; /* * Initialising and terminating the module. */ static int __init init_nlm(void) { int err; #ifdef CONFIG_SYSCTL err = -ENOMEM; nlm_sysctl_table = register_sysctl("fs/nfs", nlm_sysctls); if (nlm_sysctl_table == NULL) goto err_sysctl; #endif err = register_pernet_subsys(&lockd_net_ops); if (err) goto err_pernet; err = lockd_create_procfs(); if (err) goto err_procfs; return 0; err_procfs: unregister_pernet_subsys(&lockd_net_ops); err_pernet: #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); err_sysctl: #endif return err; } static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); lockd_remove_procfs(); unregister_pernet_subsys(&lockd_net_ops); #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); #endif } module_init(init_nlm); module_exit(exit_nlm); /** * nlmsvc_dispatch - Process an NLM Request * @rqstp: incoming request * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ static int nlmsvc_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; *statp = procp->pc_func(rqstp); if (*statp == rpc_drop_reply) return 0; if (*statp != rpc_success) return 1; if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; return 1; out_decode_err: *statp = rpc_garbage_args; return 1; out_encode_err: *statp = rpc_system_err; return 1; } /* * Define NLM program and procedures */ static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version1_count[17]); static const struct svc_version nlmsvc_version1 = { .vs_vers = 1, .vs_nproc = 17, .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version1_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version3_count[ARRAY_SIZE(nlmsvc_procedures)]); static const struct svc_version nlmsvc_version3 = { .vs_vers = 3, .vs_nproc = ARRAY_SIZE(nlmsvc_procedures), .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version3_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #ifdef CONFIG_LOCKD_V4 static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version4_count[ARRAY_SIZE(nlmsvc_procedures4)]); static const struct svc_version nlmsvc_version4 = { .vs_vers = 4, .vs_nproc = ARRAY_SIZE(nlmsvc_procedures4), .vs_proc = nlmsvc_procedures4, .vs_count = nlmsvc_version4_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #endif static const struct svc_version *nlmsvc_version[] = { [1] = &nlmsvc_version1, [3] = &nlmsvc_version3, #ifdef CONFIG_LOCKD_V4 [4] = &nlmsvc_version4, #endif }; #define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) static struct svc_program nlmsvc_program = { .pg_prog = NLM_PROGRAM, /* program number */ .pg_nvers = NLM_NRVERS, /* number of entries in nlmsvc_version */ .pg_vers = nlmsvc_version, /* version table */ .pg_name = "lockd", /* service name */ .pg_class = "nfsd", /* share authentication with nfsd */ .pg_authenticate = &lockd_authenticate, /* export authentication */ .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, }; |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IF_MACVLAN_H #define _LINUX_IF_MACVLAN_H #include <linux/if_link.h> #include <linux/if_vlan.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <net/netlink.h> #include <linux/u64_stats_sync.h> struct macvlan_port; #define MACVLAN_MC_FILTER_BITS 8 #define MACVLAN_MC_FILTER_SZ (1 << MACVLAN_MC_FILTER_BITS) struct macvlan_dev { struct net_device *dev; struct list_head list; struct hlist_node hlist; struct macvlan_port *port; struct net_device *lowerdev; netdevice_tracker dev_tracker; void *accel_priv; struct vlan_pcpu_stats __percpu *pcpu_stats; DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); netdev_features_t set_features; enum macvlan_mode mode; u16 flags; unsigned int macaddr_count; u32 bc_queue_len_req; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, unsigned int len, bool success, bool multicast) { if (likely(success)) { struct vlan_pcpu_stats *pcpu_stats; pcpu_stats = get_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->rx_packets); u64_stats_add(&pcpu_stats->rx_bytes, len); if (multicast) u64_stats_inc(&pcpu_stats->rx_multicast); u64_stats_update_end(&pcpu_stats->syncp); put_cpu_ptr(vlan->pcpu_stats); } else { this_cpu_inc(vlan->pcpu_stats->rx_errors); } } extern void macvlan_common_setup(struct net_device *dev); extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); extern void macvlan_dellink(struct net_device *dev, struct list_head *head); extern int macvlan_link_register(struct rtnl_link_ops *ops); #if IS_ENABLED(CONFIG_MACVLAN) static inline struct net_device * macvlan_dev_real_dev(const struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); return macvlan->lowerdev; } #else static inline struct net_device * macvlan_dev_real_dev(const struct net_device *dev) { BUG(); return NULL; } #endif static inline void *macvlan_accel_priv(struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); return macvlan->accel_priv; } static inline bool macvlan_supports_dest_filter(struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); return macvlan->mode == MACVLAN_MODE_PRIVATE || macvlan->mode == MACVLAN_MODE_VEPA || macvlan->mode == MACVLAN_MODE_BRIDGE; } static inline int macvlan_release_l2fw_offload(struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); macvlan->accel_priv = NULL; return dev_uc_add(macvlan->lowerdev, dev->dev_addr); } #endif /* _LINUX_IF_MACVLAN_H */ |
48 40 11 11 11 11 11 11 2 9 9 8 11 11 11 11 9 2 11 11 11 39 38 39 39 39 38 38 38 39 34 5 39 4 4 5 5 5 5 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 2 3 3 420 422 3 2 2 3 131 128 106 35 128 54 6 5 2 6 51 3 3 3 3 3 12 32 19 20 20 18 20 21 20 20 9 9 9 21 12 12 12 12 2 11 18 18 17 18 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 313 310 2 384 383 2 3 312 5 5 10 305 385 385 86 314 313 313 313 247 314 314 386 16 2 15 15 15 15 1 15 15 1 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/filemap.c * * Copyright (C) 1994-1999 Linus Torvalds */ /* * This file handles the generic file mmap semantics used by * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ #include <linux/export.h> #include <linux/compiler.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/capability.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/error-injection.h> #include <linux/hash.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include <linux/delayacct.h> #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> #include <linux/migrate.h> #include <linux/pipe_fs_i.h> #include <linux/splice.h> #include <linux/rcupdate_wait.h> #include <linux/sched/mm.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/filemap.h> /* * FIXME: remove all knowledge of the buffer layer from the core VM */ #include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> #include "swap.h" /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. * * Shared mappings now work. 15.8.1995 Bruno. * * finished 'unifying' the page and buffer cache and SMP-threaded the * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> * * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> */ /* * Lock ordering: * * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->block_dirty_folio) * ->swap_lock (exclusive_swap_page, others) * ->i_pages lock * * ->i_rwsem * ->invalidate_lock (acquired by fs in truncate path) * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_lock * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_lock * ->invalidate_lock (filemap_fault) * ->lock_page (filemap_fault, access_process_vm) * * ->i_rwsem (generic_perform_write) * ->mmap_lock (fault_in_readable->do_page_fault) * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) * * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) */ static void mapping_set_update(struct xa_state *xas, struct address_space *mapping) { if (dax_mapping(mapping) || shmem_mapping(mapping)) return; xas_set_update(xas, workingset_update_node); xas_set_lru(xas, &shadow_nodes); } static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { XA_STATE(xas, &mapping->i_pages, folio->index); long nr = 1; mapping_set_update(&xas, mapping); xas_set_order(&xas, folio->index, folio_order(folio)); nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); folio->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } static void filemap_unaccount_folio(struct address_space *mapping, struct folio *folio) { long nr; VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", current->comm, folio_pfn(folio)); dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); if (mapping_exiting(mapping) && !folio_test_large(folio)) { int mapcount = folio_mapcount(folio); if (folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's * a good bet that actually the page is unmapped * and we'd rather not leak it: if we're wrong, * another bad page check should catch it later. */ atomic_set(&folio->_mapcount, -1); folio_ref_sub(folio, mapcount); } } } /* hugetlb folios do not participate in page cache accounting. */ if (folio_test_hugetlb(folio)) return; nr = folio_nr_pages(folio); __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of * unwritten data - on ordinary filesystems. * * But it's harmless on in-memory filesystems like tmpfs; and can * occur when a driver which did get_user_pages() sets page dirty * before putting it, while the inode is being finally evicted. * * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. */ if (WARN_ON_ONCE(folio_test_dirty(folio) && mapping_can_writeback(mapping))) folio_account_cleaned(folio, inode_to_wb(mapping->host)); } /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. */ void __filemap_remove_folio(struct folio *folio, void *shadow) { struct address_space *mapping = folio->mapping; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); page_cache_delete(mapping, folio, shadow); } void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); int refs = 1; free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); if (folio_test_large(folio)) refs = folio_nr_pages(folio); folio_put_refs(folio, refs); } /** * filemap_remove_folio - Remove folio from page cache. * @folio: The folio. * * This must be called only on folios that are locked and have been * verified to be in the page cache. It will never put the folio into * the free list because the caller has a reference on the page. */ void filemap_remove_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); } /* * page_cache_delete_batch - delete several folios from page cache * @mapping: the mapping to which folios belong * @fbatch: batch of folios to delete * * The function walks over mapping->i_pages and removes folios passed in * @fbatch from the mapping. The function expects @fbatch to be sorted * by page index and is optimised for it to be dense. * It tolerates holes in @fbatch (mapping entries at those indices are not * modified). * * The function expects the i_pages lock to be held. */ static void page_cache_delete_batch(struct address_space *mapping, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); long total_pages = 0; int i = 0; struct folio *folio; mapping_set_update(&xas, mapping); xas_for_each(&xas, folio, ULONG_MAX) { if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our * pages locked so they are protected from being removed. * If we see a page whose index is higher than ours, it * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ if (folio != fbatch->folios[i]) { VM_BUG_ON_FOLIO(folio->index > fbatch->folios[i]->index, folio); continue; } WARN_ON_ONCE(!folio_test_locked(folio)); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies on it */ i++; xas_store(&xas, NULL); total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch) { int i; if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); } page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) { int ret = 0; /* Check for outstanding write errors */ if (test_bit(AS_ENOSPC, &mapping->flags) && test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; if (test_bit(AS_EIO, &mapping->flags) && test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } EXPORT_SYMBOL(filemap_check_errors); static int filemap_check_and_keep_errors(struct address_space *mapping) { /* Check for outstanding write errors */ if (test_bit(AS_EIO, &mapping->flags)) return -EIO; if (test_bit(AS_ENOSPC, &mapping->flags)) return -ENOSPC; return 0; } /** * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @wbc: the writeback_control controlling the writeout * * Call writepages on the mapping using the provided wbc to control the * writeout. * * Return: %0 on success, negative error code otherwise. */ int filemap_fdatawrite_wbc(struct address_space *mapping, struct writeback_control *wbc) { int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(wbc, mapping->host); ret = do_writepages(mapping, wbc); wbc_detach_inode(wbc); return ret; } EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. * * Return: %0 on success, negative error code otherwise. */ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; return filemap_fdatawrite_wbc(mapping, &wbc); } static inline int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) { return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); } int filemap_fdatawrite(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. * * Return: %0 on success, negative error code otherwise. */ int filemap_flush(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_NONE); } EXPORT_SYMBOL(filemap_flush); /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. * * Return: %true if at least one page exists in the specified range, * %false otherwise. */ bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { struct folio *folio; XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; rcu_read_lock(); for (;;) { folio = xas_find(&xas, max); if (xas_retry(&xas, folio)) continue; /* Shadow entries don't count */ if (xa_is_value(folio)) continue; /* * We don't need to try to pin this page; we're about to * release the RCU lock anyway. It is enough to know that * there was a page here recently. */ break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL(filemap_range_has_page); static void __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; struct folio_batch fbatch; unsigned nr_folios; folio_batch_init(&fbatch); while (index <= end) { unsigned i; nr_folios = filemap_get_folios_tag(mapping, &index, end, PAGECACHE_TAG_WRITEBACK, &fbatch); if (!nr_folios) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); } folio_batch_release(&fbatch); cond_resched(); } } /** * filemap_fdatawait_range - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space * in the given range and wait for all of them. Check error status of * the address space and return it. * * Since the error status of the address space is cleared by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space. */ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait_range_keep_errors - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space in the * given range and wait for all of them. Unlike filemap_fdatawait_range(), * this function does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) */ int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors); /** * file_fdatawait_range - wait for writeback to complete * @file: file pointing to address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the address space that file * refers to, in the given range and wait for all of them. Check error * status of the address space vs. the file->f_wb_err cursor and return it. * * Since the error status of the file is advanced by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space vs. the file->f_wb_err cursor. */ int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) { struct address_space *mapping = file->f_mapping; __filemap_fdatawait_range(mapping, start_byte, end_byte); return file_check_and_advance_wb_err(file); } EXPORT_SYMBOL(file_fdatawait_range); /** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space * and wait for all of them. Unlike filemap_fdatawait(), this function * does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) * * Return: error status of the address space. */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { return mapping->nrpages; } bool filemap_range_has_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) continue; if (folio_test_dirty(folio) || folio_test_locked(folio) || folio_test_writeback(folio)) break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * Return: error status of the address space. */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { int err = 0, err2; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = filemap_check_errors(mapping); if (!err) err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } EXPORT_SYMBOL(__filemap_set_wb_err); /** * file_check_and_advance_wb_err - report wb error (if any) that was previously * and advance wb_err to current one * @file: struct file on which the error is being reported * * When userland calls fsync (or something like nfsd does the equivalent), we * want to report any writeback errors that occurred since the last fsync (or * since the file was opened if there haven't been any). * * Grab the wb_err from the mapping. If it matches what we have in the file, * then just quickly return 0. The file is all caught up. * * If it doesn't match, then take the mapping value, set the "seen" flag in * it and try to swap it into place. If it works, or another task beat us * to it with the new value, then update the f_wb_err and return the error * portion. The error at this point must be reported via proper channels * (a'la fsync, or NFS COMMIT operation, etc.). * * While we handle mapping->wb_err with atomic operations, the f_wb_err * value is protected by the f_lock since we must ensure that it reflects * the latest value swapped in for this file descriptor. * * Return: %0 on success, negative error code otherwise. */ int file_check_and_advance_wb_err(struct file *file) { int err = 0; errseq_t old = READ_ONCE(file->f_wb_err); struct address_space *mapping = file->f_mapping; /* Locklessly handle the common case where nothing has changed */ if (errseq_check(&mapping->wb_err, old)) { /* Something changed, must use slow path */ spin_lock(&file->f_lock); old = file->f_wb_err; err = errseq_check_and_advance(&mapping->wb_err, &file->f_wb_err); trace_file_check_and_advance_wb_err(file, old); spin_unlock(&file->f_lock); } /* * We're mostly using this function as a drop in replacement for * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect * that the legacy code would have had on these flags. */ clear_bit(AS_EIO, &mapping->flags); clear_bit(AS_ENOSPC, &mapping->flags); return err; } EXPORT_SYMBOL(file_check_and_advance_wb_err); /** * file_write_and_wait_range - write out & wait on a file range * @file: file pointing to address_space with pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * After writing out and waiting on the data, we check and advance the * f_wb_err cursor to the latest value, and return any errors detected there. * * Return: %0 on success, negative error code otherwise. */ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) { int err = 0, err2; struct address_space *mapping = file->f_mapping; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = file_check_and_advance_wb_err(file); if (!err) err = err2; return err; } EXPORT_SYMBOL(file_write_and_wait_range); /** * replace_page_cache_folio - replace a pagecache folio with a new one * @old: folio to be replaced * @new: folio to replace with * * This function replaces a folio in the pagecache with a new one. On * success it acquires the pagecache reference for the new folio and * drops it for the old folio. Both the old and new folios must be * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. */ void replace_page_cache_folio(struct folio *old, struct folio *new) { struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); VM_BUG_ON_FOLIO(new->mapping, new); folio_get(new); new->mapping = mapping; new->index = offset; mem_cgroup_replace_folio(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!folio_test_hugetlb(old)) __lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) __lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) __lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) __lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); folio_put(old); } EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE(xas, &mapping->i_pages, index); void *alloced_shadow = NULL; int alloced_order = 0; bool huge; long nr; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), folio); mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); xas_set_order(&xas, index, folio_order(folio)); huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = xas.xa_index; for (;;) { int order = -1, split_order = 0; void *entry, *old = NULL; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; if (!xa_is_value(entry)) { xas_set_err(&xas, -EEXIST); goto unlock; } /* * If a larger entry exists, * it will be the first and only entry iterated. */ if (order == -1) order = xas_get_order(&xas); } /* entry may have changed before we re-acquire the lock */ if (alloced_order && (old != alloced_shadow || order != alloced_order)) { xas_destroy(&xas); alloced_order = 0; } if (old) { if (order > 0 && order > folio_order(folio)) { /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); if (!alloced_order) { split_order = order; goto unlock; } xas_split(&xas, old, order); xas_reset(&xas); } if (shadowp) *shadowp = old; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; mapping->nrpages += nr; /* hugetlb pages do not participate in page cache accounting */ if (!huge) { __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } unlock: xas_unlock_irq(&xas); /* split needed, alloc here and retry. */ if (split_order) { xas_split_alloc(&xas, old, split_order, gfp); if (xas_error(&xas)) goto error; alloced_shadow = old; alloced_order = split_order; xas_reset(&xas); continue; } if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) goto error; trace_mm_filemap_add_to_page_cache(folio); return 0; error: folio->mapping = NULL; /* Leave page->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp) { void *shadow = NULL; int ret; ret = mem_cgroup_charge(folio, NULL, gfp); if (ret) return ret; __folio_set_locked(folio); ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); if (unlikely(ret)) { mem_cgroup_uncharge(folio); __folio_clear_locked(folio); } else { /* * The folio might have been evicted from cache only * recently, in which case it should be activated like * any other repeatedly accessed folio. * The exception is folios getting rewritten; evicting other * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ WARN_ON_ONCE(folio_test_active(folio)); if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); } return ret; } EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { int n; struct folio *folio; if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); folio = __folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); return folio; } return folio_alloc_noprof(gfp, order); } EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* * filemap_invalidate_lock_two - lock invalidate_lock for two mappings * * Lock exclusively invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to lock * @mapping2: the second mapping to lock */ void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1 > mapping2) swap(mapping1, mapping2); if (mapping1) down_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) down_write_nested(&mapping2->invalidate_lock, 1); } EXPORT_SYMBOL(filemap_invalidate_lock_two); /* * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings * * Unlock exclusive invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to unlock * @mapping2: the second mapping to unlock */ void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1) up_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) up_write(&mapping2->invalidate_lock); } EXPORT_SYMBOL(filemap_invalidate_unlock_two); /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of * waitqueues where the bucket discipline is to maintain all * waiters on the same queue and wake all when any of the pages * become available, and for the woken contexts to check to be * sure the appropriate page became available, this saves space * at a cost of "thundering herd" phenomena during rare hash * collisions. */ #define PAGE_WAIT_TABLE_BITS 8 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; static wait_queue_head_t *folio_waitqueue(struct folio *folio) { return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } void __init pagecache_init(void) { int i; for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); } /* * The page wait code treats the "wait->flags" somewhat unusually, because * we have multiple different kinds of waits, not just the usual "exclusive" * one. * * We have: * * (a) no special bits set: * * We're just waiting for the bit to be released, and when a waker * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, * and remove it from the wait queue. * * Simple and straightforward. * * (b) WQ_FLAG_EXCLUSIVE: * * The waiter is waiting to get the lock, and only one waiter should * be woken up to avoid any thundering herd behavior. We'll set the * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. * * This is the traditional exclusive wait. * * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: * * The waiter is waiting to get the bit, and additionally wants the * lock to be transferred to it for fair lock behavior. If the lock * cannot be taken, we stop walking the wait queue without waking * the waiter. * * This is the "fair lock handoff" case, and in addition to setting * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see * that it now has the lock. */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); if (!wake_page_match(wait_page, key)) return 0; /* * If it's a lock handoff wait, we get the bit for it, and * stop walking (and do not wake it up) if we can't. */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { if (test_bit(key->bit_nr, &key->folio->flags)) return -1; if (flags & WQ_FLAG_CUSTOM) { if (test_and_set_bit(key->bit_nr, &key->folio->flags)) return -1; flags |= WQ_FLAG_DONE; } } /* * We are holding the wait-queue lock, but the waiter that * is waiting for this will be checking the flags without * any locking. * * So update the flags atomically, and wake up the waiter * afterwards to avoid any races. This store-release pairs * with the load-acquire in folio_wait_bit_common(). */ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. * * Note that this pairs with the "finish_wait()" in the * waiter, and has to be the absolute last thing we do. * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void folio_wake_bit(struct folio *folio, int bit_nr) { wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; spin_lock_irqsave(&q->lock, flags); __wake_up_locked_key(q, TASK_NORMAL, &key); /* * It's possible to miss clearing waiters here, when we woke our page * waiters, but the hashed waitqueue has waiters for other pages on it. * That's okay, it's a rare case. The next waker will clear it. * * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE, * other), the flag may be cleared in the course of freeing the page; * but that is not required for correctness. */ if (!waitqueue_active(q) || !key.page_match) folio_clear_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } /* * A choice of three behaviors for folio_wait_bit_common(): */ enum behavior { EXCLUSIVE, /* Hold ref to page and take the bit when woken, like * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, * like folio_put_wait_locked() on PG_locked. */ }; /* * Attempt to check (or get) the folio flag, and mark us done * if successful. */ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { if (test_and_set_bit(bit_nr, &folio->flags)) return false; } else if (test_bit(bit_nr, &folio->flags)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } /* How many times do we accept lock stealing from under a waiter? */ int sysctl_page_lock_unfairness = 5; static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { wait_queue_head_t *q = folio_waitqueue(folio); int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; if (bit_nr == PG_locked && !folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = bit_nr; repeat: wait->flags = 0; if (behavior == EXCLUSIVE) { wait->flags = WQ_FLAG_EXCLUSIVE; if (--unfairness < 0) wait->flags |= WQ_FLAG_CUSTOM; } /* * Do one last check whether we can get the * page bit synchronously. * * Do the folio_set_waiters() marking before that * to let any waker we _just_ missed know they * need to wake us up (otherwise they'll never * even go to the slow case that looks at the * page queue), and add ourselves to the wait * queue if we need to sleep. * * This part needs to be done under the queue * lock to avoid races. */ spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, bit_nr, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * From now on, all the logic will be based on * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to * see whether the page bit testing has already * been done by the wake function. * * We can drop our reference to the folio. */ if (behavior == DROP) folio_put(folio); /* * Note that until the "finish_wait()", or until * we see the WQ_FLAG_WOKEN flag, we need to * be very careful with the 'wait->flags', because * we may race with a waker that sets them. */ for (;;) { unsigned int flags; set_current_state(state); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(state, current)) break; io_schedule(); continue; } /* If we were non-exclusive, we're done */ if (behavior != EXCLUSIVE) break; /* If the waker got the lock for us, we're done */ if (flags & WQ_FLAG_DONE) break; /* * Otherwise, if we're getting the lock, we need to * try to get it ourselves. * * And if that fails, we'll have to retry this all. */ if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0)))) goto repeat; wait->flags |= WQ_FLAG_DONE; break; } /* * If a signal happened, this 'finish_wait()' may remove the last * waiter from the wait-queues, but the folio waiters bit will remain * set. That's ok. The next wakeup will take care of it, and trying * to do it here would be difficult and prone to races. */ finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } /* * NOTE! The wait->flags weren't stable until we've done the * 'finish_wait()', and we could have exited the loop above due * to a signal, and had a wakeup event happen after the signal * test but before the 'finish_wait()'. * * So only after the finish_wait() can we reliably determine * if we got woken up or not, so we can now figure out the final * return value based on that state without races. * * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive * waiter, but an exclusive one requires WQ_FLAG_DONE. */ if (behavior == EXCLUSIVE) return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } #ifdef CONFIG_MIGRATION /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. * * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; struct folio *folio = pfn_swap_entry_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = PG_locked; wait->flags = 0; spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, PG_locked, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * If a migration entry exists for the page the migration path must hold * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. */ spin_unlock(ptl); for (;;) { unsigned int flags; set_current_state(TASK_UNINTERRUPTIBLE); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) break; io_schedule(); continue; } break; } finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } } #endif void folio_wait_bit(struct folio *folio, int bit_nr) { folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit); int folio_wait_bit_killable(struct folio *folio, int bit_nr) { return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit_killable); /** * folio_put_wait_locked - Drop a reference and wait for it to be unlocked * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @folio. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not * dereference @folio. * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue * @folio: Folio defining the wait queue of interest * @waiter: Waiter to add to the queue * * Add an arbitrary @waiter to the wait queue for the nominated @folio. */ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) { wait_queue_head_t *q = folio_waitqueue(folio); unsigned long flags; spin_lock_irqsave(&q->lock, flags); __add_wait_queue_entry_tail(q, waiter); folio_set_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(folio_add_wait_queue); /** * folio_unlock - Unlock a locked folio. * @folio: The folio. * * Unlocks the folio and wakes up any thread sleeping on the page lock. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_unlock(struct folio *folio) { /* Bit 7 allows x86 to check the byte's sign bit */ BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_xor_flags_has_waiters(folio, 1 << PG_locked)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); /** * folio_end_read - End read on a folio. * @folio: The folio. * @success: True if all reads completed successfully. * * When all reads against a folio have completed, filesystems should * call this function to let the pagecache know that no more reads * are outstanding. This will unlock the folio and wake up any thread * sleeping on the lock. The folio will also be marked uptodate if all * reads succeeded. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_end_read(struct folio *folio, bool success) { unsigned long mask = 1 << PG_locked; /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; if (folio_xor_flags_has_waiters(folio, mask)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_end_read); /** * folio_end_private_2 - Clear PG_private_2 and wake any waiters. * @folio: The folio. * * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for * it. The folio reference held for PG_private_2 being set is released. * * This is, for example, used when a netfs folio is being written to a local * disk cache, thereby allowing writes to the cache for the same folio to be * serialised. */ void folio_end_private_2(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); folio_wake_bit(folio, PG_private_2); folio_put(folio); } EXPORT_SYMBOL(folio_end_private_2); /** * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { while (folio_test_private_2(folio)) folio_wait_bit(folio, PG_private_2); } EXPORT_SYMBOL(folio_wait_private_2); /** * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is * received by the calling task. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ int folio_wait_private_2_killable(struct folio *folio) { int ret = 0; while (folio_test_private_2(folio)) { ret = folio_wait_bit_killable(folio, PG_private_2); if (ret < 0) break; } return ret; } EXPORT_SYMBOL(folio_wait_private_2_killable); /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. * * Context: May be called from process or interrupt context. */ void folio_end_writeback(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* * folio_test_clear_reclaim() could be used here but it is an * atomic operation and overkill in this particular case. Failing * to shuffle a folio marked for immediate reclaim is too mild * a gain to justify taking an atomic operation penalty at the * end of every folio writeback. */ if (folio_test_reclaim(folio)) { folio_clear_reclaim(folio); folio_rotate_reclaimable(folio); } /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. * But here we must make sure that the folio is not freed and * reused before the folio_wake_bit(). */ folio_get(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock */ void __folio_lock(struct folio *folio) { folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, EXCLUSIVE); } EXPORT_SYMBOL(__folio_lock); int __folio_lock_killable(struct folio *folio) { return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, EXCLUSIVE); } EXPORT_SYMBOL_GPL(__folio_lock_killable); static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { struct wait_queue_head *q = folio_waitqueue(folio); int ret; wait->folio = folio; wait->bit_nr = PG_locked; spin_lock_irq(&q->lock); __add_wait_queue_entry_tail(q, &wait->wait); folio_set_waiters(folio); ret = !folio_trylock(folio); /* * If we were successful now, we know we're still on the * waitqueue as we're still under the lock. This means it's * safe to remove and return success, we know the callback * isn't going to trigger. */ if (!ret) __remove_wait_queue(q, &wait->wait); else ret = -EIOCBQUEUED; spin_unlock_irq(&q->lock); return ret; } /* * Return values: * 0 - folio is locked. * non-zero - folio is not locked. * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held. * * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed. */ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { unsigned int flags = vmf->flags; if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! In this case, mmap_lock/per-VMA lock is not * released even though returning VM_FAULT_RETRY. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) return VM_FAULT_RETRY; release_fault_lock(vmf); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else folio_wait_locked(folio); return VM_FAULT_RETRY; } if (flags & FAULT_FLAG_KILLABLE) { bool ret; ret = __folio_lock_killable(folio); if (ret) { release_fault_lock(vmf); return VM_FAULT_RETRY; } } else { __folio_lock(folio); } return 0; } /** * page_cache_next_miss() - Find the next gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the * gap with the lowest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 5, then subsequently a gap is * created at index 10, page_cache_next_miss covering both indices may * return 10 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) return xas.xa_index; if (xas.xa_index == 0) return 0; } return index + max_scan; } EXPORT_SYMBOL(page_cache_next_miss); /** * page_cache_prev_miss() - Find the previous gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [max(index - max_scan + 1, 0), index] for the * gap with the highest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 10, then subsequently a gap is * created at index 5, page_cache_prev_miss() covering both indices may * return 5 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) break; if (xas.xa_index == ULONG_MAX) break; } return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); /* * Lockless page cache protocol: * On the lookup side: * 1. Load the folio from i_pages * 2. Increment the refcount if it's not zero * 3. If the folio is not found by xas_reload(), put the refcount and retry * * On the removal side: * A. Freeze the page (by zeroing the refcount if nobody else has a reference) * B. Remove the page from i_pages * C. Return the page to the page allocator * * This means that any page may have its reference count temporarily * increased by a speculative page cache (or GUP-fast) lookup as it can * be allocated by another user before the RCU grace period expires. * Because the refcount temporarily acquired here may end up being the * last refcount on the page, any page allocation must be freeable by * folio_put(). */ /* * filemap_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * * Looks up the page cache entry at @mapping & @index. If it is a folio, * it is returned with an increased refcount. If it is a shadow entry * of a previously evicted folio, or a swap entry from shmem/tmpfs, * it is returned without further action. * * Return: The folio, swap or shadow entry, %NULL if nothing is found. */ void *filemap_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); repeat: xas_reset(&xas); folio = xas_load(&xas); if (xas_retry(&xas, folio)) goto repeat; /* * A shadow entry of a recently evicted page, or a swap entry from * shmem/tmpfs. Return it without attempting to raise page count. */ if (!folio || xa_is_value(folio)) goto out; if (!folio_try_get(folio)) goto repeat; if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); goto repeat; } out: rcu_read_unlock(); return folio; } /** * __filemap_get_folio - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. * * Looks up the page cache entry at @mapping & @index. * * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even * if the %GFP flags specified for %FGP_CREAT are atomic. * * If this function returns a folio, it is returned with an increased refcount. * * Return: The found folio or an ERR_PTR() otherwise. */ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; repeat: folio = filemap_get_entry(mapping, index); if (xa_is_value(folio)) folio = NULL; if (!folio) goto no_page; if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EAGAIN); } } else { folio_lock(folio); } /* Has the page been truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); } if (fgp_flags & FGP_ACCESSED) folio_mark_accessed(folio); else if (fgp_flags & FGP_WRITE) { /* Clear idle flag for buffer write */ if (folio_test_idle(folio)) folio_clear_idle(folio); } if (fgp_flags & FGP_STABLE) folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { unsigned int min_order = mapping_min_folio_order(mapping); unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); int err; index = mapping_align_index(mapping, index); if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; gfp |= GFP_NOWAIT | __GFP_NOWARN; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; if (order > mapping_max_folio_order(mapping)) order = mapping_max_folio_order(mapping); /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); do { gfp_t alloc_gfp = gfp; err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); if (!folio) continue; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) break; folio_put(folio); folio = NULL; } while (order-- > min_order); if (err == -EEXIST) goto repeat; if (err) return ERR_PTR(err); /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. */ if (folio && (fgp_flags & FGP_FOR_MMAP)) folio_unlock(folio); } if (!folio) return ERR_PTR(-ENOENT); return folio; } EXPORT_SYMBOL(__filemap_get_folio); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { struct folio *folio; retry: if (mark == XA_PRESENT) folio = xas_find(xas, max); else folio = xas_find_marked(xas, max, mark); if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. */ if (!folio || xa_is_value(folio)) return folio; if (!folio_try_get(folio)) goto reset; if (unlikely(folio != xas_reload(xas))) { folio_put(folio); goto reset; } return folio; reset: xas_reset(xas); goto retry; } /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in * the mapping. The entries are placed in @fbatch. find_get_entries() * takes a reference on any actual folios it returns. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries or large folios. * * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * * Return: The number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; } if (folio_batch_count(fbatch)) { unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); else nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); *start = round_down(indices[idx] + nr, nr); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * find_lock_entries - Find a batch of pagecache entries. * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. * Swap, shadow and DAX entries are included. Folios are returned * locked and with an incremented refcount. Folios which are locked * by somebody else or under writeback are skipped. Folios which are * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries, large folios, folios which could not be * locked or folios under writeback. * * Return: The number of entries which were found. */ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { unsigned long base; unsigned long nr; if (!xa_is_value(folio)) { nr = folio_nr_pages(folio); base = folio->index; /* Omit large folio which begins before the start */ if (base < *start) goto put; /* Omit large folio which extends beyond the end */ if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; if (folio->mapping != mapping || folio_test_writeback(folio)) goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); } else { nr = 1 << xas_get_order(&xas); base = xas.xa_index & ~(nr - 1); /* Omit order>0 value which begins before the start */ if (base < *start) continue; /* Omit order>0 value which extends beyond the end */ if (base + nr - 1 > end) break; } /* Update start now so that last update is correct on return */ *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; continue; unlock: folio_unlock(folio); put: folio_put(folio); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * filemap_get_folios - Get a batch of folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill. * * Search for and return a batch of folios in the mapping starting at * index @start and up to index @end (inclusive). The folios are returned * in @fbatch with an elevated reference count. * * Return: The number of folios which were found. * We also update @start to index the next folio for the traversal. */ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch); } EXPORT_SYMBOL(filemap_get_folios); /** * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill * * filemap_get_folios_contig() works exactly like filemap_get_folios(), * except the returned folios are guaranteed to be contiguous. This may * not return all contiguous folios if the batch gets filled up. * * Return: The number of folios found. * Also update @start to be positioned for traversal of the next folio. */ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); unsigned long nr; struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio && xas.xa_index <= end; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ if (xa_is_value(folio)) goto update_start; /* If we landed in the middle of a THP, continue at its end. */ if (xa_is_sibling(folio)) goto update_start; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } update_start: nr = folio_batch_count(fbatch); if (nr) { folio = fbatch->folios[nr - 1]; *start = folio_next_index(folio); } out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_contig); /** * filemap_get_folios_tag - Get a batch of folios matching @tag * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @tag: The tag index * @fbatch: The batch to fill * * The first folio may start before @start; if it does, it will contain * @start. The final folio may extend beyond @end; if it does, it will * contain @end. The folios have ascending indices. There may be gaps * between the folios if there are indices which have no folio in the * page cache. If folios are added to or removed from the page cache * while this is running, they may or may not be found by this call. * Only returns folios that are tagged with @tag. * * Return: The number of folios found. * Also update @start to index the next folio for traversal. */ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, tag)) != NULL) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ if (xa_is_value(folio)) continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This * breaks the iteration when there is a page at index -1 but that is * already broke anyway. */ if (end == (pgoff_t)-1) *start = (pgoff_t)-1; else *start = end + 1; out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_tag); /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: * * ---R__________________________________________B__________ * ^ reading here ^ bad block(assume 4k) * * read(R) => miss => readahead(R...B) => media error => frustrating retries * => failing the whole request => read(R) => read(R+1) => * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... * * It is going insane. Fix it by quickly scaling down the readahead size. */ static void shrink_readahead_size_eio(struct file_ra_state *ra) { ra->ra_pages /= 4; } /* * filemap_get_read_batch - Get a batch of folios for read * * Get a batch of folios which represent a contiguous range of bytes in * the file. No exceptional entries will be returned. If @index is in * the middle of a folio, the entire folio will be returned. The last * folio in the batch may have the readahead flag set or the uptodate flag * clear so that the caller can take the appropriate action. */ static void filemap_get_read_batch(struct address_space *mapping, pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; if (xas.xa_index > max || xa_is_value(folio)) break; if (xa_is_sibling(folio)) break; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) break; if (!folio_test_uptodate(folio)) break; if (folio_test_readahead(folio)) break; xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { bool workingset = folio_test_workingset(folio); unsigned long pflags; int error; /* Start the actual read. The read will unlock the page. */ if (unlikely(workingset)) psi_memstall_enter(&pflags); error = filler(file, folio); if (unlikely(workingset)) psi_memstall_leave(&pflags); if (error) return error; error = folio_wait_locked_killable(folio); if (error) return error; if (folio_test_uptodate(folio)) return 0; if (file) shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, loff_t pos, size_t count, struct folio *folio, bool need_uptodate) { if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (need_uptodate) return false; if (!mapping->a_ops->is_partially_uptodate) return false; if (mapping->host->i_blkbits >= folio_shift(folio)) return false; if (folio_pos(folio) > pos) { count -= folio_pos(folio) - pos; pos = 0; } else { pos -= folio_pos(folio); } return mapping->a_ops->is_partially_uptodate(folio, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, size_t count, struct folio *folio, bool need_uptodate) { int error; if (iocb->ki_flags & IOCB_NOWAIT) { if (!filemap_invalidate_trylock_shared(mapping)) return -EAGAIN; } else { filemap_invalidate_lock_shared(mapping); } if (!folio_trylock(folio)) { error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); /* * This is where we usually end up waiting for a * previously submitted readahead to finish. */ folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); if (error) goto unlock_mapping; } error = AOP_TRUNCATED_PAGE; if (!folio->mapping) goto unlock; error = 0; if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, need_uptodate)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); goto unlock_mapping; unlock: folio_unlock(folio); unlock_mapping: filemap_invalidate_unlock_shared(mapping); if (error == AOP_TRUNCATED_PAGE) folio_put(folio); return error; } static int filemap_create_folio(struct file *file, struct address_space *mapping, loff_t pos, struct folio_batch *fbatch) { struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; /* * Protect against truncate / hole punch. Grabbing invalidate_lock * here assures we cannot instantiate and bring uptodate new * pagecache folios after evicting page cache during truncate * and before actually freeing blocks. Note that we could * release invalidate_lock after inserting the folio into * the page cache as the locked folio would then be enough to * synchronize with hole punching. But there are code paths * such as filemap_update_page() filling in partially uptodate * pages or ->readahead() that need to hold invalidate_lock * while mapping blocks for IO so let's hold the lock here as * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); index = (pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, struct address_space *mapping, struct folio *folio, pgoff_t last_index) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, size_t count, struct folio_batch *fbatch, bool need_uptodate) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; unsigned int flags; int err = 0; /* "last_index" is the index of the page beyond the end of the read */ last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); retry: if (fatal_signal_pending(current)) return -EINTR; filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } folio = fbatch->folios[folio_batch_count(fbatch) - 1]; if (folio_test_readahead(folio)) { err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } if (!folio_test_uptodate(folio)) { if ((iocb->ki_flags & IOCB_WAITQ) && folio_batch_count(fbatch) > 1) iocb->ki_flags |= IOCB_NOWAIT; err = filemap_update_page(iocb, mapping, count, folio, need_uptodate); if (err) goto err; } trace_mm_filemap_get_pages(mapping, index, last_index - 1); return 0; err: if (err < 0) folio_put(folio); if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) { unsigned int shift = folio_shift(folio); return (pos1 >> shift == pos2 >> shift); } /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. * @iter: Destination for the data. * @already_read: Number of bytes already read by the caller. * * Copies data from the page cache. If the data is not currently present, * uses the readahead and read_folio address_space operations to fetch it. * * Return: Total number of bytes copied, including those already read by * the caller. If an error happens before any bytes are copied, returns * a negative error number. */ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; if (unlikely(iocb->ki_pos < 0)) return -EINVAL; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { cond_resched(); /* * If we've already successfully copied some data, then we * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. */ if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; error = filemap_get_pages(iocb, iter->count, &fbatch, false); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(mapping); /* * When a read accesses the same folio several times, only * mark it as accessed the first time. */ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, fbatch.folios[0])) folio_mark_accessed(fbatch.folios[0]); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); size_t copied; if (end_offset < folio_pos(folio)) break; if (i > 0) folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; last_pos = iocb->ki_pos; if (copied < bytes) { error = -EFAULT; break; } } put_folios: for (i = 0; i < folio_batch_count(&fbatch); i++) folio_put(fbatch.folios[i]); folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); ra->prev_pos = last_pos; return already_read ? already_read : error; } EXPORT_SYMBOL_GPL(filemap_read); int kiocb_write_and_wait(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos; loff_t end = pos + count - 1; if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, pos, end)) return -EAGAIN; return 0; } return filemap_write_and_wait_range(mapping, pos, end); } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); int filemap_invalidate_pages(struct address_space *mapping, loff_t pos, loff_t end, bool nowait) { int ret; if (nowait) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; } else { ret = filemap_write_and_wait_range(mapping, pos, end); if (ret) return ret; } /* * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return * without clobbering -EIOCBQUEUED from ->direct_IO(). */ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; return filemap_invalidate_pages(mapping, iocb->ki_pos, iocb->ki_pos + count - 1, iocb->ki_flags & IOCB_NOWAIT); } EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block * @iter: destination for the data read * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. * * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall * be returned when no data can be read without waiting for I/O requests * to complete; it doesn't prevent readahead. * * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O * requests shall be made for the read or for readahead. When no data * can be read, -EAGAIN shall be returned. When readahead would be * triggered, a partial, possibly empty read shall be returned. * * Return: * * number of bytes copied, even for partial reads * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { size_t count = iov_iter_count(iter); ssize_t retval = 0; if (!count) return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; retval = kiocb_write_and_wait(iocb, count); if (retval < 0) return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); if (retval >= 0) { iocb->ki_pos += retval; count -= retval; } if (retval != -EIOCBQUEUED) iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter * compressed extents, so if there was an error, or if * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ if (retval < 0 || !count || IS_DAX(inode)) return retval; if (iocb->ki_pos >= i_size_read(inode)) return retval; } return filemap_read(iocb, iter, retval); } EXPORT_SYMBOL(generic_file_read_iter); /* * Splice subpages from a folio into a pipe. */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size) { struct page *page; size_t spliced = 0, offset = offset_in_folio(folio, fpos); page = folio_page(folio, offset / PAGE_SIZE); size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; while (spliced < size && !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); *buf = (struct pipe_buffer) { .ops = &page_cache_pipe_buf_ops, .page = page, .offset = offset, .len = part, }; folio_get(folio); pipe->head++; page++; spliced += part; offset = 0; } return spliced; } /** * filemap_splice_read - Splice data from a file's pagecache into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function gets folios from a file's pagecache and splices them into the * pipe. Readahead will be called as necessary to fill more folios. This may * be used for blockdevs also. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct folio_batch fbatch; struct kiocb iocb; size_t total_spliced = 0, used, npages; loff_t isize, end_offset; bool writably_mapped; int i, error = 0; if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes)) return 0; init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ used = pipe_occupancy(pipe->head, pipe->tail); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); folio_batch_init(&fbatch); do { cond_resched(); if (*ppos >= i_size_read(in->f_mapping->host)) break; iocb.ki_pos = *ppos; error = filemap_get_pages(&iocb, len, &fbatch, true); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) break; end_offset = min_t(loff_t, isize, *ppos + len); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(in->f_mapping); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t n; if (folio_pos(folio) >= end_offset) goto out; folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); n = min_t(loff_t, len, isize - *ppos); n = splice_folio_into_pipe(pipe, folio, *ppos, n); if (!n) goto out; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) goto out; } folio_batch_release(&fbatch); } while (len); out: folio_batch_release(&fbatch); file_accessed(in); return total_spliced ? total_spliced : error; } EXPORT_SYMBOL(filemap_splice_read); static inline loff_t folio_seek_hole_data(struct xa_state *xas, struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? end : start; xas_pause(xas); rcu_read_unlock(); folio_lock(folio); if (unlikely(folio->mapping != mapping)) goto unlock; offset = offset_in_folio(folio, start) & ~(bsz - 1); do { if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; start = (start + bsz) & ~(bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: folio_unlock(folio); rcu_read_lock(); return start; } static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) return PAGE_SIZE << xas_get_order(xas); return folio_size(folio); } /** * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. * @mapping: Address space to search. * @start: First byte to consider. * @end: Limit of search (exclusive). * @whence: Either SEEK_HOLE or SEEK_DATA. * * If the page cache knows which blocks contain holes and which blocks * contain data, your filesystem can use this function to implement * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are * entirely memory-based such as tmpfs, and filesystems which support * unwritten extents. * * Return: The requested offset on success, or -ENXIO if @whence specifies * SEEK_DATA and there is no data after @start. There is an implicit hole * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start * and @end contain data. */ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, loff_t end, int whence) { XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; size_t seek_size; if (start < pos) { if (!seek_data) goto unlock; start = pos; } seek_size = seek_folio_size(&xas, folio); pos = round_up((u64)pos + 1, seek_size); start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; if (start >= end) break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); if (!xa_is_value(folio)) folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); if (folio && !xa_is_value(folio)) folio_put(folio); if (start > end) return end; return start; } #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * * This works similar to lock_folio_or_retry in that it can drop the * mmap_lock. It differs in that it actually returns the folio locked * if it returns 1 and 0 if it couldn't lock the folio. If we did have * to drop the mmap_lock then fpin will point to the pinned file and * needs to be fput()'ed at a later point. */ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { if (folio_trylock(folio)) return 1; /* * NOTE! This will make us return with VM_FAULT_RETRY, but with * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) return 0; *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* * We didn't have the right flags to drop the * fault lock, but all fault_handlers only check * for fatal signals if we return VM_FAULT_RETRY, * so we need to drop the fault lock here and * return 0 if we don't have a fpin. */ if (*fpin == NULL) release_fault_lock(vmf); return 0; } } else __folio_lock(folio); return 1; } /* * Synchronous readahead happens when we don't even find a page in the page * cache at all. We don't want to perform IO under the mmap sem, so if we have * to drop the mmap sem we return the file that was pinned in order for us to do * that. If we didn't pin a file then we return NULL. The file that is * returned needs to be fput()'ed when we're done with it. */ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; unsigned long vm_flags = vmf->vma->vm_flags; unsigned int mmap_miss; #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; /* * Fetch two PMD folios, so we get the chance to actually * readahead, unless we've been told not to. */ if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); return fpin; } #endif /* If we don't want any read-ahead, don't bother */ if (vm_flags & VM_RAND_READ) return fpin; if (!ra->ra_pages) return fpin; if (vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; } /* Avoid banging the cache line if not needed */ mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss < MMAP_LOTSAMISS * 10) WRITE_ONCE(ra->mmap_miss, ++mmap_miss); /* * Do we miss much more than hit in this file? If so, * stop bothering with read-ahead. It will only hurt. */ if (mmap_miss > MMAP_LOTSAMISS) return fpin; /* * mmap read-around */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ractl._index = ra->start; page_cache_ra_order(&ractl, ra, 0); return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, * so we want to possibly extend the readahead further. We return the file that * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; pte_t *ptep; /* * We might have COW'ed a pagecache folio and might now have an mlocked * anon folio mapped. The original pagecache folio is not mlocked and * might have been evicted. During a read+clear/modify/write update of * the PTE, such as done in do_numa_page()/change_pte_range(), we * temporarily clear the PTE under PT lock and might detect it here as * "none" when not holding the PT lock. * * Not rechecking the PTE under PT lock could result in an unexpected * major fault in an mlock'ed region. Recheck only for this special * scenario while holding the PT lock, to not degrade non-mlocked * scenarios. Recheck the PTE without PT lock firstly, thereby reducing * the number of times we hold PT lock. */ if (!(vma->vm_flags & VM_LOCKED)) return 0; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { ret = VM_FAULT_NOPAGE; } else { spin_lock(vmf->ptl); if (unlikely(!pte_none(ptep_get(ptep)))) ret = VM_FAULT_NOPAGE; spin_unlock(vmf->ptl); } pte_unmap(ptep); return ret; } /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. * * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. * * Return: bitwise-OR of %VM_FAULT_ codes. */ vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; pgoff_t max_idx, index = vmf->pgoff; struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; trace_mm_filemap_fault(mapping, index); /* * Do we have something in the page cache already? */ folio = filemap_get_folio(mapping, index); if (likely(!IS_ERR(folio))) { /* * We found the page, so try async readahead before waiting for * the lock. */ if (!(vmf->flags & FAULT_FLAG_TRIED)) fpin = do_async_mmap_readahead(vmf, folio); if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } } else { ret = filemap_fault_recheck_pte_none(vmf); if (unlikely(ret)) return ret; /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; fpin = do_sync_mmap_readahead(vmf); retry_find: /* * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); if (IS_ERR(folio)) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_OOM; } } if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto retry_find; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked folio in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error, * or because readahead was otherwise unable to retrieve it. */ if (unlikely(!folio_test_uptodate(folio))) { /* * If the invalidate lock is not held, the folio was in cache * and uptodate and now it is not. Strange but possible since we * didn't hold the page lock all the time. Let's drop * everything, get the invalidate lock and try again. */ if (!mapping_locked) { folio_unlock(folio); folio_put(folio); goto retry_find; } /* * OK, the folio is really not uptodate. This can be because the * VMA has the VM_RAND_READ flag set, or because an error * arose. Let's read it in directly. */ goto page_not_uptodate; } /* * We've made it this far and we had to drop our mmap_lock, now is the * time to return to the upper layer and have it re-find the vma and * redo the fault. */ if (fpin) { folio_unlock(folio); goto out_retry; } if (mapping_locked) filemap_invalidate_unlock_shared(mapping); /* * Found the page and have a reference on it. * We must recheck i_size under page lock. */ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) { folio_unlock(folio); folio_put(folio); return VM_FAULT_SIGBUS; } vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, * because there really aren't any performance issues here * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (fpin) goto out_retry; folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_SIGBUS; out_retry: /* * We dropped the mmap_lock, we need to return to the fault handler to * re-find the vma and come back and find our hopefully still populated * page. */ if (!IS_ERR(folio)) folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) fput(fpin); return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, pgoff_t start) { struct mm_struct *mm = vmf->vma->vm_mm; /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) { folio_unlock(folio); folio_put(folio); return true; } if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); vm_fault_t ret = do_set_pmd(vmf, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); return true; } } if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); return false; } static struct folio *next_uptodate_folio(struct xa_state *xas, struct address_space *mapping, pgoff_t end_pgoff) { struct folio *folio = xas_next_entry(xas, end_pgoff); unsigned long max_idx; do { if (!folio) return NULL; if (xas_retry(xas, folio)) continue; if (xa_is_value(folio)) continue; if (!folio_try_get(folio)) continue; if (folio_test_locked(folio)) goto skip; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; if (!folio_trylock(folio)) goto skip; if (folio->mapping != mapping) goto unlock; if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; return folio; unlock: folio_unlock(folio); skip: folio_put(folio); } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } /* * Map page range [start_page, start_page + nr_pages) of folio. * start_page is gotten from start by folio_page(folio, start) */ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = folio_page(folio, start); unsigned int count = 0; pte_t *old_ptep = vmf->pte; do { if (PageHWPoison(page + count)) goto skip; /* * If there are too many folios that are recently evicted * in a file, they will probably continue to be evicted. * In such situation, read-ahead is only a waste of IO. * Don't decrease mmap_miss in this scenario to make sure * we can stop read-ahead. */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; continue; skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } count++; page += count; vmf->pte += count; addr += count * PAGE_SIZE; count = 0; } while (--nr_pages > 0); if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } vmf->pte = old_ptep; return ret; } static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; if (PageHWPoison(page)) return ret; /* See comment of filemap_map_folio_range() */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit * the fault-around logic. */ if (!pte_none(ptep_get(vmf->pte))) return ret; if (vmf->address == addr) ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); (*rss)++; folio_ref_inc(folio); return ret; } vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t file_end, last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; unsigned long rss = 0; unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; if (filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) { folio_unlock(folio); folio_put(folio); goto out; } file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; if (end_pgoff > file_end) end_pgoff = file_end; folio_type = mm_counter_file(folio); do { unsigned long end; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; if (!folio_test_large(folio)) ret |= filemap_map_order0_folio(vmf, folio, addr, &rss, &mmap_miss); else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, nr_pages, &rss, &mmap_miss); folio_unlock(folio); folio_put(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); out: rcu_read_unlock(); mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); if (mmap_miss >= mmap_miss_saved) WRITE_ONCE(file->f_ra.mmap_miss, 0); else WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); return ret; } EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); folio_lock(folio); if (folio->mapping != mapping) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will * see the dirty folio and writeprotect it again. */ folio_mark_dirty(folio); folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; } const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, }; /* This is used for a general mmap of a disk file */ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; return 0; } /* * This is for filesystems which do not implement ->writepage. */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } #else vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } #endif /* CONFIG_MMU */ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); static struct folio *do_read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) { struct folio *folio; int err; if (!filler) filler = mapping->a_ops->read_folio; repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping)); if (!folio) return ERR_PTR(-ENOMEM); index = mapping_align_index(mapping, index); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } goto filler; } if (folio_test_uptodate(folio)) goto out; if (!folio_trylock(folio)) { folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); goto repeat; } /* Folio was truncated from mapping */ if (!folio->mapping) { folio_unlock(folio); folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ if (folio_test_uptodate(folio)) { folio_unlock(folio); goto out; } filler: err = filemap_read_folio(file, filler, folio); if (err) { folio_put(folio); if (err == AOP_TRUNCATED_PAGE) goto repeat; return ERR_PTR(err); } out: folio_mark_accessed(folio); return folio; } /** * read_cache_folio - Read into page cache, fill it if needed. * @mapping: The address_space to read from. * @index: The index to read. * @filler: Function to perform the read, or NULL to use aops->read_folio(). * @file: Passed to filler function, may be NULL if not required. * * Read one page into the page cache. If it succeeds, the folio returned * will contain @index, but it may not be the first page of the folio. * * If the filler function returns an error, it will be returned to the * caller. * * Context: May sleep. Expects mapping->invalidate_lock to be held. * Return: An uptodate folio on success, ERR_PTR() on failure. */ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file) { return do_read_cache_folio(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_folio); /** * mapping_read_folio_gfp - Read into page cache, using specified allocation flags. * @mapping: The address_space for the folio. * @index: The index that the allocated folio will contain. * @gfp: The page allocator flags to use if allocating. * * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with * any new memory allocations done using the specified allocation flags. * * The most likely error from this function is EIO, but ENOMEM is * possible and so is EINTR. If ->read_folio returns another error, * that will be returned to the caller. * * The function expects mapping->invalidate_lock to be already held. * * Return: Uptodate folio on success, ERR_PTR() on failure. */ struct folio *mapping_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_folio(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(mapping_read_folio_gfp); static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) { struct folio *folio; folio = do_read_cache_folio(mapping, index, filler, file, gfp); if (IS_ERR(folio)) return &folio->page; return folio_file_page(folio, index); } struct page *read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file) { return do_read_cache_page(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. * @mapping: the page's address_space * @index: the page index * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. * * The function expects mapping->invalidate_lock to be already held. * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_page(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; char *path; errseq_set(&filp->f_mapping->wb_err, -EIO); if (__ratelimit(&_rs)) { path = file_path(filp, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, current->comm); } } void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; if (mapping->nrpages && invalidate_inode_pages2_range(mapping, iocb->ki_pos >> PAGE_SHIFT, (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) dio_warn_stale_pagecache(iocb->ki_filp); } ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct address_space *mapping = iocb->ki_filp->f_mapping; size_t write_len = iov_iter_count(from); ssize_t written; /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; return written; } written = mapping->a_ops->direct_IO(iocb, from); /* * Finally, try again to invalidate clean pages which might have been * cached by non-direct readahead, or faulted in by get_user_pages() * if the source of the write was an mmap'ed region of the file * we're writing. Either one is a pretty crazy thing to do, * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... * * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break * them by removing it completely. * * Noticeable example is a blkdev_direct_IO(). * * Skip invalidation for async writes or if mapping has no pages. */ if (written > 0) { struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos; kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } iocb->ki_pos = pos; } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); return written; } EXPORT_SYMBOL(generic_file_direct_write); ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; size_t chunk = mapping_max_folio_size(mapping); long status = 0; ssize_t written = 0; do { struct folio *folio; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ void *fsdata = NULL; bytes = iov_iter_count(i); retry: offset = pos & (chunk - 1); bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. */ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; break; } if (fatal_signal_pending(current)) { status = -EINTR; break; } status = a_ops->write_begin(file, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); status = a_ops->write_end(file, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) break; } cond_resched(); if (unlikely(status == 0)) { /* * A short copy made ->write_end() reject the * thing entirely. Might be memory poisoning * halfway through, might be a race with munmap, * might be severe memory pressure. */ if (chunk > PAGE_SIZE) chunk /= 2; if (copied) { bytes = copied; goto retry; } } else { pos += status; written += status; } } while (iov_iter_count(i)); if (!written) return status; iocb->ki_pos += written; return written; } EXPORT_SYMBOL(generic_perform_write); /** * __generic_file_write_iter - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates * modification times and calls proper subroutines depending on whether we * do direct IO or a standard buffered write. * * It expects i_rwsem to be grabbed unless we work on a block device or similar * object which does not need locking at all. * * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_rwsem. * * Return: * * number of bytes written, even for truncated writes * * negative error code if no data has been written at all */ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; ret = file_remove_privs(file); if (ret) return ret; ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to * holes, for example. For DAX files, a buffered write will * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) return ret; return direct_write_fallback(iocb, from, ret, generic_perform_write(iocb, from)); } return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); /** * generic_file_write_iter - write data to a file * @iocb: IO state structure * @from: iov_iter with data to write * * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_rwsem as needed. * Return: * * negative error code if no data has been written at all of * vfs_fsync_range() failed for a synchronous write * * number of bytes written, even for truncated writes */ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } EXPORT_SYMBOL(generic_file_write_iter); /** * filemap_release_folio() - Release fs-specific metadata on a folio. * @folio: The folio which the kernel is trying to free. * @gfp: Memory allocation flags (and I/O mode). * * The address_space is trying to release any data attached to a folio * (presumably at folio->private). * * This will also be called if the private_2 flag is set on a page, * indicating that the folio has other metadata associated with it. * * The @gfp argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block * (__GFP_RECLAIM & __GFP_FS). * * Return: %true if the release was successful, otherwise %false. */ bool filemap_release_folio(struct folio *folio, gfp_t gfp) { struct address_space * const mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); if (!folio_needs_release(folio)) return true; if (folio_test_writeback(folio)) return false; if (mapping && mapping->a_ops->release_folio) return mapping->a_ops->release_folio(folio, gfp); return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); /** * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache * @inode: The inode to flush * @flush: Set to write back rather than simply invalidate. * @start: First byte to in range. * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start * onwards. * * Invalidate all the folios on an inode that contribute to the specified * range, possibly writing them back first. Whilst the operation is * undertaken, the invalidate lock is held to prevent new folios from being * installed. */ int filemap_invalidate_inode(struct inode *inode, bool flush, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; pgoff_t first = start >> PAGE_SHIFT; pgoff_t last = end >> PAGE_SHIFT; pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; if (!mapping || !mapping->nrpages || end < start) goto out; /* Prevent new folios from being added to the inode. */ filemap_invalidate_lock(mapping); if (!mapping->nrpages) goto unlock; unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ if (flush) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; filemap_fdatawrite_wbc(mapping, &wbc); } /* Wait for writeback to complete on all folios and discard. */ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); unlock: filemap_invalidate_unlock(mapping); out: return filemap_check_errors(mapping); } EXPORT_SYMBOL_GPL(filemap_invalidate_inode); #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping * @mapping: The mapping to compute the statistics for. * @first_index: The starting page cache index. * @last_index: The final page index (inclusive). * @cs: the cachestat struct to write the result to. * * This will query the page cache statistics of a mapping in the * page range of [first_index, last_index] (inclusive). The statistics * queried include: number of dirty pages, number of pages marked for * writeback, and the number of (recently) evicted pages. */ static void filemap_cachestat(struct address_space *mapping, pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) { XA_STATE(xas, &mapping->i_pages, first_index); struct folio *folio; /* Flush stats (and potentially sleep) outside the RCU read section. */ mem_cgroup_flush_stats_ratelimited(NULL); rcu_read_lock(); xas_for_each(&xas, folio, last_index) { int order; unsigned long nr_pages; pgoff_t folio_first_index, folio_last_index; /* * Don't deref the folio. It is not pinned, and might * get freed (and reused) underneath us. * * We *could* pin it, but that would be expensive for * what should be a fast and lightweight syscall. * * Instead, derive all information of interest from * the rcu-protected xarray. */ if (xas_retry(&xas, folio)) continue; order = xas_get_order(&xas); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; /* Folios might straddle the range boundaries, only count covered pages */ if (folio_first_index < first_index) nr_pages -= first_index - folio_first_index; if (folio_last_index > last_index) nr_pages -= folio_last_index - last_index; if (xa_is_value(folio)) { /* page is evicted */ void *shadow = (void *)folio; bool workingset; /* not used */ cs->nr_evicted += nr_pages; #ifdef CONFIG_SWAP /* implies CONFIG_MMU */ if (shmem_mapping(mapping)) { /* shmem file - in swap cache */ swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ if (non_swap_entry(swp)) goto resched; /* * Getting a swap entry from the shmem * inode means we beat * shmem_unuse(). rcu_read_lock() * ensures swapoff waits for us before * freeing the swapper space. However, * we can race with swapping and * invalidation, so there might not be * a shadow in the swapcache (yet). */ shadow = get_shadow_from_swap_cache(swp); if (!shadow) goto resched; } #endif if (workingset_test_recent(shadow, true, &workingset, false)) cs->nr_recently_evicted += nr_pages; goto resched; } /* page is in cache */ cs->nr_cache += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY)) cs->nr_dirty += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK)) cs->nr_writeback += nr_pages; resched: if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); } /* * The cachestat(2) system call. * * cachestat() returns the page cache statistics of a file in the * bytes range specified by `off` and `len`: number of cached pages, * number of dirty pages, number of pages marked for writeback, * number of evicted pages, and number of recently evicted pages. * * An evicted page is a page that is previously in the page cache * but has been evicted since. A page is recently evicted if its last * eviction was recent enough that its reentry to the cache would * indicate that it is actively being used by the system, and that * there is memory pressure on the system. * * `off` and `len` must be non-negative integers. If `len` > 0, * the queried range is [`off`, `off` + `len`]. If `len` == 0, * we will query in the range from `off` to the end of the file. * * The `flags` argument is unused for now, but is included for future * extensibility. User should pass 0 (i.e no flag specified). * * Currently, hugetlbfs is not supported. * * Because the status of a page can change after cachestat() checks it * but before it returns to the application, the returned values may * contain stale information. * * return values: * zero - success * -EFAULT - cstat or cstat_range points to an illegal address * -EINVAL - invalid flags * -EBADF - invalid file descriptor * -EOPNOTSUPP - file descriptor is of a hugetlbfs file */ SYSCALL_DEFINE4(cachestat, unsigned int, fd, struct cachestat_range __user *, cstat_range, struct cachestat __user *, cstat, unsigned int, flags) { CLASS(fd, f)(fd); struct address_space *mapping; struct cachestat_range csr; struct cachestat cs; pgoff_t first_index, last_index; if (fd_empty(f)) return -EBADF; if (copy_from_user(&csr, cstat_range, sizeof(struct cachestat_range))) return -EFAULT; /* hugetlbfs is not supported */ if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; if (flags != 0) return -EINVAL; first_index = csr.off >> PAGE_SHIFT; last_index = csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; memset(&cs, 0, sizeof(struct cachestat)); mapping = fd_file(f)->f_mapping; filemap_cachestat(mapping, first_index, last_index, &cs); if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) return -EFAULT; return 0; } #endif /* CONFIG_CACHESTAT_SYSCALL */ |
1 1 1 1 1 1 1 1 1 1 4 4 4 24 23 23 23 4 19 1 1 16 1 1 1 1 1 1 1 1 1 20 1 19 2 2 1 1 1 1 1 1 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_prio.c Simple 3-band priority "scheduler". * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: * Init -- EINVAL when opt undefined */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct prio_sched_data { int bands; struct tcf_proto __rcu *filter_list; struct tcf_block *block; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; }; static struct Qdisc * prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct prio_sched_data *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; struct tcf_proto *fl; int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif if (!fl || err < 0) { if (TC_H_MAJ(band)) band = 0; return q->queues[q->prio2band[band & TC_PRIO_MAX]]; } band = res.classid; } band = TC_H_MIN(band) - 1; if (band >= q->bands) return q->queues[q->prio2band[0]]; return q->queues[band]; } static int prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct Qdisc *qdisc; int ret; qdisc = prio_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } #endif ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } static struct sk_buff *prio_peek(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < q->bands; prio++) { struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc->ops->peek(qdisc); if (skb) return skb; } return NULL; } static struct sk_buff *prio_dequeue(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < q->bands; prio++) { struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc_dequeue_peeked(qdisc); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } } return NULL; } static void prio_reset(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); for (prio = 0; prio < q->bands; prio++) qdisc_reset(q->queues[prio]); } static int prio_offload(struct Qdisc *sch, struct tc_prio_qopt *qopt) { struct net_device *dev = qdisc_dev(sch); struct tc_prio_qopt_offload opt = { .handle = sch->handle, .parent = sch->parent, }; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; if (qopt) { opt.command = TC_PRIO_REPLACE; opt.replace_params.bands = qopt->bands; memcpy(&opt.replace_params.priomap, qopt->priomap, TC_PRIO_MAX + 1); opt.replace_params.qstats = &sch->qstats; } else { opt.command = TC_PRIO_DESTROY; } return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, &opt); } static void prio_destroy(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); prio_offload(sch, NULL); for (prio = 0; prio < q->bands; prio++) qdisc_put(q->queues[prio]); } static int prio_tune(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); struct Qdisc *queues[TCQ_PRIO_BANDS]; int oldbands = q->bands, i; struct tc_prio_qopt *qopt; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < TCQ_MIN_PRIO_BANDS) return -EINVAL; for (i = 0; i <= TC_PRIO_MAX; i++) { if (qopt->priomap[i] >= qopt->bands) return -EINVAL; } /* Before commit, make sure we can allocate all new qdiscs */ for (i = oldbands; i < qopt->bands; i++) { queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1), extack); if (!queues[i]) { while (i > oldbands) qdisc_put(queues[--i]); return -ENOMEM; } } prio_offload(sch, qopt); sch_tree_lock(sch); q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); for (i = q->bands; i < oldbands; i++) qdisc_tree_flush_backlog(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; if (q->queues[i] != &noop_qdisc) qdisc_hash_add(q->queues[i], true); } sch_tree_unlock(sch); for (i = q->bands; i < oldbands; i++) qdisc_put(q->queues[i]); return 0; } static int prio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); int err; if (!opt) return -EINVAL; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; return prio_tune(sch, opt, extack); } static int prio_dump_offload(struct Qdisc *sch) { struct tc_prio_qopt_offload hw_stats = { .command = TC_PRIO_STATS, .handle = sch->handle, .parent = sch->parent, { .stats = { .bstats = &sch->bstats, .qstats = &sch->qstats, }, }, }; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats); } static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct prio_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_prio_qopt opt; int err; opt.bands = q->bands; memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); err = prio_dump_offload(sch); if (err) goto nla_put_failure; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); struct tc_prio_qopt_offload graft_offload; unsigned long band = arg - 1; if (!new) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, arg), extack); if (!new) new = &noop_qdisc; else qdisc_hash_add(new, true); } *old = qdisc_replace(sch, new, &q->queues[band]); graft_offload.handle = sch->handle; graft_offload.parent = sch->parent; graft_offload.graft_params.band = band; graft_offload.graft_params.child_handle = new->handle; graft_offload.command = TC_PRIO_GRAFT; qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, TC_SETUP_QDISC_PRIO, &graft_offload, extack); return 0; } static struct Qdisc * prio_leaf(struct Qdisc *sch, unsigned long arg) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; return q->queues[band]; } static unsigned long prio_find(struct Qdisc *sch, u32 classid) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); if (band - 1 >= q->bands) return 0; return band; } static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return prio_find(sch, classid); } static void prio_unbind(struct Qdisc *q, unsigned long cl) { } static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct prio_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = q->queues[cl-1]->handle; return 0; } static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct prio_sched_data *q = qdisc_priv(sch); struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; } static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct prio_sched_data *q = qdisc_priv(sch); int prio; if (arg->stop) return; for (prio = 0; prio < q->bands; prio++) { if (!tc_qdisc_stats_dump(sch, prio + 1, arg)) break; } } static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static const struct Qdisc_class_ops prio_class_ops = { .graft = prio_graft, .leaf = prio_leaf, .find = prio_find, .walk = prio_walk, .tcf_block = prio_tcf_block, .bind_tcf = prio_bind, .unbind_tcf = prio_unbind, .dump = prio_dump_class, .dump_stats = prio_dump_class_stats, }; static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &prio_class_ops, .id = "prio", .priv_size = sizeof(struct prio_sched_data), .enqueue = prio_enqueue, .dequeue = prio_dequeue, .peek = prio_peek, .init = prio_init, .reset = prio_reset, .destroy = prio_destroy, .change = prio_tune, .dump = prio_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("prio"); static int __init prio_module_init(void) { return register_qdisc(&prio_qdisc_ops); } static void __exit prio_module_exit(void) { unregister_qdisc(&prio_qdisc_ops); } module_init(prio_module_init) module_exit(prio_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Simple 3-band priority qdisc"); |
1 4 2 1 1 2 2 2 5 5 1 4 5 5 11 1 1 2 2 1 10 8 1 1 7 2 5 7 3 1 1 1 3 1 2 2 2 1 1 1 2 2 2 10 1 1 1 7 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 2 1 2 1 7 5 2 2 1 22 1 21 9 1 1 1 1 1 3 2 2 2 876 852 46 23 3 1 1 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 | // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN * * This implementation does not provide ISO-TP specific return values to the * userspace. * * - RX path timeout of data reception leads to -ETIMEDOUT * - RX path SN mismatch leads to -EILSEQ * - RX path data reception with wrong padding leads to -EBADMSG * - TX path flowcontrol reception timeout leads to -ECOMM * - TX path flowcontrol reception overflow leads to -EMSGSIZE * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG * - when a transfer (tx) is on the run the next write() blocks until it's done * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent * - as we have static buffers the check whether the PDU fits into the buffer * is done at FF reception time (no support for sending 'wait frames') * * Copyright (c) 2020 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/wait.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/isotp.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> MODULE_DESCRIPTION("PF_CAN ISO 15765-2 transport protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS("can-proto-6"); #define ISOTP_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp) #define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) /* Since ISO 15765-2:2016 the CAN isotp protocol supports more than 4095 * byte per ISO PDU as the FF_DL can take full 32 bit values (4 Gbyte). * We would need some good concept to handle this between user space and * kernel space. For now set the static buffer to something about 8 kbyte * to be able to test this new functionality. */ #define DEFAULT_MAX_PDU_SIZE 8300 /* maximum PDU size before ISO 15765-2:2016 extension was 4095 */ #define MAX_12BIT_PDU_SIZE 4095 /* limit the isotp pdu size from the optional module parameter to 1MByte */ #define MAX_PDU_SIZE (1025 * 1024U) static unsigned int max_pdu_size __read_mostly = DEFAULT_MAX_PDU_SIZE; module_param(max_pdu_size, uint, 0444); MODULE_PARM_DESC(max_pdu_size, "maximum isotp pdu size (default " __stringify(DEFAULT_MAX_PDU_SIZE) ")"); /* N_PCI type values in bits 7-4 of N_PCI bytes */ #define N_PCI_SF 0x00 /* single frame */ #define N_PCI_FF 0x10 /* first frame */ #define N_PCI_CF 0x20 /* consecutive frame */ #define N_PCI_FC 0x30 /* flow control */ #define N_PCI_SZ 1 /* size of the PCI byte #1 */ #define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */ #define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */ #define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */ #define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */ #define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ #define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) #define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST) /* Flow Status given in FC frame */ #define ISOTP_FC_CTS 0 /* clear to send */ #define ISOTP_FC_WT 1 /* wait */ #define ISOTP_FC_OVFLW 2 /* overflow */ #define ISOTP_FC_TIMEOUT 1 /* 1 sec */ #define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */ enum { ISOTP_IDLE = 0, ISOTP_WAIT_FIRST_FC, ISOTP_WAIT_FC, ISOTP_WAIT_DATA, ISOTP_SENDING, ISOTP_SHUTDOWN, }; struct tpcon { u8 *buf; unsigned int buflen; unsigned int len; unsigned int idx; u32 state; u8 bs; u8 sn; u8 ll_dl; u8 sbuf[DEFAULT_MAX_PDU_SIZE]; }; struct isotp_sock { struct sock sk; int bound; int ifindex; canid_t txid; canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; u32 cfecho; /* consecutive frame echo tag */ struct tpcon rx, tx; struct list_head notifier; wait_queue_head_t wait; spinlock_t rx_lock; /* protect single thread state machine */ }; static LIST_HEAD(isotp_notifier_list); static DEFINE_SPINLOCK(isotp_notifier_lock); static struct isotp_sock *isotp_busy_notifier; static inline struct isotp_sock *isotp_sk(const struct sock *sk) { return (struct isotp_sock *)sk; } static u32 isotp_bc_flags(struct isotp_sock *so) { return so->opt.flags & ISOTP_ALL_BC_FLAGS; } static bool isotp_register_rxid(struct isotp_sock *so) { /* no broadcast modes => register rx_id for FC frame reception */ return (isotp_bc_flags(so) == 0); } static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, rxtimer); struct sock *sk = &so->sk; if (so->rx.state == ISOTP_WAIT_DATA) { /* we did not get new data frames in time */ /* report 'connection timed out' */ sk->sk_err = ETIMEDOUT; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; } return HRTIMER_NORESTART; } static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) { struct net_device *dev; struct sk_buff *nskb; struct canfd_frame *ncf; struct isotp_sock *so = isotp_sk(sk); int can_send_ret; nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); if (!nskb) return 1; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { kfree_skb(nskb); return 1; } can_skb_reserve(nskb); can_skb_prv(nskb)->ifindex = dev->ifindex; can_skb_prv(nskb)->skbcnt = 0; nskb->dev = dev; can_skb_set_owner(nskb, sk); ncf = (struct canfd_frame *)nskb->data; skb_put_zero(nskb, so->ll.mtu); /* create & send flow control reply */ ncf->can_id = so->txid; if (so->opt.flags & CAN_ISOTP_TX_PADDING) { memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN); ncf->len = CAN_MAX_DLEN; } else { ncf->len = ae + FC_CONTENT_SZ; } ncf->data[ae] = N_PCI_FC | flowstatus; ncf->data[ae + 1] = so->rxfc.bs; ncf->data[ae + 2] = so->rxfc.stmin; if (ae) ncf->data[0] = so->opt.ext_address; ncf->flags = so->ll.tx_flags; can_send_ret = can_send(nskb, 1); if (can_send_ret) pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); dev_put(dev); /* reset blocksize counter */ so->rx.bs = 0; /* reset last CF frame rx timestamp for rx stmin enforcement */ so->lastrxcf_tstamp = ktime_set(0, 0); /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) { struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; if (sock_queue_rcv_skb(sk, skb) < 0) kfree_skb(skb); } static u8 padlen(u8 datalen) { static const u8 plen[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ 12, 12, 12, 12, /* 9 - 12 */ 16, 16, 16, 16, /* 13 - 16 */ 20, 20, 20, 20, /* 17 - 20 */ 24, 24, 24, 24, /* 21 - 24 */ 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ 48, 48, 48, 48, 48, 48, 48, 48 /* 41 - 48 */ }; if (datalen > 48) return 64; return plen[datalen]; } /* check for length optimization and return 1/true when the check fails */ static int check_optimized(struct canfd_frame *cf, int start_index) { /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the * padding would start at this point. E.g. if the padding would * start at cf.data[7] cf->len has to be 7 to be optimal. * Note: The data[] index starts with zero. */ if (cf->len <= CAN_MAX_DLEN) return (cf->len != start_index); /* This relation is also valid in the non-linear DLC range, where * we need to take care of the minimal next possible CAN_DL. * The correct check would be (padlen(cf->len) != padlen(start_index)). * But as cf->len can only take discrete values from 12, .., 64 at this * point the padlen(cf->len) is always equal to cf->len. */ return (cf->len != padlen(start_index)); } /* check padding and return 1/true when the check fails */ static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, int start_index, u8 content) { int i; /* no RX_PADDING value => check length of optimized frame length */ if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) { if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) return check_optimized(cf, start_index); /* no valid test against empty value => ignore frame */ return 1; } /* check datalength of correctly padded CAN frame */ if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) && cf->len != padlen(cf->len)) return 1; /* check padding content */ if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) { for (i = start_index; i < cf->len; i++) if (cf->data[i] != content) return 1; } return 0; } static void isotp_send_cframe(struct isotp_sock *so); static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) { struct sock *sk = &so->sk; if (so->tx.state != ISOTP_WAIT_FC && so->tx.state != ISOTP_WAIT_FIRST_FC) return 0; hrtimer_cancel(&so->txtimer); if ((cf->len < ae + FC_CONTENT_SZ) || ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return 1; } /* get static/dynamic communication params from first/every FC frame */ if (so->tx.state == ISOTP_WAIT_FIRST_FC || so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) { so->txfc.bs = cf->data[ae + 1]; so->txfc.stmin = cf->data[ae + 2]; /* fix wrong STmin values according spec */ if (so->txfc.stmin > 0x7F && (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9)) so->txfc.stmin = 0x7F; so->tx_gap = ktime_set(0, 0); /* add transmission time for CAN frame N_As */ so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, so->force_tx_stmin); else if (so->txfc.stmin < 0x80) so->tx_gap = ktime_add_ns(so->tx_gap, so->txfc.stmin * 1000000); else so->tx_gap = ktime_add_ns(so->tx_gap, (so->txfc.stmin - 0xF0) * 100000); so->tx.state = ISOTP_WAIT_FC; } switch (cf->data[ae] & 0x0F) { case ISOTP_FC_CTS: so->tx.bs = 0; so->tx.state = ISOTP_SENDING; /* send CF frame and enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); break; case ISOTP_FC_WT: /* start timer to wait for next FC frame */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); break; case ISOTP_FC_OVFLW: /* overflow on receiver side - report 'message too long' */ sk->sk_err = EMSGSIZE; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); fallthrough; default: /* stop this tx job */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); } return 0; } static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, struct sk_buff *skb, int len) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; if (!len || len > cf->len - pcilen) return 1; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, len), &cf->data[pcilen], len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae) { struct isotp_sock *so = isotp_sk(sk); int i; int off; int ff_pci_sz; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; /* get the used sender LL_DL from the (first) CAN frame data length */ so->rx.ll_dl = padlen(cf->len); /* the first frame has to use the entire frame up to LL_DL length */ if (cf->len != so->rx.ll_dl) return 1; /* get the FF_DL */ so->rx.len = (cf->data[ae] & 0x0F) << 8; so->rx.len += cf->data[ae + 1]; /* Check for FF_DL escape sequence supporting 32 bit PDU length */ if (so->rx.len) { ff_pci_sz = FF_PCI_SZ12; } else { /* FF_DL = 0 => get real length from next 4 bytes */ so->rx.len = cf->data[ae + 2] << 24; so->rx.len += cf->data[ae + 3] << 16; so->rx.len += cf->data[ae + 4] << 8; so->rx.len += cf->data[ae + 5]; ff_pci_sz = FF_PCI_SZ32; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl) return 1; /* PDU size > default => try max_pdu_size */ if (so->rx.len > so->rx.buflen && so->rx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_ATOMIC); if (newbuf) { so->rx.buf = newbuf; so->rx.buflen = max_pdu_size; } } if (so->rx.len > so->rx.buflen) { /* send FC frame with overflow status */ isotp_send_fc(sk, ae, ISOTP_FC_OVFLW); return 1; } /* copy the first received data bytes */ so->rx.idx = 0; for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++) so->rx.buf[so->rx.idx++] = cf->data[i]; /* initial setup for this pdu reception */ so->rx.sn = 1; so->rx.state = ISOTP_WAIT_DATA; /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* send our first FC frame */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, struct sk_buff *skb) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; int i; if (so->rx.state != ISOTP_WAIT_DATA) return 0; /* drop if timestamp gap is less than force_rx_stmin nano secs */ if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) { if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) < so->force_rx_stmin) return 0; so->lastrxcf_tstamp = skb->tstamp; } hrtimer_cancel(&so->rxtimer); /* CFs are never longer than the FF */ if (cf->len > so->rx.ll_dl) return 1; /* CFs have usually the LL_DL length */ if (cf->len < so->rx.ll_dl) { /* this is only allowed for the last CF */ if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ) return 1; } if ((cf->data[ae] & 0x0F) != so->rx.sn) { /* wrong sn detected - report 'illegal byte sequence' */ sk->sk_err = EILSEQ; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; return 1; } so->rx.sn++; so->rx.sn %= 16; for (i = ae + N_PCI_SZ; i < cf->len; i++) { so->rx.buf[so->rx.idx++] = cf->data[i]; if (so->rx.idx >= so->rx.len) break; } if (so->rx.idx >= so->rx.len) { /* we are done */ so->rx.state = ISOTP_IDLE; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, i + 1, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(so->rx.len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, so->rx.len), so->rx.buf, so->rx.len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } /* perform blocksize handling, if enabled */ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* we reached the specified blocksize so->rxfc.bs */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static void isotp_rcv(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; u8 n_pci_type, sf_dl; /* Strictly receive only frames with the configured MTU size * => clear separation of CAN2.0 / CAN FD transport channels */ if (skb->len != so->ll.mtu) return; cf = (struct canfd_frame *)skb->data; /* if enabled: check reception of my configured extended address */ if (ae && cf->data[0] != so->opt.rx_ext_address) return; n_pci_type = cf->data[ae] & 0xF0; /* Make sure the state changes and data structures stay consistent at * CAN frame reception time. This locking is not needed in real world * use cases but the inconsistency can be triggered with syzkaller. */ spin_lock(&so->rx_lock); if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { /* check rx/tx path half duplex expectations */ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) goto out_unlock; } switch (n_pci_type) { case N_PCI_FC: /* tx path: flow control frame containing the FC parameters */ isotp_rcv_fc(so, cf, ae); break; case N_PCI_SF: /* rx path: single frame * * As we do not have a rx.ll_dl configuration, we can only test * if the CAN frames payload length matches the LL_DL == 8 * requirements - no matter if it's CAN 2.0 or CAN FD */ /* get the SF_DL from the N_PCI byte */ sf_dl = cf->data[ae] & 0x0F; if (cf->len <= CAN_MAX_DLEN) { isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); } else { if (can_is_canfd_skb(skb)) { /* We have a CAN FD frame and CAN_DL is greater than 8: * Only frames with the SF_DL == 0 ESC value are valid. * * If so take care of the increased SF PCI size * (SF_PCI_SZ8) to point to the message content behind * the extended SF PCI info and get the real SF_DL * length value from the formerly first data byte. */ if (sf_dl == 0) isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb, cf->data[SF_PCI_SZ4 + ae]); } } break; case N_PCI_FF: /* rx path: first frame */ isotp_rcv_ff(sk, cf, ae); break; case N_PCI_CF: /* rx path: consecutive frame */ isotp_rcv_cf(sk, cf, ae, skb); break; } out_unlock: spin_unlock(&so->rx_lock); } static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, int ae, int off) { int pcilen = N_PCI_SZ + ae + off; int space = so->tx.ll_dl - pcilen; int num = min_t(int, so->tx.len - so->tx.idx, space); int i; cf->can_id = so->txid; cf->len = num + pcilen; if (num < space) { if (so->opt.flags & CAN_ISOTP_TX_PADDING) { /* user requested padding */ cf->len = padlen(cf->len); memset(cf->data, so->opt.txpad_content, cf->len); } else if (cf->len > CAN_MAX_DLEN) { /* mandatory padding for CAN FD frames */ cf->len = padlen(cf->len); memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT, cf->len); } } for (i = 0; i < num; i++) cf->data[pcilen + i] = so->tx.buf[so->tx.idx++]; if (ae) cf->data[0] = so->opt.ext_address; } static void isotp_send_cframe(struct isotp_sock *so) { struct sock *sk = &so->sk; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int can_send_ret; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) return; skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); if (!skb) { dev_put(dev); return; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* create consecutive frame */ isotp_fill_dataframe(cf, so, ae, 0); /* place consecutive frame N_PCI in appropriate index */ cf->data[ae] = N_PCI_CF | so->tx.sn++; so->tx.sn %= 16; so->tx.bs++; cf->flags = so->ll.tx_flags; skb->dev = dev; can_skb_set_owner(skb, sk); /* cfecho should have been zero'ed by init/isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho); /* set consecutive frame echo tag */ so->cfecho = *(u32 *)cf->data; /* send frame with local echo enabled */ can_send_ret = can_send(skb, 1); if (can_send_ret) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); if (can_send_ret == -ENOBUFS) pr_notice_once("can-isotp: tx queue is full\n"); } dev_put(dev); } static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, int ae) { int i; int ff_pci_sz; cf->can_id = so->txid; cf->len = so->tx.ll_dl; if (ae) cf->data[0] = so->opt.ext_address; /* create N_PCI bytes with 12/32 bit FF_DL data length */ if (so->tx.len > MAX_12BIT_PDU_SIZE) { /* use 32 bit FF_DL notation */ cf->data[ae] = N_PCI_FF; cf->data[ae + 1] = 0; cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU; cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU; cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU; cf->data[ae + 5] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ32; } else { /* use 12 bit FF_DL notation */ cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF; cf->data[ae + 1] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ12; } /* add first data bytes depending on ae */ for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++) cf->data[i] = so->tx.buf[so->tx.idx++]; so->tx.sn = 1; } static void isotp_rcv_echo(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf = (struct canfd_frame *)skb->data; /* only handle my own local echo CF/SF skb's (no FF!) */ if (skb->sk != sk || so->cfecho != *(u32 *)cf->data) return; /* cancel local echo timeout */ hrtimer_cancel(&so->txtimer); /* local echo skb with consecutive frame has been consumed */ so->cfecho = 0; if (so->tx.idx >= so->tx.len) { /* we are done */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return; } if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { /* stop and wait for FC with timeout */ so->tx.state = ISOTP_WAIT_FC; hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return; } /* no gap between data frames needed => use burst mode */ if (!so->tx_gap) { /* enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); return; } /* start timer to send next consecutive frame with correct delay */ hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; /* don't handle timeouts in IDLE or SHUTDOWN state */ if (so->tx.state == ISOTP_IDLE || so->tx.state == ISOTP_SHUTDOWN) return HRTIMER_NORESTART; /* we did not get any flow control or echo frame in time */ /* report 'communication error on send' */ sk->sk_err = ECOMM; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset tx state */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return HRTIMER_NORESTART; } static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txfrtimer); /* start echo timeout handling and cover below protocol error */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); /* cfecho should be consumed by isotp_rcv_echo() here */ if (so->tx.state == ISOTP_SENDING && !so->cfecho) isotp_send_cframe(so); return HRTIMER_NORESTART; } static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT; int off; int err; if (!so->bound || so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; while (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) { /* we do not support multiple buffers - for now */ if (msg->msg_flags & MSG_DONTWAIT) return -EAGAIN; if (so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; } /* PDU size > default => try max_pdu_size */ if (size > so->tx.buflen && so->tx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_KERNEL); if (newbuf) { so->tx.buf = newbuf; so->tx.buflen = max_pdu_size; } } if (!size || size > so->tx.buflen) { err = -EINVAL; goto err_out_drop; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? */ if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) && (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { err = -EINVAL; goto err_out_drop; } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) goto err_out_drop; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { err = -ENXIO; goto err_out_drop; } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); goto err_out_drop; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; so->tx.len = size; so->tx.idx = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho); /* check for single frame transmission depending on TX_DL */ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { /* The message size generally fits into a SingleFrame - good. * * SF_DL ESC offset optimization: * * When TX_DL is greater 8 but the message would still fit * into a 8 byte CAN frame, we can omit the offset. * This prevents a protocol caused length extension from * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling. */ if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae) off = 0; isotp_fill_dataframe(cf, so, ae, off); /* place single frame N_PCI w/o length in appropriate index */ cf->data[ae] = N_PCI_SF; /* place SF_DL size value depending on the SF_DL ESC offset */ if (off) cf->data[SF_PCI_SZ4 + ae] = size; else cf->data[ae] |= size; /* set CF echo tag for isotp_rcv_echo() (SF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* send first frame */ isotp_create_fframe(cf, so, ae); if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) { /* set timer for FC-less operation (STmin = 0) */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_set(0, so->force_tx_stmin); else so->tx_gap = ktime_set(0, so->frame_txtime); /* disable wait for FCs due to activated block size */ so->txfc.bs = 0; /* set CF echo tag for isotp_rcv_echo() (CF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* standard flow control check */ so->tx.state = ISOTP_WAIT_FIRST_FC; /* start timeout for FC */ hrtimer_sec = ISOTP_FC_TIMEOUT; /* no CF echo tag for isotp_rcv_echo() (FF-mode) */ so->cfecho = 0; } } hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), HRTIMER_MODE_REL_SOFT); /* send the first or only CAN frame */ cf->flags = so->ll.tx_flags; skb->dev = dev; skb->sk = sk; err = can_send(skb, 1); dev_put(dev); if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); /* no transmission -> no timeout monitoring */ hrtimer_cancel(&so->txtimer); /* reset consecutive frame echo tag */ so->cfecho = 0; goto err_out_drop; } if (wait_tx_done) { /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; err = sock_error(sk); if (err) return err; } return size; err_event_drop: /* got signal: force tx state machine to be idle */ so->tx.state = ISOTP_IDLE; hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); err_out_drop: /* drop this PDU and unlock a potential wait queue */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return err; } static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK | MSG_CMSG_COMPAT)) return -EINVAL; if (!so->bound) return -EADDRNOTAVAIL; skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) goto out_err; sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(ISOTP_MIN_NAMELEN); msg->msg_namelen = ISOTP_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* set length of return value */ ret = (flags & MSG_TRUNC) ? skb->len : size; out_err: skb_free_datagram(sk, skb); return ret; } static int isotp_release(struct socket *sock) { struct sock *sk = sock->sk; struct isotp_sock *so; struct net *net; if (!sk) return 0; so = isotp_sk(sk); net = sock_net(sk); /* wait for complete transmission of current pdu */ while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 && cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE) ; /* force state machines to be idle also when a signal occurred */ so->tx.state = ISOTP_SHUTDOWN; so->rx.state = ISOTP_IDLE; spin_lock(&isotp_notifier_lock); while (isotp_busy_notifier == so) { spin_unlock(&isotp_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&isotp_notifier_lock); } list_del(&so->notifier); spin_unlock(&isotp_notifier_lock); lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (so->ifindex) { struct net_device *dev; dev = dev_get_by_index(net, so->ifindex); if (dev) { if (isotp_register_rxid(so)) can_rx_unregister(net, dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(net, dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); dev_put(dev); synchronize_rcu(); } } } hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); hrtimer_cancel(&so->rxtimer); so->ifindex = 0; so->bound = 0; if (so->rx.buf != so->rx.sbuf) kfree(so->rx.buf); if (so->tx.buf != so->tx.sbuf) kfree(so->tx.buf); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct net *net = sock_net(sk); int ifindex; struct net_device *dev; canid_t tx_id = addr->can_addr.tp.tx_id; canid_t rx_id = addr->can_addr.tp.rx_id; int err = 0; int notify_enetdown = 0; if (len < ISOTP_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* sanitize tx CAN identifier */ if (tx_id & CAN_EFF_FLAG) tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else tx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (tx_id != addr->can_addr.tp.tx_id) return -EINVAL; /* sanitize rx CAN identifier (if needed) */ if (isotp_register_rxid(so)) { if (rx_id & CAN_EFF_FLAG) rx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else rx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (rx_id != addr->can_addr.tp.rx_id) return -EINVAL; } if (!addr->can_ifindex) return -ENODEV; lock_sock(sk); if (so->bound) { err = -EINVAL; goto out; } /* ensure different CAN IDs when the rx_id is to be registered */ if (isotp_register_rxid(so) && rx_id == tx_id) { err = -EADDRNOTAVAIL; goto out; } dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_CAN) { dev_put(dev); err = -ENODEV; goto out; } if (dev->mtu < so->ll.mtu) { dev_put(dev); err = -EINVAL; goto out; } if (!(dev->flags & IFF_UP)) notify_enetdown = 1; ifindex = dev->ifindex; if (isotp_register_rxid(so)) can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id), isotp_rcv, sk, "isotp", sk); /* no consecutive frame echo skb in flight */ so->cfecho = 0; /* register for echo skb's */ can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id), isotp_rcv_echo, sk, "isotpe", sk); dev_put(dev); /* switch to new settings */ so->ifindex = ifindex; so->rxid = rx_id; so->txid = tx_id; so->bound = 1; out: release_sock(sk); if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } return err; } static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); if (peer) return -EOPNOTSUPP; memset(addr, 0, ISOTP_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = so->ifindex; addr->can_addr.tp.rx_id = so->rxid; addr->can_addr.tp.tx_id = so->txid; return ISOTP_MIN_NAMELEN; } static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (so->bound) return -EISCONN; switch (optname) { case CAN_ISOTP_OPTS: if (optlen != sizeof(struct can_isotp_options)) return -EINVAL; if (copy_from_sockptr(&so->opt, optval, optlen)) return -EFAULT; /* no separate rx_ext_address is given => use ext_address */ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; /* these broadcast flags are not allowed together */ if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) { /* CAN_ISOTP_SF_BROADCAST is prioritized */ so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST; /* give user feedback on wrong config attempt */ ret = -EINVAL; } /* check for frame_txtime changes (0 => no changes) */ if (so->opt.frame_txtime) { if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) so->frame_txtime = 0; else so->frame_txtime = so->opt.frame_txtime; } break; case CAN_ISOTP_RECV_FC: if (optlen != sizeof(struct can_isotp_fc_options)) return -EINVAL; if (copy_from_sockptr(&so->rxfc, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_TX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_RX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_LL_OPTS: if (optlen == sizeof(struct can_isotp_ll_options)) { struct can_isotp_ll_options ll; if (copy_from_sockptr(&ll, optval, optlen)) return -EFAULT; /* check for correct ISO 11898-1 DLC data length */ if (ll.tx_dl != padlen(ll.tx_dl)) return -EINVAL; if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU) return -EINVAL; if (ll.mtu == CAN_MTU && (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0)) return -EINVAL; memcpy(&so->ll, &ll, sizeof(ll)); /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = ll.tx_dl; } else { return -EINVAL; } break; default: ret = -ENOPROTOOPT; } return ret; } static int isotp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int ret; if (level != SOL_CAN_ISOTP) return -EINVAL; lock_sock(sk); ret = isotp_setsockopt_locked(sock, level, optname, optval, optlen); release_sock(sk); return ret; } static int isotp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); int len; void *val; if (level != SOL_CAN_ISOTP) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case CAN_ISOTP_OPTS: len = min_t(int, len, sizeof(struct can_isotp_options)); val = &so->opt; break; case CAN_ISOTP_RECV_FC: len = min_t(int, len, sizeof(struct can_isotp_fc_options)); val = &so->rxfc; break; case CAN_ISOTP_TX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_tx_stmin; break; case CAN_ISOTP_RX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_rx_stmin; break; case CAN_ISOTP_LL_OPTS: len = min_t(int, len, sizeof(struct can_isotp_ll_options)); val = &so->ll; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, val, len)) return -EFAULT; return 0; } static void isotp_notify(struct isotp_sock *so, unsigned long msg, struct net_device *dev) { struct sock *sk = &so->sk; if (!net_eq(dev_net(dev), sock_net(sk))) return; if (so->ifindex != dev->ifindex) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (isotp_register_rxid(so)) can_rx_unregister(dev_net(dev), dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(dev_net(dev), dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); } so->ifindex = 0; so->bound = 0; release_sock(sk); sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; } } static int isotp_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&isotp_notifier_lock); list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) { spin_unlock(&isotp_notifier_lock); isotp_notify(isotp_busy_notifier, msg, dev); spin_lock(&isotp_notifier_lock); } isotp_busy_notifier = NULL; spin_unlock(&isotp_notifier_lock); return NOTIFY_DONE; } static int isotp_init(struct sock *sk) { struct isotp_sock *so = isotp_sk(sk); so->ifindex = 0; so->bound = 0; so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS; so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU; so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL; so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS; /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = so->ll.tx_dl; so->rx.state = ISOTP_IDLE; so->tx.state = ISOTP_IDLE; so->rx.buf = so->rx.sbuf; so->tx.buf = so->tx.sbuf; so->rx.buflen = ARRAY_SIZE(so->rx.sbuf); so->tx.buflen = ARRAY_SIZE(so->tx.sbuf); hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->rxtimer.function = isotp_rx_timer_handler; hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txtimer.function = isotp_tx_timer_handler; hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txfrtimer.function = isotp_txfr_timer_handler; init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); spin_lock(&isotp_notifier_lock); list_add_tail(&so->notifier, &isotp_notifier_list); spin_unlock(&isotp_notifier_lock); return 0; } static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); poll_wait(file, &so->wait, wait); /* Check for false positives due to TX state */ if ((mask & EPOLLWRNORM) && (so->tx.state != ISOTP_IDLE)) mask &= ~(EPOLLOUT | EPOLLWRNORM); return mask; } static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops isotp_ops = { .family = PF_CAN, .release = isotp_release, .bind = isotp_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = isotp_getname, .poll = isotp_poll, .ioctl = isotp_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = isotp_setsockopt, .getsockopt = isotp_getsockopt, .sendmsg = isotp_sendmsg, .recvmsg = isotp_recvmsg, .mmap = sock_no_mmap, }; static struct proto isotp_proto __read_mostly = { .name = "CAN_ISOTP", .owner = THIS_MODULE, .obj_size = sizeof(struct isotp_sock), .init = isotp_init, }; static const struct can_proto isotp_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_ISOTP, .ops = &isotp_ops, .prot = &isotp_proto, }; static struct notifier_block canisotp_notifier = { .notifier_call = isotp_notifier }; static __init int isotp_module_init(void) { int err; max_pdu_size = max_t(unsigned int, max_pdu_size, MAX_12BIT_PDU_SIZE); max_pdu_size = min_t(unsigned int, max_pdu_size, MAX_PDU_SIZE); pr_info("can: isotp protocol (max_pdu_size %d)\n", max_pdu_size); err = can_proto_register(&isotp_can_proto); if (err < 0) pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err)); else register_netdevice_notifier(&canisotp_notifier); return err; } static __exit void isotp_module_exit(void) { can_proto_unregister(&isotp_can_proto); unregister_netdevice_notifier(&canisotp_notifier); } module_init(isotp_module_init); module_exit(isotp_module_exit); |
7 2 5 2 3 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2011, 2012 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv6/ip6t_NPT.h> #include <linux/netfilter/x_tables.h> static int ip6t_npt_checkentry(const struct xt_tgchk_param *par) { struct ip6t_npt_tginfo *npt = par->targinfo; struct in6_addr pfx; __wsum src_sum, dst_sum; if (npt->src_pfx_len > 64 || npt->dst_pfx_len > 64) return -EINVAL; /* Ensure that LSB of prefix is zero */ ipv6_addr_prefix(&pfx, &npt->src_pfx.in6, npt->src_pfx_len); if (!ipv6_addr_equal(&pfx, &npt->src_pfx.in6)) return -EINVAL; ipv6_addr_prefix(&pfx, &npt->dst_pfx.in6, npt->dst_pfx_len); if (!ipv6_addr_equal(&pfx, &npt->dst_pfx.in6)) return -EINVAL; src_sum = csum_partial(&npt->src_pfx.in6, sizeof(npt->src_pfx.in6), 0); dst_sum = csum_partial(&npt->dst_pfx.in6, sizeof(npt->dst_pfx.in6), 0); npt->adjustment = ~csum_fold(csum_sub(src_sum, dst_sum)); return 0; } static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt, struct in6_addr *addr) { unsigned int pfx_len; unsigned int i, idx; __be32 mask; __sum16 sum; pfx_len = max(npt->src_pfx_len, npt->dst_pfx_len); for (i = 0; i < pfx_len; i += 32) { if (pfx_len - i >= 32) mask = 0; else mask = htonl((1 << (i - pfx_len + 32)) - 1); idx = i / 32; addr->s6_addr32[idx] &= mask; addr->s6_addr32[idx] |= ~mask & npt->dst_pfx.in6.s6_addr32[idx]; } if (pfx_len <= 48) idx = 3; else { for (idx = 4; idx < ARRAY_SIZE(addr->s6_addr16); idx++) { if ((__force __sum16)addr->s6_addr16[idx] != CSUM_MANGLED_0) break; } if (idx == ARRAY_SIZE(addr->s6_addr16)) return false; } sum = ~csum_fold(csum_add(csum_unfold((__force __sum16)addr->s6_addr16[idx]), csum_unfold(npt->adjustment))); if (sum == CSUM_MANGLED_0) sum = 0; *(__force __sum16 *)&addr->s6_addr16[idx] = sum; return true; } static struct ipv6hdr *icmpv6_bounced_ipv6hdr(struct sk_buff *skb, struct ipv6hdr *_bounced_hdr) { if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) return NULL; if (!icmpv6_is_err(icmp6_hdr(skb)->icmp6_type)) return NULL; return skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(struct icmp6hdr), sizeof(struct ipv6hdr), _bounced_hdr); } static unsigned int ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_npt_tginfo *npt = par->targinfo; struct ipv6hdr _bounced_hdr; struct ipv6hdr *bounced_hdr; struct in6_addr bounced_pfx; if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) { icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, saddr)); return NF_DROP; } /* rewrite dst addr of bounced packet which was sent to dst range */ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr); if (bounced_hdr) { ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->daddr, npt->src_pfx_len); if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0) ip6t_npt_map_pfx(npt, &bounced_hdr->daddr); } return XT_CONTINUE; } static unsigned int ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_npt_tginfo *npt = par->targinfo; struct ipv6hdr _bounced_hdr; struct ipv6hdr *bounced_hdr; struct in6_addr bounced_pfx; if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) { icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, daddr)); return NF_DROP; } /* rewrite src addr of bounced packet which was sent from dst range */ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr); if (bounced_hdr) { ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->saddr, npt->src_pfx_len); if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0) ip6t_npt_map_pfx(npt, &bounced_hdr->saddr); } return XT_CONTINUE; } static struct xt_target ip6t_npt_target_reg[] __read_mostly = { { .name = "SNPT", .table = "mangle", .target = ip6t_snpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_POST_ROUTING), .me = THIS_MODULE, }, { .name = "DNPT", .table = "mangle", .target = ip6t_dnpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, }; static int __init ip6t_npt_init(void) { return xt_register_targets(ip6t_npt_target_reg, ARRAY_SIZE(ip6t_npt_target_reg)); } static void __exit ip6t_npt_exit(void) { xt_unregister_targets(ip6t_npt_target_reg, ARRAY_SIZE(ip6t_npt_target_reg)); } module_init(ip6t_npt_init); module_exit(ip6t_npt_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6-to-IPv6 Network Prefix Translation (RFC 6296)"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS("ip6t_SNPT"); MODULE_ALIAS("ip6t_DNPT"); |
3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H #include <linux/bio-integrity.h> #include <linux/blk-crypto.h> #include <linux/lockdep.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <linux/sched/sysctl.h> #include <linux/timekeeping.h> #include <xen/xen.h> #include "blk-crypto-internal.h" struct elevator_type; /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) extern struct dentry *blk_debugfs_root; struct blk_flush_queue { spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; unsigned long flush_data_in_flight; struct request *flush_rq; }; bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); bool blk_queue_start_drain(struct request_queue *q); bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { rcu_read_lock(); if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) goto fail; /* * The code that increments the pm_only counter must ensure that the * counter is globally visible before the queue is unfrozen. */ if (blk_queue_pm_only(q) && (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) goto fail_put; rcu_read_unlock(); return true; fail_put: blk_queue_exit(q); fail: rcu_read_unlock(); return false; } static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (blk_try_enter_queue(q, false)) { rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; } return __bio_queue_enter(q, bio); } static inline void blk_wait_io(struct completion *done) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) while (!wait_for_completion_io_timeout(done, timeout)) ; else wait_for_completion_io(done); } #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, bool *same_page); static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(vec1); phys_addr_t addr2 = bvec_phys(vec2); /* * Merging adjacent physical pages may not work correctly under KMSAN * if their metadata pages aren't adjacent. Just disable merging. */ if (IS_ENABLED(CONFIG_KMSAN)) return false; if (addr1 + vec1->bv_len != addr2) return false; if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) return false; if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) return false; return true; } static inline bool __bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { return (offset & lim->virt_boundary_mask) || ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask); } /* * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. */ static inline bool bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { if (!lim->virt_boundary_mask) return false; return __bvec_gap_to_prev(lim, bprv, offset); } static inline bool rq_mergeable(struct request *rq) { if (blk_rq_is_passthrough(rq)) return false; if (req_op(rq) == REQ_OP_FLUSH) return false; if (req_op(rq) == REQ_OP_WRITE_ZEROES) return false; if (req_op(rq) == REQ_OP_ZONE_APPEND) return false; if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) return false; return true; } /* * There are two different ways to handle DISCARD merges: * 1) If max_discard_segments > 1, the driver treats every bio as a range and * send the bios to controller together. The ranges don't need to be * contiguous. * 2) Otherwise, the request will be normal read/write requests. The ranges * need to be contiguous. */ static inline bool blk_discard_mergable(struct request *req) { if (req_op(req) == REQ_OP_DISCARD && queue_max_discard_segments(req->q) > 1) return true; return false; } static inline unsigned int blk_rq_get_max_segments(struct request *rq) { if (req_op(rq) == REQ_OP_DISCARD) return queue_max_discard_segments(rq->q); return queue_max_segments(rq->q); } static inline unsigned int blk_queue_get_max_sectors(struct request *rq) { struct request_queue *q = rq->q; enum req_op op = req_op(rq); if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; if (rq->cmd_flags & REQ_ATOMIC) return q->limits.atomic_write_max_sectors; return q->limits.max_sectors; } #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); void bio_integrity_free(struct bio *bio); /* * Integrity payloads can either be owned by the submitter, in which case * bio_uninit will free them, or owned and generated by the block layer, * in which case we'll verify them here (for reads) and free them before * the bio is handed back to the submitted. */ bool __bio_integrity_endio(struct bio *bio); static inline bool bio_integrity_endio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip && (bip->bip_flags & BIP_BLOCK_INTEGRITY)) return __bio_integrity_endio(bio); return true; } bool blk_integrity_merge_rq(struct request_queue *, struct request *, struct request *); bool blk_integrity_merge_bio(struct request_queue *, struct request *, struct bio *); static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { struct bio_integrity_payload *bip = bio_integrity(req->bio); struct bio_integrity_payload *bip_next = bio_integrity(next); return bvec_gap_to_prev(&req->q->limits, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_payload *bip_next = bio_integrity(req->bio); return bvec_gap_to_prev(&req->q->limits, &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } extern const struct attribute_group blk_integrity_attr_group; #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, struct request *r1, struct request *r2) { return true; } static inline bool blk_integrity_merge_bio(struct request_queue *rq, struct request *r, struct bio *b) { return true; } static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { return false; } static inline bool integrity_req_gap_front_merge(struct request *req, struct bio *bio) { return false; } static inline void blk_flush_integrity(void) { } static inline bool bio_integrity_endio(struct bio *bio) { return true; } static inline void bio_integrity_free(struct bio *bio) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); enum bio_merge_status { BIO_MERGE_OK, BIO_MERGE_NONE, BIO_MERGE_FAILED, }; enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); /* * Plug flush limits */ #define BLK_MAX_REQUEST_COUNT 32 #define BLK_PLUG_FLUSH_SIZE (128 * 1024) /* * Internal elevator interface */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) bool blk_insert_flush(struct request *rq); int elevator_switch(struct request_queue *q, struct elevator_type *new_e); void elevator_disable(struct request_queue *q); void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, unsigned *nsegs); struct bio *bio_split_write_zeroes(struct bio *bio, const struct queue_limits *lim, unsigned *nsegs); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs); struct bio *bio_split_zone_append(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs); /* * All drivers must accept single-segments bios that are smaller than PAGE_SIZE. * * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is * always valid if a bio has data. The check might lead to occasional false * positives when bios are cloned, but compared to the performance impact of * cloned bios themselves the loop below doesn't matter anyway. */ static inline bool bio_may_need_split(struct bio *bio, const struct queue_limits *lim) { return lim->chunk_sectors || bio->bi_vcnt != 1 || bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; } /** * __bio_split_to_limits - split a bio to fit the queue limits * @bio: bio to be split * @lim: queue limits to split based on * @nr_segs: returns the number of segments in the returned bio * * Check if @bio needs splitting based on the queue limits, and if so split off * a bio fitting the limits from the beginning of @bio and return it. @bio is * shortened to the remainder and re-submitted. * * The split bio is allocated from @q->bio_split, which is provided by the * block layer. */ static inline struct bio *__bio_split_to_limits(struct bio *bio, const struct queue_limits *lim, unsigned int *nr_segs) { switch (bio_op(bio)) { case REQ_OP_READ: case REQ_OP_WRITE: if (bio_may_need_split(bio, lim)) return bio_split_rw(bio, lim, nr_segs); *nr_segs = 1; return bio; case REQ_OP_ZONE_APPEND: return bio_split_zone_append(bio, lim, nr_segs); case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: return bio_split_discard(bio, lim, nr_segs); case REQ_OP_WRITE_ZEROES: return bio_split_write_zeroes(bio, lim, nr_segs); default: /* other operations can't be split */ *nr_segs = 0; return bio; } } int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); int blk_set_default_limits(struct queue_limits *lim); void blk_apply_bdi_limits(struct backing_dev_info *bdi, struct queue_limits *lim); int blk_dev_init(void); void update_io_ticks(struct block_device *part, unsigned long now, bool end); unsigned int part_in_flight(struct block_device *part); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; } /* * Internal io_context interface */ struct io_cq *ioc_find_get_icq(struct request_queue *q); struct io_cq *ioc_lookup_icq(struct request_queue *q); #ifdef CONFIG_BLK_ICQ void ioc_clear_queue(struct request_queue *q); #else static inline void ioc_clear_queue(struct request_queue *q) { } #endif /* CONFIG_BLK_ICQ */ struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); static inline bool blk_queue_may_bounce(struct request_queue *q) { return IS_ENABLED(CONFIG_BOUNCE) && (q->limits.features & BLK_FEAT_BOUNCE_HIGH) && max_low_pfn >= max_pfn; } static inline struct bio *blk_queue_bounce(struct bio *bio, struct request_queue *q) { if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio))) return __blk_queue_bounce(bio, q); return bio; } #ifdef CONFIG_BLK_DEV_ZONED void disk_init_zone_resources(struct gendisk *disk); void disk_free_zone_resources(struct gendisk *disk); static inline bool bio_zone_write_plugging(struct bio *bio) { return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); } void blk_zone_write_plug_bio_merged(struct bio *bio); void blk_zone_write_plug_init_request(struct request *rq); static inline void blk_zone_update_request_bio(struct request *rq, struct bio *bio) { /* * For zone append requests, the request sector indicates the location * at which the BIO data was written. Return this value to the BIO * issuer through the BIO iter sector. * For plugged zone writes, which include emulated zone append, we need * the original BIO sector so that blk_zone_write_plug_bio_endio() can * lookup the zone write plug. */ if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) bio->bi_iter.bi_sector = rq->__sector; } void blk_zone_write_plug_bio_endio(struct bio *bio); static inline void blk_zone_bio_endio(struct bio *bio) { /* * For write BIOs to zoned devices, signal the completion of the BIO so * that the next write BIO can be submitted by zone write plugging. */ if (bio_zone_write_plugging(bio)) blk_zone_write_plug_bio_endio(bio); } void blk_zone_write_plug_finish_request(struct request *rq); static inline void blk_zone_finish_request(struct request *rq) { if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING) blk_zone_write_plug_finish_request(rq); } int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_init_zone_resources(struct gendisk *disk) { } static inline void disk_free_zone_resources(struct gendisk *disk) { } static inline bool bio_zone_write_plugging(struct bio *bio) { return false; } static inline void blk_zone_write_plug_bio_merged(struct bio *bio) { } static inline void blk_zone_write_plug_init_request(struct request *rq) { } static inline void blk_zone_update_request_bio(struct request *rq, struct bio *bio) { } static inline void blk_zone_bio_endio(struct bio *bio) { } static inline void blk_zone_finish_request(struct request *rq) { } static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { return -ENOTTY; } static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; } #endif /* CONFIG_BLK_DEV_ZONED */ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); void bdev_unhash(struct block_device *bdev); void bdev_drop(struct block_device *bdev); int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 #define ADDPART_FLAG_READONLY 4 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); void drop_partition(struct block_device *part); void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); int bio_add_hw_folio(struct request_queue *q, struct bio *bio, struct folio *folio, size_t len, size_t offset, unsigned int max_sectors, bool *same_page); /* * Clean up a page appropriately, where the page may be pinned, may have a * ref taken on it or neither. */ static inline void bio_release_page(struct bio *bio, struct page *page) { if (bio_flagged(bio, BIO_PAGE_PINNED)) unpin_user_page(page); } struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); void disk_block_events(struct gendisk *disk); void disk_unblock_events(struct gendisk *disk); void disk_flush_events(struct gendisk *disk, unsigned int mask); extern struct device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; extern struct attribute_group blk_trace_attr_group; blk_mode_t file_to_blk_mode(struct file *file); int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); extern const struct address_space_operations def_blk_aops; int disk_register_independent_access_ranges(struct gendisk *disk); void disk_unregister_independent_access_ranges(struct gendisk *disk); #ifdef CONFIG_FAIL_MAKE_REQUEST bool should_fail_request(struct block_device *part, unsigned int bytes); #else /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool should_fail_request(struct block_device *part, unsigned int bytes) { return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ /* * Optimized request reference counting. Ideally we'd make timeouts be more * clever, as that's the only reason we need references at all... But until * this happens, this is faster than using refcount_t. Also see: * * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count") */ #define req_ref_zero_or_close_to_overflow(req) \ ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u) static inline bool req_ref_inc_not_zero(struct request *req) { return atomic_inc_not_zero(&req->ref); } static inline bool req_ref_put_and_test(struct request *req) { WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); return atomic_dec_and_test(&req->ref); } static inline void req_ref_set(struct request *req, int value) { atomic_set(&req->ref, value); } static inline int req_ref_read(struct request *req) { return atomic_read(&req->ref); } static inline u64 blk_time_get_ns(void) { struct blk_plug *plug = current->plug; if (!plug || !in_task()) return ktime_get_ns(); /* * 0 could very well be a valid time, but rather than flag "this is * a valid timestamp" separately, just accept that we'll do an extra * ktime_get_ns() if we just happen to get 0 as the current time. */ if (!plug->cur_ktime) { plug->cur_ktime = ktime_get_ns(); current->flags |= PF_BLOCK_TS; } return plug->cur_ktime; } static inline ktime_t blk_time_get(void) { return ns_to_ktime(blk_time_get_ns()); } /* * From most significant bit: * 1 bit: reserved for other usage, see below * 12 bits: original size of bio * 51 bits: issue time of bio */ #define BIO_ISSUE_RES_BITS 1 #define BIO_ISSUE_SIZE_BITS 12 #define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) #define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) #define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) #define BIO_ISSUE_SIZE_MASK \ (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) #define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) /* Reserved bit for blk-throtl */ #define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) static inline u64 __bio_issue_time(u64 time) { return time & BIO_ISSUE_TIME_MASK; } static inline u64 bio_issue_time(struct bio_issue *issue) { return __bio_issue_time(issue->value); } static inline sector_t bio_issue_size(struct bio_issue *issue) { return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); } static inline void bio_issue_init(struct bio_issue *issue, sector_t size) { size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | ((u64)size << BIO_ISSUE_SIZE_SHIFT)); } void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); void blk_integrity_generate(struct bio *bio); void blk_integrity_verify(struct bio *bio); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); static inline void blk_freeze_acquire_lock(struct request_queue *q, bool disk_dead, bool queue_dying) { if (!disk_dead) rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); if (!queue_dying) rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); } static inline void blk_unfreeze_release_lock(struct request_queue *q, bool disk_dead, bool queue_dying) { if (!queue_dying) rwsem_release(&q->q_lockdep_map, _RET_IP_); if (!disk_dead) rwsem_release(&q->io_lockdep_map, _RET_IP_); } #endif /* BLK_INTERNAL_H */ |
347 352 276 3 70 347 345 2 1 347 350 351 346 348 348 14 53 22 26 2 2 408 408 408 3 15 408 254 70 66 409 404 9 9 394 391 366 100 96 97 30 388 385 51 352 431 12 1 425 422 4 417 415 1 408 1 408 5 407 22 388 19 18 18 18 10 9 19 19 19 6 17 3 18 18 4 17 19 19 19 27 27 26 8 7 19 19 8 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) module. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * * Fixes: * Alan Cox : Commented a couple of minor bits of surplus code * Alan Cox : Undefining IP_FORWARD doesn't include the code * (just stops a compiler warning). * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes * are junked rather than corrupting things. * Alan Cox : Frames to bad broadcast subnets are dumped * We used to process them non broadcast and * boy could that cause havoc. * Alan Cox : ip_forward sets the free flag on the * new frame it queues. Still crap because * it copies the frame but at least it * doesn't eat memory too. * Alan Cox : Generic queue code and memory fixes. * Fred Van Kempen : IP fragment support (borrowed from NET2E) * Gerhard Koerting: Forward fragmented frames correctly. * Gerhard Koerting: Fixes to my fix of the above 8-). * Gerhard Koerting: IP interface addressing fix. * Linus Torvalds : More robustness checks * Alan Cox : Even more checks: Still not as robust as it ought to be * Alan Cox : Save IP header pointer for later * Alan Cox : ip option setting * Alan Cox : Use ip_tos/ip_ttl settings * Alan Cox : Fragmentation bogosity removed * (Thanks to Mark.Bush@prg.ox.ac.uk) * Dmitry Gorodchanin : Send of a raw packet crash fix. * Alan Cox : Silly ip bug when an overlength * fragment turns up. Now frees the * queue. * Linus Torvalds/ : Memory leakage on fragmentation * Alan Cox : handling. * Gerhard Koerting: Forwarding uses IP priority hints * Teemu Rantanen : Fragment problems. * Alan Cox : General cleanup, comments and reformat * Alan Cox : SNMP statistics * Alan Cox : BSD address rule semantics. Also see * UDP as there is a nasty checksum issue * if you do things the wrong way. * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file * Alan Cox : IP options adjust sk->priority. * Pedro Roque : Fix mtu/length error in ip_forward. * Alan Cox : Avoid ip_chk_addr when possible. * Richard Underwood : IP multicasting. * Alan Cox : Cleaned up multicast handlers. * Alan Cox : RAW sockets demultiplex in the BSD style. * Gunther Mayer : Fix the SNMP reporting typo * Alan Cox : Always in group 224.0.0.1 * Pauline Middelink : Fast ip_checksum update when forwarding * Masquerading support. * Alan Cox : Multicast loopback error for 224.0.0.1 * Alan Cox : IP_MULTICAST_LOOP option. * Alan Cox : Use notifiers. * Bjorn Ekwall : Removed ip_csum (from slhc.c too) * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) * Stefan Becker : Send out ICMP HOST REDIRECT * Arnt Gulbrandsen : ip_build_xmit * Alan Cox : Per socket routing cache * Alan Cox : Fixed routing cache, added header cache. * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. * Alan Cox : Incoming IP option handling. * Alan Cox : Set saddr on raw output frames as per BSD. * Alan Cox : Stopped broadcast source route explosions. * Alan Cox : Can disable source routing * Takeshi Sone : Masquerading didn't work. * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. * Alan Cox : Memory leaks, tramples, misc debugging. * Alan Cox : Fixed multicast (by popular demand 8)) * Alan Cox : Fixed forwarding (by even more popular demand 8)) * Alan Cox : Fixed SNMP statistics [I think] * Gerhard Koerting : IP fragmentation forwarding fix * Alan Cox : Device lock against page fault. * Alan Cox : IP_HDRINCL facility. * Werner Almesberger : Zero fragment bug * Alan Cox : RAW IP frame length bug * Alan Cox : Outgoing firewall on build_xmit * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel * Alan Cox : Multicast routing hooks * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * * To Fix: * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit * the allocation of a buffer that can then be 'grown' by twiddling page tables. * Output fragmentation wants updating along with the buffer management to use a single * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> #include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> /* * Process Router Attention IP option (RFC 2113) */ bool ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; struct net *net = dev_net(dev); for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report * the packet if it came from that interface. */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) raw_rcv(last, skb2); } last = sk; } } if (last) { raw_rcv(last, skb); return true; } return false; } INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; int raw, ret; resubmit: raw = raw_local_deliver(skb, protocol); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot) { if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); if (ret < 0) { protocol = -ret; goto resubmit; } __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO); } else { __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } } static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; } /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } EXPORT_SYMBOL(ip_local_deliver); static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt; const struct iphdr *iph; /* It looks as overkill, because not all IP options require packet mangling. But it is the easiest for now, especially taking into account that combination of IP options and running sniffer is extremely rare condition. --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } if (unlikely(opt->srr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev)) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); goto drop; } } if (ip_options_rcv_srr(skb, dev)) goto drop; } return false; drop: return true; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && ip_hdr(hint)->tos == iph->tos; } int tcp_v4_early_demux(struct sk_buff *skb); int udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int err, drop_reason; struct rtable *rt; if (ip_can_use_hint(skb, iph, hint)) { drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev, hint); if (unlikely(drop_reason)) goto drop_error; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { tcp_v4_early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { err = udp_v4_early_demux(skb); if (unlikely(err)) goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; } } /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5 && ip_rcv_options(skb, dev)) goto drop; rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * * When a host sends a datagram to a link-layer broadcast * address, the IP destination address MUST be a legal IP * broadcast or IP multicast address. * * A host SHOULD silently discard a datagram that is received * via a link-layer broadcast (see Section 2.4) but does not * specify an IP multicast or broadcast destination address. * * This doesn't explicitly say L2 *broadcast*, but broadcast is * in a way a form of multicast and the most common use case for * this is 802.11 protecting against cross-station spoofing (the * so-called "hole-196" attack) so do it for both. */ if (in_dev && IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; goto drop; } } return NET_RX_SUCCESS; drop: kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; drop_error: if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); goto drop; } static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) return NET_RX_SUCCESS; ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; } /* * Main IP Receive routine. */ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. Doesn't have a bogus length */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); __IP_ADD_STATS(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; csum_error: drop_reason = SKB_DROP_REASON_IP_CSUM; __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) drop_reason = SKB_DROP_REASON_IP_INHDR; __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb_reason(skb, drop_reason); out: return NULL; } /* * IP receive entry point */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev); skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); } static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb, int rt_type) { if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; return skb; } static void ip_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) continue; if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, dst_rtable(dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip_sublist_rcv_finish(&sublist); } static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); ip_list_rcv_finish(net, NULL, head); } /* Receive a list of IP packets */ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); } |
29 29 1 29 4 1 15 3 22 24 24 24 1 5 15 15 3 2 1 30 1 1 32 32 2 29 30 6 1 1 24 17 2 4 2 3 2 1 4 23 5 2 2 2 1 3 1 16 22 9 16 16 16 15 1 2 2 2 4 21 21 1 22 20 2 2 5 5 5 2 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 | // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking protocol helper module for SCTP. * * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com> * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net> * * SCTP is defined in RFC 2960. References to various sections in this code * are to this RFC. */ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/sctp.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <net/sctp/checksum.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_NONE] = "NONE", [SCTP_CONNTRACK_CLOSED] = "CLOSED", [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = 10 SECS, [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT #define sCE SCTP_CONNTRACK_COOKIE_ECHOED #define sES SCTP_CONNTRACK_ESTABLISHED #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sIV SCTP_CONNTRACK_MAX /* These are the descriptions of the states: NOTE: These state names are tantalizingly similar to the states of an SCTP endpoint. But the interpretation of the states is a little different, considering that these are the states of the connection and not of an end point. Please note the subtleties. -Kiran NONE - Nothing so far. COOKIE WAIT - We have seen an INIT chunk in the original direction, or also an INIT_ACK chunk in the reply direction. COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. */ /* TODO - I have assumed that the first INIT is in the original direction. This messes things when an INIT comes in the reply direction in CLOSED state. - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - Full Multi Homing support. */ /* SCTP conntrack state transitions */ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ /* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ /* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, /* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } #endif /* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned long *map, const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; int flag; flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) flag = 1; /* * Cookie Ack/Echo chunks not the first OR * Init / Init Ack / Shutdown compl chunks not the only chunks * OR zero-length. */ if (((sch->type == SCTP_CID_COOKIE_ACK || sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { nf_ct_l4proto_log_invalid(skb, ct, state, "%s failed. chunk num %d, type %d, len %d flag %d\n", __func__, count, sch->type, sch->length, flag); return 1; } if (map) set_bit(sch->type, map); } return count == 0; } static int sctp_new_state(enum ip_conntrack_dir dir, enum sctp_conntrack cur_state, int chunk_type) { int i; switch (chunk_type) { case SCTP_CID_INIT: i = 0; break; case SCTP_CID_INIT_ACK: i = 1; break; case SCTP_CID_ABORT: i = 2; break; case SCTP_CID_SHUTDOWN: i = 3; break; case SCTP_CID_SHUTDOWN_ACK: i = 4; break; case SCTP_CID_ERROR: i = 5; break; case SCTP_CID_COOKIE_ECHO: i = 6; break; case SCTP_CID_COOKIE_ACK: i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: i = 8; break; case SCTP_CID_HEARTBEAT: i = 9; break; case SCTP_CID_HEARTBEAT_ACK: i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type %d, Will stay in %s\n", chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } return sctp_conntracks[dir][i][cur_state]; } /* Don't need lock here: this conntrack not in circulation yet */ static noinline bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct sctphdr *sh, unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u32 offset, count; memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) { new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ if (new_state == SCTP_CONNTRACK_NONE || new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); return false; } /* Copy the vtag into the state info */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _inithdr, *ih; /* Sec 8.5.1 (A) */ if (sh->vtag) return false; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(_inithdr), &_inithdr); if (!ih) return false; pr_debug("Setting vtag %x for new conn\n", ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; } else if (sch->type == SCTP_CID_HEARTBEAT) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; } static bool sctp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { const struct sctphdr *sh; const char *logmsg; if (skb->len < dataoff + sizeof(struct sctphdr)) { logmsg = "nf_ct_sctp: short packet "; goto out_invalid; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } sh = (const struct sctphdr *)(skb->data + dataoff); if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { logmsg = "nf_ct_sctp: bad CRC "; goto out_invalid; } skb->ip_summed = CHECKSUM_UNNECESSARY; } return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); const struct sctphdr *sh; struct sctphdr _sctph; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) goto out; if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ if (test_bit(SCTP_CID_ABORT, map) || test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || test_bit(SCTP_CID_COOKIE_ACK, map)) return -NF_ACCEPT; if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; } /* Check the verification tag (Sec 8.5) */ if (!test_bit(SCTP_CID_INIT, map) && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { nf_ct_l4proto_log_invalid(skb, ct, state, "verification tag check failed %x vs %x for dir %d", sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } old_state = new_state = SCTP_CONNTRACK_NONE; spin_lock_bh(&ct->lock); for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { /* (B) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { /* (C) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ACK) { ct->proto.sctp.init[dir] = 0; ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.last_dir = dir; ignore = true; continue; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || ct->proto.sctp.last_dir == dir) goto out_unlock; ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.vtag[dir] = sh->vtag; ct->proto.sctp.vtag[!dir] = 0; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } old_state = ct->proto.sctp.state; new_state = sctp_new_state(dir, old_state, sch->type); /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { nf_ct_l4proto_log_invalid(skb, ct, state, "Invalid, old_state %d, dir %d, type %d", old_state, dir, sch->type); goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _ih, *ih; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) ct->proto.sctp.init[!dir] = 0; ct->proto.sctp.init[dir] = 1; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so * port reuse by client or NAT middlebox cannot * keep entry alive indefinitely (incl. nat info). */ if (new_state == SCTP_CONNTRACK_CLOSED && old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; } else if (sch->type == SCTP_CID_INIT_ACK) { struct sctp_inithdr _ih, *ih; __be32 vtag; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; vtag = ct->proto.sctp.vtag[!dir]; if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) goto out_unlock; /* collision */ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && vtag != ih->init_tag) goto out_unlock; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (new_state == SCTP_CONNTRACK_ESTABLISHED && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } } spin_unlock_bh(&ct->lock); /* allow but do not refresh timeout */ if (ignore) return NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; out_unlock: spin_unlock_bh(&ct->lock); out: return -NF_ACCEPT; } static bool sctp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.sctp.state) { case SCTP_CONNTRACK_SHUTDOWN_SENT: case SCTP_CONNTRACK_SHUTDOWN_RECD: case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT: return true; default: break; } return false; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, }; #define SCTP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 4) + \ NLA_ALIGN(NLA_HDRLEN + 4)) static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; int err; /* updates may not contain the internal protocol info, skip parsing */ if (!attr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr, sctp_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_PROTOINFO_SCTP_STATE] || !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) return -EINVAL; spin_lock_bh(&ct->lock); ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); spin_unlock_bh(&ct->lock); return 0; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; if (!timeouts) timeouts = sn->timeouts; /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; /* there's a 1:1 mapping between attributes and protocol states. */ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (tb[i]) { timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED]; return 0; } static int sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; int i; for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) goto nla_put_failure; } return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_sctp_init_net(struct net *net) { struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; for (i = 0; i < SCTP_CONNTRACK_MAX; i++) sn->timeouts[i] = sctp_timeouts[i]; /* timeouts[0] is unused, init it so ->timeouts[0] contains * 'new' timeout, like udp or icmp. */ sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .l4proto = IPPROTO_SCTP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .can_early_drop = sctp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, .from_nlattr = nlattr_to_sctp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = sctp_timeout_nlattr_to_obj, .obj_to_nlattr = sctp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_SCTP_MAX, .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, .nla_policy = sctp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
21 36 20 7 4 27 1 2 14 2 2 13 1 8 38 9 3 65 66 65 2 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GRO_H #define _NET_GRO_H #include <linux/indirect_call_wrapper.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <linux/skbuff.h> #include <net/udp.h> #include <net/hotdata.h> struct napi_gro_cb { union { struct { /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ void *frag0; /* Length of frag0. */ unsigned int frag0_len; }; struct { /* used in skb_gro_receive() slow path */ struct sk_buff *last; /* jiffies when first packet was created/queued */ unsigned long age; }; }; /* This indicates where we are processing relative to skb->data. */ int data_offset; /* This is non-zero if the packet cannot be merged with the new skb. */ u16 flush; /* Number of segments aggregated. */ u16 count; /* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ u16 proto; u16 pad; /* Used in napi_gro_cb::free */ #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 /* portion of the cb set to zero at every gro iteration */ struct_group(zeroed, /* Start offset for remote checksum offload */ u16 gro_remcsum_start; /* This is non-zero if the packet may be of the same flow. */ u8 same_flow:1; /* Used in tunnel GRO receive */ u8 encap_mark:1; /* GRO checksum is valid */ u8 csum_valid:1; /* Number of checksums via CHECKSUM_UNNECESSARY */ u8 csum_cnt:3; /* Free the skb? */ u8 free:2; /* Used in foo-over-udp, set in udp[46]_gro_receive */ u8 is_ipv6:1; /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; /* Used to determine if ipid_offset can be ignored */ u8 ip_fixedid:1; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; /* GRO is done by frag_list pointer chaining. */ u8 is_flist:1; ); /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; /* L3 offsets */ union { struct { u16 network_offset; u16 inner_network_offset; }; u16 network_offsets[2]; }; }; #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) #define GRO_RECURSION_LIMIT 15 static inline int gro_recursion_inc_test(struct sk_buff *skb) { return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; } typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); static inline struct sk_buff *call_gro_receive(gro_receive_t cb, struct list_head *head, struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; return NULL; } return cb(head, skb); } typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, struct sk_buff *); static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, struct sock *sk, struct list_head *head, struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; return NULL; } return cb(sk, head, skb); } static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { return NAPI_GRO_CB(skb)->data_offset; } static inline unsigned int skb_gro_len(const struct sk_buff *skb) { return skb->len - NAPI_GRO_CB(skb)->data_offset; } static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) { NAPI_GRO_CB(skb)->data_offset += len; } static inline void *skb_gro_header_fast(const struct sk_buff *skb, unsigned int offset) { return NAPI_GRO_CB(skb)->frag0 + offset; } static inline bool skb_gro_may_pull(const struct sk_buff *skb, unsigned int hlen) { return likely(hlen <= NAPI_GRO_CB(skb)->frag0_len); } static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { if (!pskb_may_pull(skb, hlen)) return NULL; return skb->data + offset; } static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { void *ptr; ptr = skb_gro_header_fast(skb, offset); if (!skb_gro_may_pull(skb, hlen)) ptr = skb_gro_header_slow(skb, hlen, offset); return ptr; } static inline int skb_gro_receive_network_offset(const struct sk_buff *skb) { return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark]; } static inline void *skb_gro_network_header(const struct sk_buff *skb) { if (skb_gro_may_pull(skb, skb_gro_offset(skb))) return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb)); return skb->data + skb_gro_receive_network_offset(skb); } static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb, int proto) { const struct iphdr *iph = skb_gro_network_header(skb); return csum_tcpudp_nofold(iph->saddr, iph->daddr, skb_gro_len(skb), proto, 0); } static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (NAPI_GRO_CB(skb)->csum_valid) NAPI_GRO_CB(skb)->csum = wsum_negate(csum_partial(start, len, wsum_negate(NAPI_GRO_CB(skb)->csum))); } /* GRO checksum functions. These are logical equivalents of the normal * checksum functions (in skbuff.h) except that they operate on the GRO * offsets and fields in sk_buff. */ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb); static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) { return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); } static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { return ((skb->ip_summed != CHECKSUM_PARTIAL || skb_checksum_start_offset(skb) < skb_gro_offset(skb)) && !skb_at_gro_remcsum_start(skb) && NAPI_GRO_CB(skb)->csum_cnt == 0 && (!zero_okay || check)); } static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, __wsum psum) { if (NAPI_GRO_CB(skb)->csum_valid && !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) return 0; NAPI_GRO_CB(skb)->csum = psum; return __skb_gro_checksum_complete(skb); } static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->csum_cnt > 0) { /* Consume a checksum from CHECKSUM_UNNECESSARY */ NAPI_GRO_CB(skb)->csum_cnt--; } else { /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we * verified a new top level checksum or an encapsulated one * during GRO. This saves work if we fallback to normal path. */ __skb_incr_checksum_unnecessary(skb); } } #define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ compute_pseudo) \ ({ \ __sum16 __ret = 0; \ if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_gro_checksum_validate_complete(skb, \ compute_pseudo(skb, proto)); \ if (!__ret) \ skb_gro_incr_csum_unnecessary(skb); \ __ret; \ }) #define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) #define skb_gro_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) #define skb_gro_checksum_simple_validate(skb) \ __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) { return (NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid); } static inline void __skb_gro_checksum_convert(struct sk_buff *skb, __wsum pseudo) { NAPI_GRO_CB(skb)->csum = ~pseudo; NAPI_GRO_CB(skb)->csum_valid = 1; } #define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_gro_checksum_convert_check(skb)) \ __skb_gro_checksum_convert(skb, \ compute_pseudo(skb, proto)); \ } while (0) struct gro_remcsum { int offset; __wsum delta; }; static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) { grc->offset = 0; grc->delta = 0; } static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, unsigned int off, size_t hdrlen, int start, int offset, struct gro_remcsum *grc, bool nopartial) { __wsum delta; size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); if (!nopartial) { NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; return ptr; } ptr = skb_gro_header(skb, off + plen, off); if (!ptr) return NULL; delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, start, offset); /* Adjust skb->csum since we changed the packet */ NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); grc->offset = off + hdrlen + offset; grc->delta = delta; return ptr; } static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, struct gro_remcsum *grc) { void *ptr; size_t plen = grc->offset + sizeof(u16); if (!grc->delta) return; ptr = skb_gro_header(skb, plen, grc->offset); if (!ptr) return; remcsum_unadjust((__sum16 *)ptr, grc->delta); } #ifdef CONFIG_XFRM_OFFLOAD static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, struct sk_buff *pp, int flush, struct gro_remcsum *grc) { if (PTR_ERR(pp) != -EINPROGRESS) { NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, grc); skb->remcsum_offload = 0; } } #else static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, struct sk_buff *pp, int flush, struct gro_remcsum *grc) { NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, grc); skb->remcsum_offload = 0; } #endif INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); #define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_INET(cb, f2, f1, head, skb); \ }) struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { struct udphdr *uh; unsigned int hlen, off; off = skb_gro_offset(skb); hlen = off + sizeof(*uh); uh = skb_gro_header(skb, hlen, off); return uh; } static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, int proto) { const struct ipv6hdr *iph = skb_gro_network_header(skb); return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr, skb_gro_len(skb), proto, 0)); } static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, struct sk_buff *p, bool outer) { const u32 id = ntohl(*(__be32 *)&iph->id); const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; const u32 df = id & IP_DF; int flush; /* All fields must match except length and checksum. */ flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); if (flush | (outer && df)) return flush; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ if (count == 1 && df && !ipid_offset) NAPI_GRO_CB(p)->ip_fixedid = true; return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid); } static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) { /* <Version:4><Traffic_Class:8><Flow_Label:20> */ __be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* Flush if Traffic Class fields are different. */ return !!((first_word & htonl(0x0FF00000)) | (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); } static inline int __gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p, const u16 diff, bool outer) { const void *nh = th - diff; const void *nh2 = th2 - diff; if (((struct iphdr *)nh)->version == 6) return ipv6_gro_flush(nh, nh2); else return inet_gro_flush(nh, nh2, p, outer); } static inline int gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p) { const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; int off = skb_transport_offset(p); int flush; flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); if (encap_mark) flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); return flush; } int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ static inline void gro_normal_list(struct napi_struct *napi) { if (!napi->rx_count) return; netif_receive_skb_list_internal(&napi->rx_list); INIT_LIST_HEAD(&napi->rx_list); napi->rx_count = 0; } /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) { list_add_tail(&skb->list, &napi->rx_list); napi->rx_count += segs; if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) gro_normal_list(napi); } /* This function is the alternative of 'inet_iif' and 'inet_sdif' * functions in case we can not rely on fields of IPCB. * * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. * The caller must hold the RCU read lock. */ static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) { *iif = inet_iif(skb) ?: skb->dev->ifindex; *sdif = 0; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (netif_is_l3_slave(skb->dev)) { struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); *sdif = *iif; *iif = master ? master->ifindex : 0; } #endif } /* This function is the alternative of 'inet6_iif' and 'inet6_sdif' * functions in case we can not rely on fields of IP6CB. * * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. * The caller must hold the RCU read lock. */ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) { /* using skb->dev->ifindex because skb_dst(skb) is not initialized */ *iif = skb->dev->ifindex; *sdif = 0; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (netif_is_l3_slave(skb->dev)) { struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); *sdif = *iif; *iif = master ? master->ifindex : 0; } #endif } struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); #endif /* _NET_GRO_H */ |
11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/mm.h> #include <asm/current.h> #include <asm/traps.h> #include <asm/vdso.h> struct vdso_exception_table_entry { int insn, fixup; }; bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct vdso_image *image = current->mm->context.vdso_image; const struct vdso_exception_table_entry *extable; unsigned int nr_entries, i; unsigned long base; /* * Do not attempt to fixup #DB or #BP. It's impossible to identify * whether or not a #DB/#BP originated from within an SGX enclave and * SGX enclaves are currently the only use case for vDSO fixup. */ if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP) return false; if (!current->mm->context.vdso) return false; base = (unsigned long)current->mm->context.vdso + image->extable_base; nr_entries = image->extable_len / (sizeof(*extable)); extable = image->extable; for (i = 0; i < nr_entries; i++) { if (regs->ip == base + extable[i].insn) { regs->ip = base + extable[i].fixup; regs->di = trapnr; regs->si = error_code; regs->dx = fault_addr; return true; } } return false; } |
6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Credential hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #include <linux/cred.h> #include <linux/lsm_hooks.h> #include "common.h" #include "cred.h" #include "ruleset.h" #include "setup.h" static void hook_cred_transfer(struct cred *const new, const struct cred *const old) { struct landlock_ruleset *const old_dom = landlock_cred(old)->domain; if (old_dom) { landlock_get_ruleset(old_dom); landlock_cred(new)->domain = old_dom; } } static int hook_cred_prepare(struct cred *const new, const struct cred *const old, const gfp_t gfp) { hook_cred_transfer(new, old); return 0; } static void hook_cred_free(struct cred *const cred) { struct landlock_ruleset *const dom = landlock_cred(cred)->domain; if (dom) landlock_put_ruleset_deferred(dom); } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, hook_cred_prepare), LSM_HOOK_INIT(cred_transfer, hook_cred_transfer), LSM_HOOK_INIT(cred_free, hook_cred_free), }; __init void landlock_add_cred_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); } |
275 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic address resolution entity * * Authors: * net_random Alan Cox * net_ratelimit Andi Kleen * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project * * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/mm.h> #include <linux/net.h> #include <linux/string.h> #include <linux/types.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/ratelimit.h> #include <linux/socket.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/ipv6.h> #include <asm/byteorder.h> #include <linux/uaccess.h> DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); /* * All net warning printk()s should be guarded by this function. */ int net_ratelimit(void) { return __ratelimit(&net_ratelimit_state); } EXPORT_SYMBOL(net_ratelimit); /* * Convert an ASCII string to binary IP. * This is outside of net/ipv4/ because various code that uses IP addresses * is otherwise not dependent on the TCP/IP stack. */ __be32 in_aton(const char *str) { unsigned int l; unsigned int val; int i; l = 0; for (i = 0; i < 4; i++) { l <<= 8; if (*str != '\0') { val = 0; while (*str != '\0' && *str != '.' && *str != '\n') { val *= 10; val += *str - '0'; str++; } l |= val; if (*str != '\0') str++; } } return htonl(l); } EXPORT_SYMBOL(in_aton); #define IN6PTON_XDIGIT 0x00010000 #define IN6PTON_DIGIT 0x00020000 #define IN6PTON_COLON_MASK 0x00700000 #define IN6PTON_COLON_1 0x00100000 /* single : requested */ #define IN6PTON_COLON_2 0x00200000 /* second : requested */ #define IN6PTON_COLON_1_2 0x00400000 /* :: requested */ #define IN6PTON_DOT 0x00800000 /* . */ #define IN6PTON_DELIM 0x10000000 #define IN6PTON_NULL 0x20000000 /* first/tail */ #define IN6PTON_UNKNOWN 0x40000000 static inline int xdigit2bin(char c, int delim) { int val; if (c == delim || c == '\0') return IN6PTON_DELIM; if (c == ':') return IN6PTON_COLON_MASK; if (c == '.') return IN6PTON_DOT; val = hex_to_bin(c); if (val >= 0) return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0); if (delim == -1) return IN6PTON_DELIM; return IN6PTON_UNKNOWN; } /** * in4_pton - convert an IPv4 address from literal to binary representation * @src: the start of the IPv4 address string * @srclen: the length of the string, -1 means strlen(src) * @dst: the binary (u8[4] array) representation of the IPv4 address * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter * @end: A pointer to the end of the parsed string will be placed here * * Return one on success, return zero when any error occurs * and @end will point to the end of the parsed string. * */ int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end) { const char *s; u8 *d; u8 dbuf[4]; int ret = 0; int i; int w = 0; if (srclen < 0) srclen = strlen(src); s = src; d = dbuf; i = 0; while (1) { int c; c = xdigit2bin(srclen > 0 ? *s : '\0', delim); if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) { goto out; } if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) { if (w == 0) goto out; *d++ = w & 0xff; w = 0; i++; if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { if (i != 4) goto out; break; } goto cont; } w = (w * 10) + c; if ((w & 0xffff) > 255) { goto out; } cont: if (i >= 4) goto out; s++; srclen--; } ret = 1; memcpy(dst, dbuf, sizeof(dbuf)); out: if (end) *end = s; return ret; } EXPORT_SYMBOL(in4_pton); /** * in6_pton - convert an IPv6 address from literal to binary representation * @src: the start of the IPv6 address string * @srclen: the length of the string, -1 means strlen(src) * @dst: the binary (u8[16] array) representation of the IPv6 address * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter * @end: A pointer to the end of the parsed string will be placed here * * Return one on success, return zero when any error occurs * and @end will point to the end of the parsed string. * */ int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end) { const char *s, *tok = NULL; u8 *d, *dc = NULL; u8 dbuf[16]; int ret = 0; int i; int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL; int w = 0; memset(dbuf, 0, sizeof(dbuf)); s = src; d = dbuf; if (srclen < 0) srclen = strlen(src); while (1) { int c; c = xdigit2bin(srclen > 0 ? *s : '\0', delim); if (!(c & state)) goto out; if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { /* process one 16-bit word */ if (!(state & IN6PTON_NULL)) { *d++ = (w >> 8) & 0xff; *d++ = w & 0xff; } w = 0; if (c & IN6PTON_DELIM) { /* We've processed last word */ break; } /* * COLON_1 => XDIGIT * COLON_2 => XDIGIT|DELIM * COLON_1_2 => COLON_2 */ switch (state & IN6PTON_COLON_MASK) { case IN6PTON_COLON_2: dc = d; state = IN6PTON_XDIGIT | IN6PTON_DELIM; if (dc - dbuf >= sizeof(dbuf)) state |= IN6PTON_NULL; break; case IN6PTON_COLON_1|IN6PTON_COLON_1_2: state = IN6PTON_XDIGIT | IN6PTON_COLON_2; break; case IN6PTON_COLON_1: state = IN6PTON_XDIGIT; break; case IN6PTON_COLON_1_2: state = IN6PTON_COLON_2; break; default: state = 0; } tok = s + 1; goto cont; } if (c & IN6PTON_DOT) { ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s); if (ret > 0) { d += 4; break; } goto out; } w = (w << 4) | (0xff & c); state = IN6PTON_COLON_1 | IN6PTON_DELIM; if (!(w & 0xf000)) { state |= IN6PTON_XDIGIT; } if (!dc && d + 2 < dbuf + sizeof(dbuf)) { state |= IN6PTON_COLON_1_2; state &= ~IN6PTON_DELIM; } if (d + 2 >= dbuf + sizeof(dbuf)) { state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2); } cont: if ((dc && d + 4 < dbuf + sizeof(dbuf)) || d + 4 == dbuf + sizeof(dbuf)) { state |= IN6PTON_DOT; } if (d >= dbuf + sizeof(dbuf)) { state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK); } s++; srclen--; } i = 15; d--; if (dc) { while (d >= dc) dst[i--] = *d--; while (i >= dc - dbuf) dst[i--] = 0; while (i >= 0) dst[i--] = *d--; } else memcpy(dst, dbuf, sizeof(dbuf)); ret = 1; out: if (end) *end = s; return ret; } EXPORT_SYMBOL(in6_pton); static int inet4_pton(const char *src, u16 port_num, struct sockaddr_storage *addr) { struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; size_t srclen = strlen(src); if (srclen > INET_ADDRSTRLEN) return -EINVAL; if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr, '\n', NULL) == 0) return -EINVAL; addr4->sin_family = AF_INET; addr4->sin_port = htons(port_num); return 0; } static int inet6_pton(struct net *net, const char *src, u16 port_num, struct sockaddr_storage *addr) { struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; const char *scope_delim; size_t srclen = strlen(src); if (srclen > INET6_ADDRSTRLEN) return -EINVAL; if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr, '%', &scope_delim) == 0) return -EINVAL; if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL && src + srclen != scope_delim && *scope_delim == '%') { struct net_device *dev; char scope_id[16]; size_t scope_len = min_t(size_t, sizeof(scope_id) - 1, src + srclen - scope_delim - 1); memcpy(scope_id, scope_delim + 1, scope_len); scope_id[scope_len] = '\0'; dev = dev_get_by_name(net, scope_id); if (dev) { addr6->sin6_scope_id = dev->ifindex; dev_put(dev); } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) { return -EINVAL; } } addr6->sin6_family = AF_INET6; addr6->sin6_port = htons(port_num); return 0; } /** * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address * @net: net namespace (used for scope handling) * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either * @src: the start of the address string * @port: the start of the port string (or NULL for none) * @addr: output socket address * * Return zero on success, return errno when any error occurs. */ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, const char *src, const char *port, struct sockaddr_storage *addr) { u16 port_num; int ret = -EINVAL; if (port) { if (kstrtou16(port, 0, &port_num)) return -EINVAL; } else { port_num = 0; } switch (af) { case AF_INET: ret = inet4_pton(src, port_num, addr); break; case AF_INET6: ret = inet6_pton(net, src, port_num, addr); break; case AF_UNSPEC: ret = inet4_pton(src, port_num, addr); if (ret) ret = inet6_pton(net, src, port_num, addr); break; default: pr_err("unexpected address family %d\n", af); } return ret; } EXPORT_SYMBOL(inet_pton_with_scope); bool inet_addr_is_any(struct sockaddr *addr) { if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; const struct sockaddr_in6 in6_any = { .sin6_addr = IN6ADDR_ANY_INIT }; if (!memcmp(in6->sin6_addr.s6_addr, in6_any.sin6_addr.s6_addr, 16)) return true; } else if (addr->sa_family == AF_INET) { struct sockaddr_in *in = (struct sockaddr_in *)addr; if (in->sin_addr.s_addr == htonl(INADDR_ANY)) return true; } else { pr_warn("unexpected address family %u\n", addr->sa_family); } return false; } EXPORT_SYMBOL(inet_addr_is_any); void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_replace4(sum, from, to); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) skb->csum = ~csum_add(csum_sub(~(skb->csum), (__force __wsum)from), (__force __wsum)to); } else if (pseudohdr) *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), (__force __wsum)from), (__force __wsum)to)); } EXPORT_SYMBOL(inet_proto_csum_replace4); /** * inet_proto_csum_replace16 - update layer 4 header checksum field * @sum: Layer 4 header checksum field * @skb: sk_buff for the packet * @from: old IPv6 address * @to: new IPv6 address * @pseudohdr: True if layer 4 header checksum includes pseudoheader * * Update layer 4 header as per the update in IPv6 src/dst address. * * There is no need to update skb->csum in this function, because update in two * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to * update skb->csum, because update in 3 fields a.) IPv4 src/dst address, * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as * L4 Header checksum for skb->csum calculation. */ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr) { __be32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], to[0], to[1], to[2], to[3], }; if (skb->ip_summed != CHECKSUM_PARTIAL) { *sum = csum_fold(csum_partial(diff, sizeof(diff), ~csum_unfold(*sum))); } else if (pseudohdr) *sum = ~csum_fold(csum_partial(diff, sizeof(diff), csum_unfold(*sum))); } EXPORT_SYMBOL(inet_proto_csum_replace16); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_replace_by_diff(sum, diff); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) skb->csum = ~csum_sub(diff, skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); } } EXPORT_SYMBOL(inet_proto_csum_replace_by_diff); |
8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 | // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <net/ipv6.h> #include <net/icmp.h> #include <net/udp.h> #include <net/tcp.h> #include <net/route.h> #include <linux/netfilter.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/xt_LOG.h> #include <net/netfilter/nf_log.h> static const struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { .level = LOGLEVEL_NOTICE, .logflags = NF_LOG_DEFAULT_MASK, }, }, }; struct arppayload { unsigned char mac_src[ETH_ALEN]; unsigned char ip_src[4]; unsigned char mac_dst[ETH_ALEN]; unsigned char ip_dst[4]; }; /* Guard against containers flooding syslog. */ static bool nf_log_allowed(const struct net *net) { return net_eq(net, &init_net) || sysctl_nf_log_all_netns; } static void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) { u16 vid; if (!skb_vlan_tag_present(skb)) return; vid = skb_vlan_tag_get(skb); nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); } static void noinline_for_stack dump_arp_packet(struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int nhoff) { const struct arppayload *ap; struct arppayload _arpp; const struct arphdr *ah; unsigned int logflags; struct arphdr _arph; ah = skb_header_pointer(skb, nhoff, sizeof(_arph), &_arph); if (!ah) { nf_log_buf_add(m, "TRUNCATED"); return; } if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else logflags = NF_LOG_DEFAULT_MASK; if (logflags & NF_LOG_MACDECODE) { nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); nf_log_dump_vlan(m, skb); nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto)); } nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); /* If it's for Ethernet and the lengths are OK, then log the ARP * payload. */ if (ah->ar_hrd != htons(ARPHRD_ETHER) || ah->ar_hln != ETH_ALEN || ah->ar_pln != sizeof(__be32)) return; ap = skb_header_pointer(skb, nhoff + sizeof(_arph), sizeof(_arpp), &_arpp); if (!ap) { nf_log_buf_add(m, " INCOMPLETE [%zu bytes]", skb->len - sizeof(_arph)); return; } nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4", ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); } static void nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix, struct net *net) { const struct net_device *physoutdev __maybe_unused; const struct net_device *physindev __maybe_unused; nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", '0' + loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) physindev = nf_bridge_get_physindev(skb, net); if (physindev && in != physindev) nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); physoutdev = nf_bridge_get_physoutdev(skb); if (physoutdev && out != physoutdev) nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); #endif } static void nf_log_arp_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { struct nf_log_buf *m; if (!nf_log_allowed(net)) return; m = nf_log_buf_open(); if (!loginfo) loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix, net); dump_arp_packet(m, loginfo, skb, skb_network_offset(skb)); nf_log_buf_close(m); } static struct nf_logger nf_arp_logger __read_mostly = { .name = "nf_log_arp", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_arp_packet, .me = THIS_MODULE, }; static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, struct sock *sk) { if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) return; read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { const struct cred *cred = sk->sk_socket->file->f_cred; nf_log_buf_add(m, "UID=%u GID=%u ", from_kuid_munged(&init_user_ns, cred->fsuid), from_kgid_munged(&init_user_ns, cred->fsgid)); } read_unlock_bh(&sk->sk_callback_lock); } static noinline_for_stack int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, u8 proto, int fragment, unsigned int offset, unsigned int logflags) { struct tcphdr _tcph; const struct tcphdr *th; /* Max length: 10 "PROTO=TCP " */ nf_log_buf_add(m, "PROTO=TCP "); if (fragment) return 0; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (!th) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); return 1; } /* Max length: 20 "SPT=65535 DPT=65535 " */ nf_log_buf_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (logflags & NF_LOG_TCPSEQ) { nf_log_buf_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); } /* Max length: 13 "WINDOW=65535 " */ nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3C " */ nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ if (th->cwr) nf_log_buf_add(m, "CWR "); if (th->ece) nf_log_buf_add(m, "ECE "); if (th->urg) nf_log_buf_add(m, "URG "); if (th->ack) nf_log_buf_add(m, "ACK "); if (th->psh) nf_log_buf_add(m, "PSH "); if (th->rst) nf_log_buf_add(m, "RST "); if (th->syn) nf_log_buf_add(m, "SYN "); if (th->fin) nf_log_buf_add(m, "FIN "); /* Max length: 11 "URGP=65535 " */ nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & NF_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { unsigned int optsize = th->doff * 4 - sizeof(struct tcphdr); u8 _opt[60 - sizeof(struct tcphdr)]; unsigned int i; const u8 *op; op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), optsize, _opt); if (!op) { nf_log_buf_add(m, "OPT (TRUNCATED)"); return 1; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ nf_log_buf_add(m, "OPT ("); for (i = 0; i < optsize; i++) nf_log_buf_add(m, "%02X", op[i]); nf_log_buf_add(m, ") "); } return 0; } static noinline_for_stack int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, u8 proto, int fragment, unsigned int offset) { struct udphdr _udph; const struct udphdr *uh; if (proto == IPPROTO_UDP) /* Max length: 10 "PROTO=UDP " */ nf_log_buf_add(m, "PROTO=UDP "); else /* Max length: 14 "PROTO=UDPLITE " */ nf_log_buf_add(m, "PROTO=UDPLITE "); if (fragment) goto out; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (!uh) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); return 1; } /* Max length: 20 "SPT=65535 DPT=65535 " */ nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); out: return 0; } /* One level of recursion won't kill us */ static noinline_for_stack void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { const struct iphdr *ih; unsigned int logflags; struct iphdr _iph; if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (!ih) { nf_log_buf_add(m, "TRUNCATED"); return; } /* Important fields: * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. * Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ if (ntohs(ih->frag_off) & IP_CE) nf_log_buf_add(m, "CE "); if (ntohs(ih->frag_off) & IP_DF) nf_log_buf_add(m, "DF "); if (ntohs(ih->frag_off) & IP_MF) nf_log_buf_add(m, "MF "); /* Max length: 11 "FRAG:65535 " */ if (ntohs(ih->frag_off) & IP_OFFSET) nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); if ((logflags & NF_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; const unsigned char *op; unsigned int i, optsize; optsize = ih->ihl * 4 - sizeof(struct iphdr); op = skb_header_pointer(skb, iphoff + sizeof(_iph), optsize, _opt); if (!op) { nf_log_buf_add(m, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ nf_log_buf_add(m, "OPT ("); for (i = 0; i < optsize; i++) nf_log_buf_add(m, "%02X", op[i]); nf_log_buf_add(m, ") "); } switch (ih->protocol) { case IPPROTO_TCP: if (nf_log_dump_tcp_header(m, skb, ih->protocol, ntohs(ih->frag_off) & IP_OFFSET, iphoff + ih->ihl * 4, logflags)) return; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: if (nf_log_dump_udp_header(m, skb, ih->protocol, ntohs(ih->frag_off) & IP_OFFSET, iphoff + ih->ihl * 4)) return; break; case IPPROTO_ICMP: { static const size_t required_len[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = 4, [ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr), [ICMP_SOURCE_QUENCH] = 8 + sizeof(struct iphdr), [ICMP_REDIRECT] = 8 + sizeof(struct iphdr), [ICMP_ECHO] = 4, [ICMP_TIME_EXCEEDED] = 8 + sizeof(struct iphdr), [ICMP_PARAMETERPROB] = 8 + sizeof(struct iphdr), [ICMP_TIMESTAMP] = 20, [ICMP_TIMESTAMPREPLY] = 20, [ICMP_ADDRESS] = 12, [ICMP_ADDRESSREPLY] = 12 }; const struct icmphdr *ich; struct icmphdr _icmph; /* Max length: 11 "PROTO=ICMP " */ nf_log_buf_add(m, "PROTO=ICMP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_icmph), &_icmph); if (!ich) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl * 4); break; } /* Max length: 18 "TYPE=255 CODE=255 " */ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ if (ich->type <= NR_ICMP_TYPES && required_len[ich->type] && skb->len - iphoff - ih->ihl * 4 < required_len[ich->type]) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl * 4); break; } switch (ich->type) { case ICMP_ECHOREPLY: case ICMP_ECHO: /* Max length: 19 "ID=65535 SEQ=65535 " */ nf_log_buf_add(m, "ID=%u SEQ=%u ", ntohs(ich->un.echo.id), ntohs(ich->un.echo.sequence)); break; case ICMP_PARAMETERPROB: /* Max length: 14 "PARAMETER=255 " */ nf_log_buf_add(m, "PARAMETER=%u ", ntohl(ich->un.gateway) >> 24); break; case ICMP_REDIRECT: /* Max length: 24 "GATEWAY=255.255.255.255 " */ nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); fallthrough; case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_TIME_EXCEEDED: /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. */ nf_log_buf_add(m, "["); dump_ipv4_packet(net, m, info, skb, iphoff + ih->ihl * 4 + sizeof(_icmph)); nf_log_buf_add(m, "] "); } /* Max length: 10 "MTU=65535 " */ if (ich->type == ICMP_DEST_UNREACH && ich->code == ICMP_FRAG_NEEDED) { nf_log_buf_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); } } break; } /* Max Length */ case IPPROTO_AH: { const struct ip_auth_hdr *ah; struct ip_auth_hdr _ahdr; if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 9 "PROTO=AH " */ nf_log_buf_add(m, "PROTO=AH "); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ah = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_ahdr), &_ahdr); if (!ah) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl * 4); break; } /* Length: 15 "SPI=0xF1234567 " */ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); break; } case IPPROTO_ESP: { const struct ip_esp_hdr *eh; struct ip_esp_hdr _esph; /* Max length: 10 "PROTO=ESP " */ nf_log_buf_add(m, "PROTO=ESP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ eh = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_esph), &_esph); if (!eh) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl * 4); break; } /* Length: 15 "SPI=0xF1234567 " */ nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi)); break; } /* Max length: 10 "PROTO 255 " */ default: nf_log_buf_add(m, "PROTO=%u ", ih->protocol); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && !iphoff) nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) nf_log_buf_add(m, "MARK=0x%x ", skb->mark); /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ /* UDP: 10+max(25,20) = 35 */ /* UDPLITE: 14+max(25,20) = 39 */ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ /* ESP: 10+max(25)+15 = 50 */ /* AH: 9+max(25)+15 = 49 */ /* unknown: 10 */ /* (ICMP allows recursion one level deep) */ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ /* maxlen = 230+ 91 + 230 + 252 = 803 */ } static noinline_for_stack void dump_ipv6_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) { const struct ipv6hdr *ih; unsigned int hdrlen = 0; unsigned int logflags; struct ipv6hdr _ip6h; unsigned int ptr; u8 currenthdr; int fragment; if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (!ih) { nf_log_buf_add(m, "TRUNCATED"); return; } /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", ntohs(ih->payload_len) + sizeof(struct ipv6hdr), (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, ih->hop_limit, (ntohl(*(__be32 *)ih) & 0x000fffff)); fragment = 0; ptr = ip6hoff + sizeof(struct ipv6hdr); currenthdr = ih->nexthdr; while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) { struct ipv6_opt_hdr _hdr; const struct ipv6_opt_hdr *hp; hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); if (!hp) { nf_log_buf_add(m, "TRUNCATED"); return; } /* Max length: 48 "OPT (...) " */ if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, "OPT ( "); switch (currenthdr) { case IPPROTO_FRAGMENT: { struct frag_hdr _fhdr; const struct frag_hdr *fh; nf_log_buf_add(m, "FRAG:"); fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), &_fhdr); if (!fh) { nf_log_buf_add(m, "TRUNCATED "); return; } /* Max length: 6 "65535 " */ nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); /* Max length: 11 "INCOMPLETE " */ if (fh->frag_off & htons(0x0001)) nf_log_buf_add(m, "INCOMPLETE "); nf_log_buf_add(m, "ID:%08x ", ntohl(fh->identification)); if (ntohs(fh->frag_off) & 0xFFF8) fragment = 1; hdrlen = 8; break; } case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: if (fragment) { if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, ")"); return; } hdrlen = ipv6_optlen(hp); break; /* Max Length */ case IPPROTO_AH: if (logflags & NF_LOG_IPOPT) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; /* Max length: 3 "AH " */ nf_log_buf_add(m, "AH "); if (fragment) { nf_log_buf_add(m, ")"); return; } ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), &_ahdr); if (!ah) { /* Max length: 26 "INCOMPLETE [65535 bytes] )" */ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", skb->len - ptr); return; } /* Length: 15 "SPI=0xF1234567 */ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); } hdrlen = ipv6_authlen(hp); break; case IPPROTO_ESP: if (logflags & NF_LOG_IPOPT) { struct ip_esp_hdr _esph; const struct ip_esp_hdr *eh; /* Max length: 4 "ESP " */ nf_log_buf_add(m, "ESP "); if (fragment) { nf_log_buf_add(m, ")"); return; } /* Max length: 26 "INCOMPLETE [65535 bytes] )" */ eh = skb_header_pointer(skb, ptr, sizeof(_esph), &_esph); if (!eh) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", skb->len - ptr); return; } /* Length: 16 "SPI=0xF1234567 )" */ nf_log_buf_add(m, "SPI=0x%x )", ntohl(eh->spi)); } return; default: /* Max length: 20 "Unknown Ext Hdr 255" */ nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); return; } if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, ") "); currenthdr = hp->nexthdr; ptr += hdrlen; } switch (currenthdr) { case IPPROTO_TCP: if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment, ptr, logflags)) return; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr)) return; break; case IPPROTO_ICMPV6: { struct icmp6hdr _icmp6h; const struct icmp6hdr *ic; /* Max length: 13 "PROTO=ICMPv6 " */ nf_log_buf_add(m, "PROTO=ICMPv6 "); if (fragment) break; /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); if (!ic) { nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); return; } /* Max length: 18 "TYPE=255 CODE=255 " */ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); switch (ic->icmp6_type) { case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: /* Max length: 19 "ID=65535 SEQ=65535 " */ nf_log_buf_add(m, "ID=%u SEQ=%u ", ntohs(ic->icmp6_identifier), ntohs(ic->icmp6_sequence)); break; case ICMPV6_MGM_QUERY: case ICMPV6_MGM_REPORT: case ICMPV6_MGM_REDUCTION: break; case ICMPV6_PARAMPROB: /* Max length: 17 "POINTER=ffffffff " */ nf_log_buf_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); fallthrough; case ICMPV6_DEST_UNREACH: case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: /* Max length: 3+maxlen */ if (recurse) { nf_log_buf_add(m, "["); dump_ipv6_packet(net, m, info, skb, ptr + sizeof(_icmp6h), 0); nf_log_buf_add(m, "] "); } /* Max length: 10 "MTU=65535 " */ if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) { nf_log_buf_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); } } break; } /* Max length: 10 "PROTO=255 " */ default: nf_log_buf_add(m, "PROTO=%u ", currenthdr); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && recurse) nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (recurse && skb->mark) nf_log_buf_add(m, "MARK=0x%x ", skb->mark); } static void dump_mac_header(struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb) { struct net_device *dev = skb->dev; unsigned int logflags = 0; if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; if (!(logflags & NF_LOG_MACDECODE)) goto fallback; switch (dev->type) { case ARPHRD_ETHER: nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); nf_log_dump_vlan(m, skb); nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto)); return; default: break; } fallback: nf_log_buf_add(m, "MAC="); if (dev->hard_header_len && skb->mac_header != skb->network_header) { const unsigned char *p = skb_mac_header(skb); unsigned int i; if (dev->type == ARPHRD_SIT) { p -= ETH_HLEN; if (p < skb->head) p = NULL; } if (p) { nf_log_buf_add(m, "%02x", *p++); for (i = 1; i < dev->hard_header_len; i++) nf_log_buf_add(m, ":%02x", *p++); } if (dev->type == ARPHRD_SIT) { const struct iphdr *iph = (struct iphdr *)skb_mac_header(skb); nf_log_buf_add(m, " TUNNEL=%pI4->%pI4", &iph->saddr, &iph->daddr); } } nf_log_buf_add(m, " "); } static void nf_log_ip_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { struct nf_log_buf *m; if (!nf_log_allowed(net)) return; m = nf_log_buf_open(); if (!loginfo) loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); dump_ipv4_packet(net, m, loginfo, skb, skb_network_offset(skb)); nf_log_buf_close(m); } static struct nf_logger nf_ip_logger __read_mostly = { .name = "nf_log_ipv4", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_ip_packet, .me = THIS_MODULE, }; static void nf_log_ip6_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { struct nf_log_buf *m; if (!nf_log_allowed(net)) return; m = nf_log_buf_open(); if (!loginfo) loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix, net); if (in) dump_mac_header(m, loginfo, skb); dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1); nf_log_buf_close(m); } static struct nf_logger nf_ip6_logger __read_mostly = { .name = "nf_log_ipv6", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_ip6_packet, .me = THIS_MODULE, }; static void nf_log_unknown_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { struct nf_log_buf *m; if (!nf_log_allowed(net)) return; m = nf_log_buf_open(); if (!loginfo) loginfo = &default_loginfo; nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix, net); dump_mac_header(m, loginfo, skb); nf_log_buf_close(m); } static void nf_log_netdev_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { switch (skb->protocol) { case htons(ETH_P_IP): nf_log_ip_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); break; case htons(ETH_P_IPV6): nf_log_ip6_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): nf_log_arp_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); break; default: nf_log_unknown_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); break; } } static struct nf_logger nf_netdev_logger __read_mostly = { .name = "nf_log_netdev", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_netdev_packet, .me = THIS_MODULE, }; static struct nf_logger nf_bridge_logger __read_mostly = { .name = "nf_log_bridge", .type = NF_LOG_TYPE_LOG, .logfn = nf_log_netdev_packet, .me = THIS_MODULE, }; static int __net_init nf_log_syslog_net_init(struct net *net) { int ret = nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); if (ret) return ret; ret = nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); if (ret) goto err1; ret = nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); if (ret) goto err2; ret = nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger); if (ret) goto err3; ret = nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); if (ret) goto err4; return 0; err4: nf_log_unset(net, &nf_netdev_logger); err3: nf_log_unset(net, &nf_ip6_logger); err2: nf_log_unset(net, &nf_arp_logger); err1: nf_log_unset(net, &nf_ip_logger); return ret; } static void __net_exit nf_log_syslog_net_exit(struct net *net) { nf_log_unset(net, &nf_ip_logger); nf_log_unset(net, &nf_arp_logger); nf_log_unset(net, &nf_ip6_logger); nf_log_unset(net, &nf_netdev_logger); nf_log_unset(net, &nf_bridge_logger); } static struct pernet_operations nf_log_syslog_net_ops = { .init = nf_log_syslog_net_init, .exit = nf_log_syslog_net_exit, }; static int __init nf_log_syslog_init(void) { int ret; ret = register_pernet_subsys(&nf_log_syslog_net_ops); if (ret < 0) return ret; ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger); if (ret < 0) goto err1; ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger); if (ret < 0) goto err2; ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); if (ret < 0) goto err3; ret = nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger); if (ret < 0) goto err4; ret = nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger); if (ret < 0) goto err5; return 0; err5: nf_log_unregister(&nf_netdev_logger); err4: nf_log_unregister(&nf_ip6_logger); err3: nf_log_unregister(&nf_arp_logger); err2: nf_log_unregister(&nf_ip_logger); err1: pr_err("failed to register logger\n"); unregister_pernet_subsys(&nf_log_syslog_net_ops); return ret; } static void __exit nf_log_syslog_exit(void) { unregister_pernet_subsys(&nf_log_syslog_net_ops); nf_log_unregister(&nf_ip_logger); nf_log_unregister(&nf_arp_logger); nf_log_unregister(&nf_ip6_logger); nf_log_unregister(&nf_netdev_logger); nf_log_unregister(&nf_bridge_logger); } module_init(nf_log_syslog_init); module_exit(nf_log_syslog_exit); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Netfilter syslog packet logging"); MODULE_LICENSE("GPL"); MODULE_ALIAS("nf_log_arp"); MODULE_ALIAS("nf_log_bridge"); MODULE_ALIAS("nf_log_ipv4"); MODULE_ALIAS("nf_log_ipv6"); MODULE_ALIAS("nf_log_netdev"); MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0); MODULE_ALIAS_NF_LOGGER(AF_INET, 0); MODULE_ALIAS_NF_LOGGER(3, 0); MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */ MODULE_ALIAS_NF_LOGGER(AF_INET6, 0); |
6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 1 17 17 11 3 7 7 11 17 9 1 8 8 8 1 8 8 2 6 15 15 15 15 3 3 3 3 8 8 8 8 1 1 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 | /* * llc_conn.c - Driver routines for connection component. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/init.h> #include <linux/slab.h> #include <net/llc.h> #include <net/llc_c_ac.h> #include <net/llc_c_ev.h> #include <net/llc_c_st.h> #include <net/llc_conn.h> #include <net/llc_pdu.h> #include <net/llc_sap.h> #include <net/sock.h> #include <net/tcp_states.h> #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif static int llc_find_offset(int state, int ev_type); static void llc_conn_send_pdus(struct sock *sk); static int llc_conn_service(struct sock *sk, struct sk_buff *skb); static int llc_exec_conn_trans_actions(struct sock *sk, const struct llc_conn_state_trans *trans, struct sk_buff *ev); static const struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk, struct sk_buff *skb); /* Offset table on connection states transition diagram */ static int llc_offset_table[NBR_CONN_STATES][NBR_CONN_EV]; int sysctl_llc2_ack_timeout = LLC2_ACK_TIME * HZ; int sysctl_llc2_p_timeout = LLC2_P_TIME * HZ; int sysctl_llc2_rej_timeout = LLC2_REJ_TIME * HZ; int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ; /** * llc_conn_state_process - sends event to connection state machine * @sk: connection * @skb: occurred event * * Sends an event to connection state machine. After processing event * (executing it's actions and changing state), upper layer will be * indicated or confirmed, if needed. Returns 0 for success, 1 for * failure. The socket lock has to be held before calling this function. * * This function always consumes a reference to the skb. */ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) { int rc; struct llc_sock *llc = llc_sk(skb->sk); struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->ind_prim = ev->cfm_prim = 0; /* * Send event to state machine */ rc = llc_conn_service(skb->sk, skb); if (unlikely(rc != 0)) { printk(KERN_ERR "%s: llc_conn_service failed\n", __func__); goto out_skb_put; } switch (ev->ind_prim) { case LLC_DATA_PRIM: skb_get(skb); llc_save_primitive(sk, skb, LLC_DATA_PRIM); if (unlikely(sock_queue_rcv_skb(sk, skb))) { /* * shouldn't happen */ printk(KERN_ERR "%s: sock_queue_rcv_skb failed!\n", __func__); kfree_skb(skb); } break; case LLC_CONN_PRIM: /* * Can't be sock_queue_rcv_skb, because we have to leave the * skb->sk pointing to the newly created struct sock in * llc_conn_handler. -acme */ skb_get(skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_state_change(sk); break; case LLC_DISC_PRIM: sock_hold(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_ESTABLISHED) { sk->sk_shutdown = SHUTDOWN_MASK; sk->sk_socket->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; if (!sock_flag(sk, SOCK_DEAD)) { sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); } } sock_put(sk); break; case LLC_RESET_PRIM: /* * FIXME: * RESET is not being notified to upper layers for now */ printk(KERN_INFO "%s: received a reset ind!\n", __func__); break; default: if (ev->ind_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->ind_prim); /* No indication */ break; } switch (ev->cfm_prim) { case LLC_DATA_PRIM: if (!llc_data_accept_state(llc->state)) sk->sk_write_space(sk); else rc = llc->failed_data_req = 1; break; case LLC_CONN_PRIM: if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_SYN_SENT) { if (ev->status) { sk->sk_socket->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; } else { sk->sk_socket->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; } sk->sk_state_change(sk); } break; case LLC_DISC_PRIM: sock_hold(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSING) { sk->sk_socket->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; sk->sk_state_change(sk); } sock_put(sk); break; case LLC_RESET_PRIM: /* * FIXME: * RESET is not being notified to upper layers for now */ printk(KERN_INFO "%s: received a reset conf!\n", __func__); break; default: if (ev->cfm_prim) printk(KERN_INFO "%s: received unknown %d prim!\n", __func__, ev->cfm_prim); /* No confirmation */ break; } out_skb_put: kfree_skb(skb); return rc; } void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) { /* queue PDU to send to MAC layer */ skb_queue_tail(&sk->sk_write_queue, skb); llc_conn_send_pdus(sk); } /** * llc_conn_rtn_pdu - sends received data pdu to upper layer * @sk: Active connection * @skb: Received data frame * * Sends received data pdu to upper layer (by using indicate function). * Prepares service parameters (prim and prim_data). calling indication * function will be done in llc_conn_state_process. */ void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->ind_prim = LLC_DATA_PRIM; } /** * llc_conn_resend_i_pdu_as_cmd - resend all all unacknowledged I PDUs * @sk: active connection * @nr: NR * @first_p_bit: p_bit value of first pdu * * Resend all unacknowledged I PDUs, starting with the NR; send first as * command PDU with P bit equal first_p_bit; if more than one send * subsequent as command PDUs with P bit equal zero (0). */ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) { struct sk_buff *skb; struct llc_pdu_sn *pdu; u16 nbr_unack_pdus; struct llc_sock *llc; u8 howmany_resend = 0; llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus); if (!nbr_unack_pdus) goto out; /* * Process unack PDUs only if unack queue is not empty; remove * appropriate PDUs, fix them up, and put them on mac_pdu_q. */ llc = llc_sk(sk); while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) { pdu = llc_pdu_sn_hdr(skb); llc_pdu_set_cmd_rsp(skb, LLC_PDU_CMD); llc_pdu_set_pf_bit(skb, first_p_bit); skb_queue_tail(&sk->sk_write_queue, skb); first_p_bit = 0; llc->vS = LLC_I_GET_NS(pdu); howmany_resend++; } if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ llc_conn_send_pdus(sk); out:; } /** * llc_conn_resend_i_pdu_as_rsp - Resend all unacknowledged I PDUs * @sk: active connection. * @nr: NR * @first_f_bit: f_bit value of first pdu. * * Resend all unacknowledged I PDUs, starting with the NR; send first as * response PDU with F bit equal first_f_bit; if more than one send * subsequent as response PDUs with F bit equal zero (0). */ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) { struct sk_buff *skb; u16 nbr_unack_pdus; struct llc_sock *llc = llc_sk(sk); u8 howmany_resend = 0; llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus); if (!nbr_unack_pdus) goto out; /* * Process unack PDUs only if unack queue is not empty; remove * appropriate PDUs, fix them up, and put them on mac_pdu_q */ while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); llc_pdu_set_cmd_rsp(skb, LLC_PDU_RSP); llc_pdu_set_pf_bit(skb, first_f_bit); skb_queue_tail(&sk->sk_write_queue, skb); first_f_bit = 0; llc->vS = LLC_I_GET_NS(pdu); howmany_resend++; } if (howmany_resend > 0) llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; /* any PDUs to re-send are queued up; start sending to MAC */ llc_conn_send_pdus(sk); out:; } /** * llc_conn_remove_acked_pdus - Removes acknowledged pdus from tx queue * @sk: active connection * @nr: NR * @how_many_unacked: size of pdu_unack_q after removing acked pdus * * Removes acknowledged pdus from transmit queue (pdu_unack_q). Returns * the number of pdus that removed from queue. */ int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked) { int pdu_pos, i; struct sk_buff *skb; struct llc_pdu_sn *pdu; int nbr_acked = 0; struct llc_sock *llc = llc_sk(sk); int q_len = skb_queue_len(&llc->pdu_unack_q); if (!q_len) goto out; skb = skb_peek(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); /* finding position of last acked pdu in queue */ pdu_pos = ((int)LLC_2_SEQ_NBR_MODULO + (int)nr - (int)LLC_I_GET_NS(pdu)) % LLC_2_SEQ_NBR_MODULO; for (i = 0; i < pdu_pos && i < q_len; i++) { skb = skb_dequeue(&llc->pdu_unack_q); kfree_skb(skb); nbr_acked++; } out: *how_many_unacked = skb_queue_len(&llc->pdu_unack_q); return nbr_acked; } /** * llc_conn_send_pdus - Sends queued PDUs * @sk: active connection * * Sends queued pdus to MAC layer for transmission. */ static void llc_conn_send_pdus(struct sock *sk) { struct sk_buff *skb; while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if (LLC_PDU_TYPE_IS_I(pdu) && !(skb->dev->flags & IFF_LOOPBACK)) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); if (!skb2) break; skb = skb2; } dev_queue_xmit(skb); } } /** * llc_conn_service - finds transition and changes state of connection * @sk: connection * @skb: happened event * * This function finds transition that matches with happened event, then * executes related actions and finally changes state of connection. * Returns 0 for success, 1 for failure. */ static int llc_conn_service(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_trans *trans; struct llc_sock *llc = llc_sk(sk); int rc = 1; if (llc->state > NBR_CONN_STATES) goto out; rc = 0; trans = llc_qualify_conn_ev(sk, skb); if (trans) { rc = llc_exec_conn_trans_actions(sk, trans, skb); if (!rc && trans->next_state != NO_STATE_CHANGE) { llc->state = trans->next_state; if (!llc_data_accept_state(llc->state)) sk->sk_state_change(sk); } } out: return rc; } /** * llc_qualify_conn_ev - finds transition for event * @sk: connection * @skb: happened event * * This function finds transition that matches with happened event. * Returns pointer to found transition on success, %NULL otherwise. */ static const struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_trans **next_trans; const llc_conn_ev_qfyr_t *next_qualifier; struct llc_conn_state_ev *ev = llc_conn_ev(skb); struct llc_sock *llc = llc_sk(sk); struct llc_conn_state *curr_state = &llc_conn_state_table[llc->state - 1]; /* search thru events for this state until * list exhausted or until no more */ for (next_trans = curr_state->transitions + llc_find_offset(llc->state - 1, ev->type); (*next_trans)->ev; next_trans++) { if (!((*next_trans)->ev)(sk, skb)) { /* got POSSIBLE event match; the event may require * qualification based on the values of a number of * state flags; if all qualifications are met (i.e., * if all qualifying functions return success, or 0, * then this is THE event we're looking for */ for (next_qualifier = (*next_trans)->ev_qualifiers; next_qualifier && *next_qualifier && !(*next_qualifier)(sk, skb); next_qualifier++) /* nothing */; if (!next_qualifier || !*next_qualifier) /* all qualifiers executed successfully; this is * our transition; return it so we can perform * the associated actions & change the state */ return *next_trans; } } return NULL; } /** * llc_exec_conn_trans_actions - executes related actions * @sk: connection * @trans: transition that it's actions must be performed * @skb: event * * Executes actions that is related to happened event. Returns 0 for * success, 1 to indicate failure of at least one action. */ static int llc_exec_conn_trans_actions(struct sock *sk, const struct llc_conn_state_trans *trans, struct sk_buff *skb) { int rc = 0; const llc_conn_action_t *next_action; for (next_action = trans->ev_actions; next_action && *next_action; next_action++) { int rc2 = (*next_action)(sk, skb); if (rc2 == 2) { rc = rc2; break; } else if (rc2) rc = 1; } return rc; } static inline bool llc_estab_match(const struct llc_sap *sap, const struct llc_addr *daddr, const struct llc_addr *laddr, const struct sock *sk, const struct net *net) { struct llc_sock *llc = llc_sk(sk); return net_eq(sock_net(sk), net) && llc->laddr.lsap == laddr->lsap && llc->daddr.lsap == daddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac) && ether_addr_equal(llc->daddr.mac, daddr->mac); } /** * __llc_lookup_established - Finds connection for the remote/local sap/mac * @sap: SAP * @daddr: address of remote LLC (MAC + SAP) * @laddr: address of local LLC (MAC + SAP) * @net: netns to look up a socket in * * Search connection list of the SAP and finds connection using the remote * mac, remote sap, local mac, and local sap. Returns pointer for * connection found, %NULL otherwise. * Caller has to make sure local_bh is disabled. */ static struct sock *__llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, struct llc_addr *laddr, const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; int slot = llc_sk_laddr_hashfn(sap, laddr); struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; rcu_read_lock(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_estab_match(sap, daddr, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || !llc_estab_match(sap, daddr, laddr, rc, net))) { sock_put(rc); continue; } goto found; } } rc = NULL; /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (unlikely(get_nulls_value(node) != slot)) goto again; found: rcu_read_unlock(); return rc; } struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, struct llc_addr *laddr, const struct net *net) { struct sock *sk; local_bh_disable(); sk = __llc_lookup_established(sap, daddr, laddr, net); local_bh_enable(); return sk; } static inline bool llc_listener_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sock *sk, const struct net *net) { struct llc_sock *llc = llc_sk(sk); return net_eq(sock_net(sk), net) && sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN && llc->laddr.lsap == laddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac); } static struct sock *__llc_lookup_listener(struct llc_sap *sap, struct llc_addr *laddr, const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; int slot = llc_sk_laddr_hashfn(sap, laddr); struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; rcu_read_lock(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_listener_match(sap, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || !llc_listener_match(sap, laddr, rc, net))) { sock_put(rc); continue; } goto found; } } rc = NULL; /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (unlikely(get_nulls_value(node) != slot)) goto again; found: rcu_read_unlock(); return rc; } /** * llc_lookup_listener - Finds listener for local MAC + SAP * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * @net: netns to look up a socket in * * Search connection list of the SAP and finds connection listening on * local mac, and local sap. Returns pointer for parent socket found, * %NULL otherwise. * Caller has to make sure local_bh is disabled. */ static struct sock *llc_lookup_listener(struct llc_sap *sap, struct llc_addr *laddr, const struct net *net) { struct sock *rc = __llc_lookup_listener(sap, laddr, net); static struct llc_addr null_addr; if (!rc) rc = __llc_lookup_listener(sap, &null_addr, net); return rc; } static struct sock *__llc_lookup(struct llc_sap *sap, struct llc_addr *daddr, struct llc_addr *laddr, const struct net *net) { struct sock *sk = __llc_lookup_established(sap, daddr, laddr, net); return sk ? : llc_lookup_listener(sap, laddr, net); } /** * llc_data_accept_state - designates if in this state data can be sent. * @state: state of connection. * * Returns 0 if data can be sent, 1 otherwise. */ u8 llc_data_accept_state(u8 state) { return state != LLC_CONN_STATE_NORMAL && state != LLC_CONN_STATE_BUSY && state != LLC_CONN_STATE_REJ; } /** * llc_find_next_offset - finds offset for next category of transitions * @state: state table. * @offset: start offset. * * Finds offset of next category of transitions in transition table. * Returns the start index of next category. */ static u16 __init llc_find_next_offset(struct llc_conn_state *state, u16 offset) { const struct llc_conn_state_trans **next_trans; u16 cnt = 0; for (next_trans = state->transitions + offset; (*next_trans)->ev; next_trans++) ++cnt; return cnt; } /** * llc_build_offset_table - builds offset table of connection * * Fills offset table of connection state transition table * (llc_offset_table). */ void __init llc_build_offset_table(void) { struct llc_conn_state *curr_state; int state, ev_type, next_offset; for (state = 0; state < NBR_CONN_STATES; state++) { curr_state = &llc_conn_state_table[state]; next_offset = 0; for (ev_type = 0; ev_type < NBR_CONN_EV; ev_type++) { llc_offset_table[state][ev_type] = next_offset; next_offset += llc_find_next_offset(curr_state, next_offset) + 1; } } } /** * llc_find_offset - finds start offset of category of transitions * @state: state of connection * @ev_type: type of happened event * * Finds start offset of desired category of transitions. Returns the * desired start offset. */ static int llc_find_offset(int state, int ev_type) { int rc = 0; /* at this stage, llc_offset_table[..][2] is not important. it is for * init_pf_cycle and I don't know what is it. */ switch (ev_type) { case LLC_CONN_EV_TYPE_PRIM: rc = llc_offset_table[state][0]; break; case LLC_CONN_EV_TYPE_PDU: rc = llc_offset_table[state][4]; break; case LLC_CONN_EV_TYPE_SIMPLE: rc = llc_offset_table[state][1]; break; case LLC_CONN_EV_TYPE_P_TMR: case LLC_CONN_EV_TYPE_ACK_TMR: case LLC_CONN_EV_TYPE_REJ_TMR: case LLC_CONN_EV_TYPE_BUSY_TMR: rc = llc_offset_table[state][3]; break; } return rc; } /** * llc_sap_add_socket - adds a socket to a SAP * @sap: SAP * @sk: socket * * This function adds a socket to the hash tables of a SAP. */ void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk) { struct llc_sock *llc = llc_sk(sk); struct hlist_head *dev_hb = llc_sk_dev_hash(sap, llc->dev->ifindex); struct hlist_nulls_head *laddr_hb = llc_sk_laddr_hash(sap, &llc->laddr); llc_sap_hold(sap); llc_sk(sk)->sap = sap; spin_lock_bh(&sap->sk_lock); sock_set_flag(sk, SOCK_RCU_FREE); sap->sk_count++; sk_nulls_add_node_rcu(sk, laddr_hb); hlist_add_head(&llc->dev_hash_node, dev_hb); spin_unlock_bh(&sap->sk_lock); } /** * llc_sap_remove_socket - removes a socket from SAP * @sap: SAP * @sk: socket * * This function removes a connection from the hash tables of a SAP if * the connection was in this list. */ void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk) { struct llc_sock *llc = llc_sk(sk); spin_lock_bh(&sap->sk_lock); sk_nulls_del_node_init_rcu(sk); hlist_del(&llc->dev_hash_node); sap->sk_count--; spin_unlock_bh(&sap->sk_lock); llc_sap_put(sap); } /** * llc_conn_rcv - sends received pdus to the connection state machine * @sk: current connection structure. * @skb: received frame. * * Sends received pdus to the connection state machine. */ static int llc_conn_rcv(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PDU; ev->reason = 0; return llc_conn_state_process(sk, skb); } static struct sock *llc_create_incoming_sock(struct sock *sk, struct net_device *dev, struct llc_addr *saddr, struct llc_addr *daddr) { struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC, sk->sk_prot, 0); struct llc_sock *newllc, *llc = llc_sk(sk); if (!newsk) goto out; newllc = llc_sk(newsk); memcpy(&newllc->laddr, daddr, sizeof(newllc->laddr)); memcpy(&newllc->daddr, saddr, sizeof(newllc->daddr)); newllc->dev = dev; dev_hold(dev); llc_sap_add_socket(llc->sap, newsk); llc_sap_hold(llc->sap); out: return newsk; } void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) { struct llc_addr saddr, daddr; struct sock *sk; llc_pdu_decode_sa(skb, saddr.mac); llc_pdu_decode_ssap(skb, &saddr.lsap); llc_pdu_decode_da(skb, daddr.mac); llc_pdu_decode_dsap(skb, &daddr.lsap); sk = __llc_lookup(sap, &saddr, &daddr, dev_net(skb->dev)); if (!sk) goto drop; bh_lock_sock(sk); /* * This has to be done here and not at the upper layer ->accept * method because of the way the PROCOM state machine works: * it needs to set several state variables (see, for instance, * llc_adm_actions_2 in net/llc/llc_c_st.c) and send a packet to * the originator of the new connection, and this state has to be * in the newly created struct sock private area. -acme */ if (unlikely(sk->sk_state == TCP_LISTEN)) { struct sock *newsk = llc_create_incoming_sock(sk, skb->dev, &saddr, &daddr); if (!newsk) goto drop_unlock; skb_set_owner_r(skb, newsk); } else { /* * Can't be skb_set_owner_r, this will be done at the * llc_conn_state_process function, later on, when we will use * skb_queue_rcv_skb to send it to upper layers, this is * another trick required to cope with how the PROCOM state * machine works. -acme */ skb_orphan(skb); sock_hold(sk); skb->sk = sk; skb->destructor = sock_efree; } if (!sock_owned_by_user(sk)) llc_conn_rcv(sk, skb); else { dprintk("%s: adding to backlog...\n", __func__); llc_set_backlog_type(skb, LLC_PACKET); if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) goto drop_unlock; } out: bh_unlock_sock(sk); sock_put(sk); return; drop: kfree_skb(skb); return; drop_unlock: kfree_skb(skb); goto out; } #undef LLC_REFCNT_DEBUG #ifdef LLC_REFCNT_DEBUG static atomic_t llc_sock_nr; #endif /** * llc_backlog_rcv - Processes rx frames and expired timers. * @sk: LLC sock (p8022 connection) * @skb: queued rx frame or event * * This function processes frames that has received and timers that has * expired during sending an I pdu (refer to data_req_handler). frames * queue by llc_rcv function (llc_mac.c) and timers queue by timer * callback functions(llc_c_ac.c). */ static int llc_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int rc = 0; struct llc_sock *llc = llc_sk(sk); if (likely(llc_backlog_type(skb) == LLC_PACKET)) { if (likely(llc->state > 1)) /* not closed */ rc = llc_conn_rcv(sk, skb); else goto out_kfree_skb; } else if (llc_backlog_type(skb) == LLC_EVENT) { /* timer expiration event */ if (likely(llc->state > 1)) /* not closed */ rc = llc_conn_state_process(sk, skb); else goto out_kfree_skb; } else { printk(KERN_ERR "%s: invalid skb in backlog\n", __func__); goto out_kfree_skb; } out: return rc; out_kfree_skb: kfree_skb(skb); goto out; } /** * llc_sk_init - Initializes a socket with default llc values. * @sk: socket to initialize. * * Initializes a socket with default llc values. */ static void llc_sk_init(struct sock *sk) { struct llc_sock *llc = llc_sk(sk); llc->state = LLC_CONN_STATE_ADM; llc->inc_cntr = llc->dec_cntr = 2; llc->dec_step = llc->connect_step = 1; timer_setup(&llc->ack_timer.timer, llc_conn_ack_tmr_cb, 0); llc->ack_timer.expire = sysctl_llc2_ack_timeout; timer_setup(&llc->pf_cycle_timer.timer, llc_conn_pf_cycle_tmr_cb, 0); llc->pf_cycle_timer.expire = sysctl_llc2_p_timeout; timer_setup(&llc->rej_sent_timer.timer, llc_conn_rej_tmr_cb, 0); llc->rej_sent_timer.expire = sysctl_llc2_rej_timeout; timer_setup(&llc->busy_state_timer.timer, llc_conn_busy_tmr_cb, 0); llc->busy_state_timer.expire = sysctl_llc2_busy_timeout; llc->n2 = 2; /* max retransmit */ llc->k = 2; /* tx win size, will adjust dynam */ llc->rw = 128; /* rx win size (opt and equal to * tx_win of remote LLC) */ skb_queue_head_init(&llc->pdu_unack_q); sk->sk_backlog_rcv = llc_backlog_rcv; } /** * llc_sk_alloc - Allocates LLC sock * @net: network namespace * @family: upper layer protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance * @kern: is this to be a kernel socket? * * Allocates a LLC sock and initializes it. Returns the new LLC sock * or %NULL if there's no memory available for one */ struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { struct sock *sk = sk_alloc(net, family, priority, prot, kern); if (!sk) goto out; llc_sk_init(sk); sock_init_data(NULL, sk); #ifdef LLC_REFCNT_DEBUG atomic_inc(&llc_sock_nr); printk(KERN_DEBUG "LLC socket %p created in %s, now we have %d alive\n", sk, __func__, atomic_read(&llc_sock_nr)); #endif out: return sk; } void llc_sk_stop_all_timers(struct sock *sk, bool sync) { struct llc_sock *llc = llc_sk(sk); if (sync) { del_timer_sync(&llc->pf_cycle_timer.timer); del_timer_sync(&llc->ack_timer.timer); del_timer_sync(&llc->rej_sent_timer.timer); del_timer_sync(&llc->busy_state_timer.timer); } else { del_timer(&llc->pf_cycle_timer.timer); del_timer(&llc->ack_timer.timer); del_timer(&llc->rej_sent_timer.timer); del_timer(&llc->busy_state_timer.timer); } llc->ack_must_be_send = 0; llc->ack_pf = 0; } /** * llc_sk_free - Frees a LLC socket * @sk: - socket to free * * Frees a LLC socket */ void llc_sk_free(struct sock *sk) { struct llc_sock *llc = llc_sk(sk); llc->state = LLC_CONN_OUT_OF_SVC; /* Stop all (possibly) running timers */ llc_sk_stop_all_timers(sk, true); #ifdef DEBUG_LLC_CONN_ALLOC printk(KERN_INFO "%s: unackq=%d, txq=%d\n", __func__, skb_queue_len(&llc->pdu_unack_q), skb_queue_len(&sk->sk_write_queue)); #endif skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); skb_queue_purge(&llc->pdu_unack_q); #ifdef LLC_REFCNT_DEBUG if (refcount_read(&sk->sk_refcnt) != 1) { printk(KERN_DEBUG "Destruction of LLC sock %p delayed in %s, cnt=%d\n", sk, __func__, refcount_read(&sk->sk_refcnt)); printk(KERN_DEBUG "%d LLC sockets are still alive\n", atomic_read(&llc_sock_nr)); } else { atomic_dec(&llc_sock_nr); printk(KERN_DEBUG "LLC socket %p released in %s, %d are still alive\n", sk, __func__, atomic_read(&llc_sock_nr)); } #endif sock_put(sk); } /** * llc_sk_reset - resets a connection * @sk: LLC socket to reset * * Resets a connection to the out of service state. Stops its timers * and frees any frames in the queues of the connection. */ void llc_sk_reset(struct sock *sk) { struct llc_sock *llc = llc_sk(sk); llc_conn_ac_stop_all_timers(sk, NULL); skb_queue_purge(&sk->sk_write_queue); skb_queue_purge(&llc->pdu_unack_q); llc->remote_busy_flag = 0; llc->cause_flag = 0; llc->retry_count = 0; llc_conn_set_p_flag(sk, 0); llc->f_flag = 0; llc->s_flag = 0; llc->ack_pf = 0; llc->first_pdu_Ns = 0; llc->ack_must_be_send = 0; llc->dec_step = 1; llc->inc_cntr = 2; llc->dec_cntr = 2; llc->X = 0; llc->failed_data_req = 0 ; llc->last_nr = 0; } |
27 688 704 514 34 673 1595 18 2 1 8 63 1 90 82 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CGROUP_H #define _LINUX_CGROUP_H /* * cgroup interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/nodemask.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> #include <linux/jump_label.h> #include <linux/types.h> #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> struct kernel_clone_args; /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ #define CGROUP_WEIGHT_MIN 1 #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 #ifdef CONFIG_CGROUPS enum { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; struct list_head *tcset_pos; struct list_head *tcset_head; struct list_head *task_pos; struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; extern spinlock_t css_set_lock; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) \ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; #include <linux/cgroup_subsys.h> #undef SUBSYS /** * cgroup_subsys_enabled - fast test on whether a subsys is enabled * @ss: subsystem in question */ #define cgroup_subsys_enabled(ss) \ static_branch_likely(&ss ## _enabled_key) /** * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy * @ss: subsystem in question */ #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); /* * Iteration helpers and macros. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * * Walk @parent's children. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_child(pos, parent) \ for ((pos) = css_next_child(NULL, (parent)); (pos); \ (pos) = css_next_child((pos), (parent))) /** * css_for_each_descendant_pre - pre-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the * first node to be visited. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) * { * Lock @css's parent and @css; * Inherit state from the parent; * Unlock both. * } * * my_update_state(@css) * { * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; * if (@pos == @css) * Update @css's state; * else * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } * * As long as the inheriting step, including checking the parent state, is * enclosed inside @pos locking, double-locking the parent isn't necessary * while inheriting. The state update to the parent is guaranteed to be * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. * * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_descendant_pre(pos, css) \ for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last * node to be visited. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * Note that the walk visibility guarantee example described in pre-order * walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple * processes. * * On the v2 hierarchy, there may be tasks from multiple processes and they * may not share the source or destination csses. * * On traditional hierarchies, when there are multiple tasks in @tset, if a * task of a process is in @tset, all tasks of the process are in @tset. * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, dst_css, tset) \ for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ (task); \ (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ #define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ (leader); \ (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else /* * Inline functions. */ #ifdef CONFIG_DEBUG_CGROUP_REF void css_get(struct cgroup_subsys_state *css); void css_get_many(struct cgroup_subsys_state *css, unsigned int n); bool css_tryget(struct cgroup_subsys_state *css); bool css_tryget_online(struct cgroup_subsys_state *css); void css_put(struct cgroup_subsys_state *css); void css_put_many(struct cgroup_subsys_state *css, unsigned int n); #else #define CGROUP_REF_FN_ATTRS static inline #define CGROUP_REF_EXPORT(fn) #include <linux/cgroup_refcnt.h> #endif static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } /** * css_is_dying - test whether the specified css is dying * @css: target css * * Test whether @css is in the process of offlining or already offline. In * most cases, ->css_online() and ->css_offline() callbacks should be * enough; however, the actual offline operations are RCU delayed and this * test returns %true also when @css is scheduled to be offlined. * * This is useful, for example, when the use case requires synchronous * behavior with respect to cgroup removal. cgroup removal schedules css * offlining but the css can seem alive while the operation is being * delayed. If the delay affects user visible semantics, this test can be * used to resolve the situation. */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); } static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } static inline bool cgroup_tryget(struct cgroup *cgrp) { return css_tryget(&cgrp->self); } static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } extern struct mutex cgroup_mutex; static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } static inline void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() * * A task's css_set is RCU protected, initialized and exited while holding * task_lock(), and can only be modified while holding both cgroup_mutex * and task_lock() while the task is alive. This macro verifies that the * caller is inside proper critical section and returns @task's css_set. * * The caller can also specify additional allowed conditions via @__c, such * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) #endif /** * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() * * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ #define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** * task_css_set - obtain a task's css_set * @task: the task to obtain css_set for * * See task_css_set_check(). */ static inline struct css_set *task_css_set(struct task_struct *task) { return task_css_set_check(task, false); } /** * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * See task_css_check(). */ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, false); } /** * task_get_css - find and get the css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) { struct cgroup_subsys_state *css; rcu_read_lock(); while (true) { css = task_css(task, subsys_id); /* * Can't use css_tryget_online() here. A task which has * PF_EXITING set may stay associated with an offline css. * If such task calls this function, css_tryget_online() * will keep failing. */ if (likely(css_tryget(css))) break; cpu_relax(); } rcu_read_unlock(); return css; } /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID * * Test whether @task belongs to the root css on the specified subsystem. * May be invoked in any context. */ static inline bool task_css_is_root(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, true) == init_css_set.subsys[subsys_id]; } static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { return task_css(task, subsys_id)->cgroup; } static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) { return task_css_set(task)->dfl_cgrp; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { struct cgroup_subsys_state *parent_css = cgrp->self.parent; if (parent_css) return container_of(parent_css, struct cgroup, self); return NULL; } /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested * @ancestor: possible ancestor of @cgrp * * Test whether @cgrp is a descendant of @ancestor. It also returns %true * if @cgrp == @ancestor. This function is safe to call as long as @cgrp * and @ancestor are accessible. */ static inline bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; return cgrp->ancestors[ancestor->level] == ancestor; } /** * cgroup_ancestor - find ancestor of cgroup * @cgrp: cgroup to find ancestor of * @ancestor_level: level of ancestor to find starting from root * * Find ancestor of cgroup at specified level starting from root if it exists * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at * @ancestor_level. * * This function is safe to call as long as @cgrp is accessible. */ static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; return cgrp->ancestors[ancestor_level]; } /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { struct css_set *cset = task_css_set(task); return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ static inline struct cftype *of_cft(struct kernfs_open_file *of) { return of->kn->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); /* cft/css accessors for cftype->seq_*() operations */ static inline struct cftype *seq_cft(struct seq_file *seq) { return of_cft(seq->private); } static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { return of_css(seq->private); } /* * Name / path handling functions. All are thin wrappers around the kernfs * counterparts and can be called under any context. */ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_name(cgrp->kn, buf, buflen); } static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { pr_cont_kernfs_path(cgrp->kn); } bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) { /* * kthreadd is inherited by all kthreads, keep it in the root so * that the new kthreads are guaranteed to stay in the root until * initialization is finished. */ current->no_cgroup_migration = 1; } static inline void cgroup_kthread_ready(void) { /* * This kthread finished initialization. The creator should have * set PF_NO_SETAFFINITY if this kthread should stay in the root. */ current->no_cgroup_migration = 0; } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline void cgroup_lock(void) {} static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; } static inline bool cgroup_psi_enabled(void) { return false; } static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { return true; } static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS /* * cgroup scalable recursive statistics. */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(struct cgroup *cgrp); /* * Basic resource stats. */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { struct cgroup *cgrp; cpuacct_charge(task, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup *cgrp; cpuacct_account_field(task, index, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); } #else /* CONFIG_CGROUPS */ static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) {} static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) {} #endif /* CONFIG_CGROUPS */ /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; struct css_set *root_cset; }; extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); #else /* !CONFIG_CGROUPS */ static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; } #endif /* !CONFIG_CGROUPS */ static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) free_cgroup_ns(ns); } #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); void cgroup_leave_frozen(bool always_leave); void cgroup_update_frozen(struct cgroup *cgrp); void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; } #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } static inline bool cgroup_task_frozen(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUP_BPF static inline void cgroup_bpf_get(struct cgroup *cgrp) { percpu_ref_get(&cgrp->bpf.refcnt); } static inline void cgroup_bpf_put(struct cgroup *cgrp) { percpu_ref_put(&cgrp->bpf.refcnt); } #else /* CONFIG_CGROUP_BPF */ static inline void cgroup_bpf_get(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); struct cgroup_of_peak *of_peak(struct kernfs_open_file *of); #endif /* _LINUX_CGROUP_H */ |
1 2 1 1 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2012 Xyratex Technology Limited */ /* * This is crypto api shash wrappers to crc32_le. */ #include <linux/unaligned.h> #include <linux/crc32.h> #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/kernel.h> #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 /** No default init with ~0 */ static int crc32_cra_init(struct crypto_tfm *tfm) { u32 *key = crypto_tfm_ctx(tfm); *key = 0; return 0; } /* * Setting the seed allows arbitrary accumulators and flexible XOR policy * If your algorithm starts with ~0, then XOR with ~0 before you set * the seed. */ static int crc32_setkey(struct crypto_shash *hash, const u8 *key, unsigned int keylen) { u32 *mctx = crypto_shash_ctx(hash); if (keylen != sizeof(u32)) return -EINVAL; *mctx = get_unaligned_le32(key); return 0; } static int crc32_init(struct shash_desc *desc) { u32 *mctx = crypto_shash_ctx(desc->tfm); u32 *crcp = shash_desc_ctx(desc); *crcp = *mctx; return 0; } static int crc32_update(struct shash_desc *desc, const u8 *data, unsigned int len) { u32 *crcp = shash_desc_ctx(desc); *crcp = crc32_le_base(*crcp, data, len); return 0; } static int crc32_update_arch(struct shash_desc *desc, const u8 *data, unsigned int len) { u32 *crcp = shash_desc_ctx(desc); *crcp = crc32_le(*crcp, data, len); return 0; } /* No final XOR 0xFFFFFFFF, like crc32_le */ static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { put_unaligned_le32(crc32_le_base(*crcp, data, len), out); return 0; } static int __crc32_finup_arch(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { put_unaligned_le32(crc32_le(*crcp, data, len), out); return 0; } static int crc32_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return __crc32_finup(shash_desc_ctx(desc), data, len, out); } static int crc32_finup_arch(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return __crc32_finup_arch(shash_desc_ctx(desc), data, len, out); } static int crc32_final(struct shash_desc *desc, u8 *out) { u32 *crcp = shash_desc_ctx(desc); put_unaligned_le32(*crcp, out); return 0; } static int crc32_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, out); } static int crc32_digest_arch(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return __crc32_finup_arch(crypto_shash_ctx(desc->tfm), data, len, out); } static struct shash_alg algs[] = {{ .setkey = crc32_setkey, .init = crc32_init, .update = crc32_update, .final = crc32_final, .finup = crc32_finup, .digest = crc32_digest, .descsize = sizeof(u32), .digestsize = CHKSUM_DIGEST_SIZE, .base.cra_name = "crc32", .base.cra_driver_name = "crc32-generic", .base.cra_priority = 100, .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .base.cra_blocksize = CHKSUM_BLOCK_SIZE, .base.cra_ctxsize = sizeof(u32), .base.cra_module = THIS_MODULE, .base.cra_init = crc32_cra_init, }, { .setkey = crc32_setkey, .init = crc32_init, .update = crc32_update_arch, .final = crc32_final, .finup = crc32_finup_arch, .digest = crc32_digest_arch, .descsize = sizeof(u32), .digestsize = CHKSUM_DIGEST_SIZE, .base.cra_name = "crc32", .base.cra_driver_name = "crc32-" __stringify(ARCH), .base.cra_priority = 150, .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .base.cra_blocksize = CHKSUM_BLOCK_SIZE, .base.cra_ctxsize = sizeof(u32), .base.cra_module = THIS_MODULE, .base.cra_init = crc32_cra_init, }}; static int __init crc32_mod_init(void) { /* register the arch flavor only if it differs from the generic one */ return crypto_register_shashes(algs, 1 + (&crc32_le != &crc32_le_base)); } static void __exit crc32_mod_fini(void) { crypto_unregister_shashes(algs, 1 + (&crc32_le != &crc32_le_base)); } subsys_initcall(crc32_mod_init); module_exit(crc32_mod_fini); MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); MODULE_DESCRIPTION("CRC32 calculations wrapper for lib/crc32"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("crc32"); MODULE_ALIAS_CRYPTO("crc32-generic"); |
5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_TC_WRAPPER_H #define __NET_TC_WRAPPER_H #include <net/pkt_cls.h> #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) #include <linux/cpufeature.h> #include <linux/static_key.h> #include <linux/indirect_call_wrapper.h> #define TC_INDIRECT_SCOPE extern struct static_key_false tc_skip_wrapper; /* TC Actions */ #ifdef CONFIG_NET_CLS_ACT #define TC_INDIRECT_ACTION_DECLARE(fname) \ INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ const struct tc_action *a, \ struct tcf_result *res)) TC_INDIRECT_ACTION_DECLARE(tcf_bpf_act); TC_INDIRECT_ACTION_DECLARE(tcf_connmark_act); TC_INDIRECT_ACTION_DECLARE(tcf_csum_act); TC_INDIRECT_ACTION_DECLARE(tcf_ct_act); TC_INDIRECT_ACTION_DECLARE(tcf_ctinfo_act); TC_INDIRECT_ACTION_DECLARE(tcf_gact_act); TC_INDIRECT_ACTION_DECLARE(tcf_gate_act); TC_INDIRECT_ACTION_DECLARE(tcf_ife_act); TC_INDIRECT_ACTION_DECLARE(tcf_ipt_act); TC_INDIRECT_ACTION_DECLARE(tcf_mirred_act); TC_INDIRECT_ACTION_DECLARE(tcf_mpls_act); TC_INDIRECT_ACTION_DECLARE(tcf_nat_act); TC_INDIRECT_ACTION_DECLARE(tcf_pedit_act); TC_INDIRECT_ACTION_DECLARE(tcf_police_act); TC_INDIRECT_ACTION_DECLARE(tcf_sample_act); TC_INDIRECT_ACTION_DECLARE(tcf_simp_act); TC_INDIRECT_ACTION_DECLARE(tcf_skbedit_act); TC_INDIRECT_ACTION_DECLARE(tcf_skbmod_act); TC_INDIRECT_ACTION_DECLARE(tcf_vlan_act); TC_INDIRECT_ACTION_DECLARE(tunnel_key_act); static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { if (static_branch_likely(&tc_skip_wrapper)) goto skip; #if IS_BUILTIN(CONFIG_NET_ACT_GACT) if (a->ops->act == tcf_gact_act) return tcf_gact_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_MIRRED) if (a->ops->act == tcf_mirred_act) return tcf_mirred_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_PEDIT) if (a->ops->act == tcf_pedit_act) return tcf_pedit_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT) if (a->ops->act == tcf_skbedit_act) return tcf_skbedit_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_SKBMOD) if (a->ops->act == tcf_skbmod_act) return tcf_skbmod_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_POLICE) if (a->ops->act == tcf_police_act) return tcf_police_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_BPF) if (a->ops->act == tcf_bpf_act) return tcf_bpf_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_CONNMARK) if (a->ops->act == tcf_connmark_act) return tcf_connmark_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_CSUM) if (a->ops->act == tcf_csum_act) return tcf_csum_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_CT) if (a->ops->act == tcf_ct_act) return tcf_ct_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_CTINFO) if (a->ops->act == tcf_ctinfo_act) return tcf_ctinfo_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_GATE) if (a->ops->act == tcf_gate_act) return tcf_gate_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_MPLS) if (a->ops->act == tcf_mpls_act) return tcf_mpls_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_NAT) if (a->ops->act == tcf_nat_act) return tcf_nat_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY) if (a->ops->act == tunnel_key_act) return tunnel_key_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_VLAN) if (a->ops->act == tcf_vlan_act) return tcf_vlan_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_IFE) if (a->ops->act == tcf_ife_act) return tcf_ife_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_SIMP) if (a->ops->act == tcf_simp_act) return tcf_simp_act(skb, a, res); #endif #if IS_BUILTIN(CONFIG_NET_ACT_SAMPLE) if (a->ops->act == tcf_sample_act) return tcf_sample_act(skb, a, res); #endif skip: return a->ops->act(skb, a, res); } #endif /* CONFIG_NET_CLS_ACT */ /* TC Filters */ #ifdef CONFIG_NET_CLS #define TC_INDIRECT_FILTER_DECLARE(fname) \ INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ const struct tcf_proto *tp, \ struct tcf_result *res)) TC_INDIRECT_FILTER_DECLARE(basic_classify); TC_INDIRECT_FILTER_DECLARE(cls_bpf_classify); TC_INDIRECT_FILTER_DECLARE(cls_cgroup_classify); TC_INDIRECT_FILTER_DECLARE(fl_classify); TC_INDIRECT_FILTER_DECLARE(flow_classify); TC_INDIRECT_FILTER_DECLARE(fw_classify); TC_INDIRECT_FILTER_DECLARE(mall_classify); TC_INDIRECT_FILTER_DECLARE(route4_classify); TC_INDIRECT_FILTER_DECLARE(u32_classify); static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { if (static_branch_likely(&tc_skip_wrapper)) goto skip; #if IS_BUILTIN(CONFIG_NET_CLS_BPF) if (tp->classify == cls_bpf_classify) return cls_bpf_classify(skb, tp, res); #endif #if IS_BUILTIN(CONFIG_NET_CLS_U32) if (tp->classify == u32_classify) return u32_classify(skb, tp, res); #endif #if IS_BUILTIN(CONFIG_NET_CLS_FLOWER) if (tp->classify == fl_classify) return fl_classify(skb, tp, res); #endif #if IS_BUILTIN(CONFIG_NET_CLS_FW) if (tp->classify == fw_classify) return fw_classify(skb, tp, res); #endif #if IS_BUILTIN(CONFIG_NET_CLS_MATCHALL) if (tp->classify == mall_classify) return mall_classify(skb, tp, res); #endif #if IS_BUILTIN(CONFIG_NET_CLS_BASIC) if (tp->classify == basic_classify) return basic_classify(skb, tp, res); #endif #if IS_BUILTIN(CONFIG_NET_CLS_CGROUP) if (tp->classify == cls_cgroup_classify) return cls_cgroup_classify(skb, tp, res); #endif #if IS_BUILTIN(CONFIG_NET_CLS_FLOW) if (tp->classify == flow_classify) return flow_classify(skb, tp, res); #endif #if IS_BUILTIN(CONFIG_NET_CLS_ROUTE4) if (tp->classify == route4_classify) return route4_classify(skb, tp, res); #endif skip: return tp->classify(skb, tp, res); } #endif /* CONFIG_NET_CLS */ static inline void tc_wrapper_init(void) { #ifdef CONFIG_X86 if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) static_branch_enable(&tc_skip_wrapper); #endif } #else #define TC_INDIRECT_SCOPE static static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { return a->ops->act(skb, a, res); } static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { return tp->classify(skb, tp, res); } static inline void tc_wrapper_init(void) { } #endif #endif /* __NET_TC_WRAPPER_H */ |
3 3 3 2 2 2 1 1 3 1 1 1 8 4 4 4 3 3 1 4 3 12 12 13 4 9 2 1 8 10 12 1 3 7 2 9 1 2 1 3 6 9 4 2 2 2 1 3 5 5 5 1 1 11 1 1 1 1 1 6 1 5 1 4 1 1 1 1 1 5 3 2 2 2 2 1 2 5 1 2 2 7 7 6 4 2 5 5 1 5 1 5 3 3 12 8 5 4 7 9 1 9 1 2 2 2 2 3 1 1 1 1 21 1 19 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 13 1 1 1 1 1 2 1 1 1 1 1 1 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 | /* * af_llc.c - LLC User Interface SAPs * Description: * Functions in this module are implementation of socket based llc * communications for the Linux operating system. Support of llc class * one and class two is provided via SOCK_DGRAM and SOCK_STREAM * respectively. * * An llc2 connection is (mac + sap), only one llc2 sap connection * is allowed per mac. Though one sap may have multiple mac + sap * connections. * * Copyright (c) 2001 by Jay Schulist <jschlst@samba.org> * 2002-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <net/llc.h> #include <net/llc_sap.h> #include <net/llc_pdu.h> #include <net/llc_conn.h> #include <net/tcp_states.h> /* remember: uninitialized global data is zeroed because its in .bss */ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; static bool llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) do {} while (0) #endif /* Maybe we'll add some more in the future. */ #define LLC_CMSG_PKTINFO 1 /** * llc_ui_next_link_no - return the next unused link number for a sap * @sap: Address of sap to get link number from. * * Return the next unused link number for a given sap. */ static inline u16 llc_ui_next_link_no(int sap) { return llc_ui_sap_link_no_max[sap]++; } /** * llc_proto_type - return eth protocol for ARP header type * @arphrd: ARP header type. * * Given an ARP header type return the corresponding ethernet protocol. */ static inline __be16 llc_proto_type(u16 arphrd) { return htons(ETH_P_802_2); } /** * llc_ui_addr_null - determines if a address structure is null * @addr: Address to test if null. */ static inline u8 llc_ui_addr_null(struct sockaddr_llc *addr) { return !memcmp(addr, &llc_ui_addrnull, sizeof(*addr)); } /** * llc_ui_header_len - return length of llc header based on operation * @sk: Socket which contains a valid llc socket type. * @addr: Complete sockaddr_llc structure received from the user. * * Provide the length of the llc header depending on what kind of * operation the user would like to perform and the type of socket. * Returns the correct llc header length. */ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) { u8 rc = LLC_PDU_LEN_U; if (addr->sllc_test) rc = LLC_PDU_LEN_U; else if (addr->sllc_xid) /* We need to expand header to sizeof(struct llc_xid_info) * since llc_pdu_init_as_xid_cmd() sets 4,5,6 bytes of LLC header * as XID PDU. In llc_ui_sendmsg() we reserved header size and then * filled all other space with user data. If we won't reserve this * bytes, llc_pdu_init_as_xid_cmd() will overwrite user data */ rc = LLC_PDU_LEN_U_XID; else if (sk->sk_type == SOCK_STREAM) rc = LLC_PDU_LEN_I; return rc; } /** * llc_ui_send_data - send data via reliable llc2 connection * @sk: Connection the socket is using. * @skb: Data the user wishes to send. * @noblock: can we block waiting for data? * * Send data via reliable llc2 connection. * Returns 0 upon success, non-zero if action did not succeed. * * This function always consumes a reference to the skb. */ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) { struct llc_sock* llc = llc_sk(sk); if (unlikely(llc_data_accept_state(llc->state) || llc->remote_busy_flag || llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); int rc; rc = llc_ui_wait_for_busy_core(sk, timeout); if (rc) { kfree_skb(skb); return rc; } } return llc_build_and_send_pkt(sk, skb); } static void llc_ui_sk_init(struct socket *sock, struct sock *sk) { sock_graft(sk, sock); sk->sk_type = sock->type; sock->ops = &llc_ui_ops; } static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** * llc_ui_create - alloc and init a new llc_ui socket * @net: network namespace (must be default network) * @sock: Socket to initialize and attach allocated sk to. * @protocol: Unused. * @kern: on behalf of kernel or userspace * * Allocate and initialize a new llc_ui socket, validate the user wants a * socket type we have available. * Returns 0 upon success, negative upon failure. */ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int rc = -ESOCKTNOSUPPORT; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { rc = -ENOMEM; sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); } } return rc; } /** * llc_ui_release - shutdown socket * @sock: Socket to release. * * Shutdown and deallocate an existing socket. */ static int llc_ui_release(struct socket *sock) { struct sock *sk = sock->sk; struct llc_sock *llc; if (unlikely(sk == NULL)) goto out; sock_hold(sk); lock_sock(sk); llc = llc_sk(sk); dprintk("%s: closing local(%02X) remote(%02X)\n", __func__, llc->laddr.lsap, llc->daddr.lsap); if (!llc_send_disc(sk)) llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); if (!sock_flag(sk, SOCK_ZAPPED)) { struct llc_sap *sap = llc->sap; /* Hold this for release_sock(), so that llc_backlog_rcv() * could still use it. */ llc_sap_hold(sap); llc_sap_remove_socket(llc->sap, sk); release_sock(sk); llc_sap_put(sap); } else { release_sock(sk); } netdev_put(llc->dev, &llc->dev_tracker); sock_put(sk); sock_orphan(sk); sock->sk = NULL; llc_sk_free(sk); out: return 0; } /** * llc_ui_autoport - provide dynamically allocate SAP number * * Provide the caller with a dynamically allocated SAP number according * to the rules that are set in this function. Returns: 0, upon failure, * SAP number otherwise. */ static int llc_ui_autoport(void) { struct llc_sap *sap; int i, tries = 0; while (tries < LLC_SAP_DYN_TRIES) { for (i = llc_ui_sap_last_autoport; i < LLC_SAP_DYN_STOP; i += 2) { sap = llc_sap_find(i); if (!sap) { llc_ui_sap_last_autoport = i + 2; goto out; } llc_sap_put(sap); } llc_ui_sap_last_autoport = LLC_SAP_DYN_START; tries++; } i = 0; out: return i; } /** * llc_ui_autobind - automatically bind a socket to a sap * @sock: socket to bind * @addr: address to connect to * * Used by llc_ui_connect and llc_ui_sendmsg when the user hasn't * specifically used llc_ui_bind to bind to an specific address/sap * * Returns: 0 upon success, negative otherwise. */ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out; if (!addr->sllc_arphrd) addr->sllc_arphrd = ARPHRD_ETHER; if (addr->sllc_arphrd != ARPHRD_ETHER) goto out; rc = -ENODEV; if (sk->sk_bound_dev_if) { dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); if (dev && addr->sllc_arphrd != dev->type) { dev_put(dev); dev = NULL; } } else dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd); if (!dev) goto out; rc = -EUSERS; llc->laddr.lsap = llc_ui_autoport(); if (!llc->laddr.lsap) goto out; rc = -EBUSY; /* some other network layer is using the sap */ sap = llc_sap_open(llc->laddr.lsap, NULL); if (!sap) goto out; /* Note: We do not expect errors from this point. */ llc->dev = dev; netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL); dev = NULL; memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ llc_sap_add_socket(sap, sk); sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out: dev_put(dev); return rc; } /** * llc_ui_bind - bind a socket to a specific address. * @sock: Socket to bind an address to. * @uaddr: Address the user wants the socket bound to. * @addrlen: Length of the uaddr structure. * * Bind a socket to a specific address. For llc a user is able to bind to * a specific sap only or mac + sap. * If the user desires to bind to a specific mac + sap, it is possible to * have multiple sap connections via multiple macs. * Bind and autobind for that matter must enforce the correct sap usage * otherwise all hell will break loose. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) { struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; lock_sock(sk); if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (!addr->sllc_arphrd) addr->sllc_arphrd = ARPHRD_ETHER; if (unlikely(addr->sllc_family != AF_LLC || addr->sllc_arphrd != ARPHRD_ETHER)) goto out; dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); rc = -ENODEV; rcu_read_lock(); if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); if (dev) { if (is_zero_ether_addr(addr->sllc_mac)) memcpy(addr->sllc_mac, dev->dev_addr, IFHWADDRLEN); if (addr->sllc_arphrd != dev->type || !ether_addr_equal(addr->sllc_mac, dev->dev_addr)) { rc = -EINVAL; dev = NULL; } } } else { dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, addr->sllc_mac); } dev_hold(dev); rcu_read_unlock(); if (!dev) goto out; if (!addr->sllc_sap) { rc = -EUSERS; addr->sllc_sap = llc_ui_autoport(); if (!addr->sllc_sap) goto out; } sap = llc_sap_find(addr->sllc_sap); if (!sap) { sap = llc_sap_open(addr->sllc_sap, NULL); rc = -EBUSY; /* some other network layer is using the sap */ if (!sap) goto out; } else { struct llc_addr laddr, daddr; struct sock *ask; memset(&laddr, 0, sizeof(laddr)); memset(&daddr, 0, sizeof(daddr)); /* * FIXME: check if the address is multicast, * only SOCK_DGRAM can do this. */ memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN); laddr.lsap = addr->sllc_sap; rc = -EADDRINUSE; /* mac + sap clash. */ ask = llc_lookup_established(sap, &daddr, &laddr, &init_net); if (ask) { sock_put(ask); goto out_put; } } /* Note: We do not expect errors from this point. */ llc->dev = dev; netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL); dev = NULL; llc->laddr.lsap = addr->sllc_sap; memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ llc_sap_add_socket(sap, sk); sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out_put: llc_sap_put(sap); out: dev_put(dev); release_sock(sk); return rc; } /** * llc_ui_shutdown - shutdown a connect llc2 socket. * @sock: Socket to shutdown. * @how: What part of the socket to shutdown. * * Shutdown a connected llc2 socket. Currently this function only supports * shutting down both sends and receives (2), we could probably make this * function such that a user can shutdown only half the connection but not * right now. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int rc = -ENOTCONN; lock_sock(sk); if (unlikely(sk->sk_state != TCP_ESTABLISHED)) goto out; rc = -EINVAL; if (how != 2) goto out; rc = llc_send_disc(sk); if (!rc) rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); /* Wake up anyone sleeping in poll */ sk->sk_state_change(sk); out: release_sock(sk); return rc; } /** * llc_ui_connect - Connect to a remote llc2 mac + sap. * @sock: Socket which will be connected to the remote destination. * @uaddr: Remote and possibly the local address of the new connection. * @addrlen: Size of uaddr structure. * @flags: Operational flags specified by the user. * * Connect to a remote llc2 mac + sap. The caller must specify the * destination mac and address to connect to. If the user hasn't previously * called bind(2) with a smac the address of the first interface of the * specified arp type will be used. * This function will autobind if user did not previously call bind. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; int rc = -EINVAL; lock_sock(sk); if (unlikely(addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (unlikely(addr->sllc_family != AF_LLC)) goto out; if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EALREADY; if (unlikely(sock->state == SS_CONNECTING)) goto out; /* bind connection to sap if user hasn't done it. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* bind to sap with null dev, exclusive */ rc = llc_ui_autobind(sock, addr); if (rc) goto out; } llc->daddr.lsap = addr->sllc_sap; memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN); sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; llc->link = llc_ui_next_link_no(llc->sap->laddr.lsap); rc = llc_establish_connection(sk, llc->dev->dev_addr, addr->sllc_mac, addr->sllc_sap); if (rc) { dprintk("%s: llc_ui_send_conn failed :-(\n", __func__); sock->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; goto out; } if (sk->sk_state == TCP_SYN_SENT) { const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if (!timeo || !llc_ui_wait_for_conn(sk, timeo)) goto out; rc = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } if (sk->sk_state == TCP_CLOSE) goto sock_error; sock->state = SS_CONNECTED; rc = 0; out: release_sock(sk); return rc; sock_error: rc = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; goto out; } /** * llc_ui_listen - allow a normal socket to accept incoming connections * @sock: Socket to allow incoming connections on. * @backlog: Number of connections to queue. * * Allow a normal socket to accept incoming connections. * Returns 0 upon success, negative otherwise. */ static int llc_ui_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EINVAL; lock_sock(sk); if (unlikely(sock->state != SS_UNCONNECTED)) goto out; rc = -EOPNOTSUPP; if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EAGAIN; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = 0; if (!(unsigned int)backlog) /* BSDism */ backlog = 1; sk->sk_max_ack_backlog = backlog; if (sk->sk_state != TCP_LISTEN) { sk->sk_ack_backlog = 0; sk->sk_state = TCP_LISTEN; } sk->sk_socket->flags |= __SO_ACCEPTCON; out: release_sock(sk); return rc; } static int llc_ui_wait_for_disc(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc = 0; add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) == TCP_CLOSE, &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) != TCP_SYN_SENT, &wait)) break; if (signal_pending(current) || !timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return timeout; } static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct llc_sock *llc = llc_sk(sk); int rc; add_wait_queue(sk_sleep(sk), &wait); while (1) { rc = 0; if (sk_wait_event(sk, &timeout, (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && !llc->remote_busy_flag && !llc->p_flag), &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int llc_wait_data(struct sock *sk, long timeo) { int rc; while (1) { /* * POSIX 1003.1g mandates this order. */ rc = sock_error(sk); if (rc) break; rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -EAGAIN; if (!timeo) break; rc = sock_intr_errno(timeo); if (signal_pending(current)) break; rc = 0; if (sk_wait_data(sk, &timeo, NULL)) break; } return rc; } static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(skb->sk); if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { struct llc_pktinfo info; memset(&info, 0, sizeof(info)); info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; llc_pdu_decode_dsap(skb, &info.lpi_sap); llc_pdu_decode_da(skb, info.lpi_mac); put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info); } } /** * llc_ui_accept - accept a new incoming connection. * @sock: Socket which connections arrive on. * @newsock: Socket to move incoming connection to. * @arg: User specified arguments * * Accept a new incoming connection. * Returns 0 upon success, negative otherwise. */ static int llc_ui_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *newsk; struct llc_sock *llc, *newllc; struct sk_buff *skb; int rc = -EOPNOTSUPP; dprintk("%s: accepting on %02X\n", __func__, llc_sk(sk)->laddr.lsap); lock_sock(sk); if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EINVAL; if (unlikely(sock->state != SS_UNCONNECTED || sk->sk_state != TCP_LISTEN)) goto out; /* wait for a connection to arrive. */ if (skb_queue_empty(&sk->sk_receive_queue)) { rc = llc_wait_data(sk, sk->sk_rcvtimeo); if (rc) goto out; } dprintk("%s: got a new connection on %02X\n", __func__, llc_sk(sk)->laddr.lsap); skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto frees; rc = 0; newsk = skb->sk; /* attach connection to a new socket. */ llc_ui_sk_init(newsock, newsk); sock_reset_flag(newsk, SOCK_ZAPPED); newsk->sk_state = TCP_ESTABLISHED; newsock->state = SS_CONNECTED; llc = llc_sk(sk); newllc = llc_sk(newsk); memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr)); newllc->link = llc_ui_next_link_no(newllc->laddr.lsap); /* put original socket back into a clean listen state. */ sk->sk_state = TCP_LISTEN; sk_acceptq_removed(sk); dprintk("%s: ok success on %02X, client on %02X\n", __func__, llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: kfree_skb(skb); out: release_sock(sk); return rc; } /** * llc_ui_recvmsg - copy received data to the socket user. * @sock: Socket to copy data from. * @msg: Various user space related information. * @len: Size of user buffer. * @flags: User specified flags. * * Copy received data to the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { DECLARE_SOCKADDR(struct sockaddr_llc *, uaddr, msg->msg_name); const int nonblock = flags & MSG_DONTWAIT; struct sk_buff *skb = NULL; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); size_t copied = 0; u32 peek_seq = 0; u32 *seq, skb_len; unsigned long used; int target; /* Read at least this many bytes */ long timeo; lock_sock(sk); copied = -ENOTCONN; if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) goto out; timeo = sock_rcvtimeo(sk, nonblock); seq = &llc->copied_seq; if (flags & MSG_PEEK) { peek_seq = llc->copied_seq; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); copied = 0; do { u32 offset; /* * We need to check signals first, to get correct SIGURG * handling. FIXME: Need to check this doesn't impact 1003.1g * and move it down to the bottom of the loop */ if (signal_pending(current)) { if (copied) break; copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } /* Next get a buffer. */ skb = skb_peek(&sk->sk_receive_queue); if (skb) { offset = *seq; goto found_ok_skb; } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || (flags & MSG_PEEK)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { if (!sock_flag(sk, SOCK_DONE)) { /* * This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } break; } if (!timeo) { copied = -EAGAIN; break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); } else sk_wait_data(sk, &timeo, NULL); if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) { net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = llc->copied_seq; } continue; found_ok_skb: skb_len = skb->len; /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; if (!(flags & MSG_TRUNC)) { int rc = skb_copy_datagram_msg(skb, offset, msg, used); if (rc) { /* Exception. Bailout! */ if (!copied) copied = -EFAULT; break; } } *seq += used; copied += used; len -= used; /* For non stream protcols we get one packet per recvmsg call */ if (sk->sk_type != SOCK_STREAM) goto copy_uaddr; if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } /* Partial read */ if (used + offset < skb_len) continue; } while (len > 0); out: release_sock(sk); return copied; copy_uaddr: if (uaddr != NULL && skb != NULL) { memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); } if (llc_sk(sk)->cmsg_flags) llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } goto out; } /** * llc_ui_sendmsg - Transmit data provided by the socket user. * @sock: Socket to transmit data from. * @msg: Various user related information. * @len: Length of data to transmit. * * Transmit data provided by the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; int rc = -EINVAL, copied = 0, hdrlen, hh_len; struct sk_buff *skb = NULL; struct net_device *dev; size_t size = 0; dprintk("%s: sending from %02X to %02X\n", __func__, llc->laddr.lsap, llc->daddr.lsap); lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) goto out; } else { if (llc_ui_addr_null(&llc->addr)) goto out; addr = &llc->addr; } /* must bind connection to sap if user hasn't done it. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* bind to sap with null dev, exclusive. */ rc = llc_ui_autobind(sock, addr); if (rc) goto out; } dev = llc->dev; hh_len = LL_RESERVED_SPACE(dev); hdrlen = llc_ui_header_len(sk, addr); size = hdrlen + len; size = min_t(size_t, size, READ_ONCE(dev->mtu)); copied = size - hdrlen; rc = -EINVAL; if (copied < 0) goto out; release_sock(sk); skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc); lock_sock(sk); if (!skb) goto out; if (sock_flag(sk, SOCK_ZAPPED) || llc->dev != dev || hdrlen != llc_ui_header_len(sk, addr) || hh_len != LL_RESERVED_SPACE(dev) || size > READ_ONCE(dev->mtu)) goto out; skb->dev = dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hh_len + hdrlen); rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); if (rc) goto out; if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_test) { llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_xid) { llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } rc = -ENOPROTOOPT; if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) goto out; rc = llc_ui_send_data(sk, skb, noblock); skb = NULL; out: kfree_skb(skb); if (rc) dprintk("%s: failed sending from %02X to %02X: %d\n", __func__, llc->laddr.lsap, llc->daddr.lsap, rc); release_sock(sk); return rc ? : copied; } /** * llc_ui_getname - return the address info of a socket * @sock: Socket to get address of. * @uaddr: Address structure to return information. * @peer: Does user want local or remote address information. * * Return the address information of a socket. */ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_llc sllc; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int rc = -EBADF; memset(&sllc, 0, sizeof(sllc)); lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; if(llc->dev) sllc.sllc_arphrd = llc->dev->type; sllc.sllc_sap = llc->daddr.lsap; memcpy(&sllc.sllc_mac, &llc->daddr.mac, IFHWADDRLEN); } else { rc = -EINVAL; if (!llc->sap) goto out; sllc.sllc_sap = llc->sap->laddr.lsap; if (llc->dev) { sllc.sllc_arphrd = llc->dev->type; memcpy(&sllc.sllc_mac, llc->dev->dev_addr, IFHWADDRLEN); } } sllc.sllc_family = AF_LLC; memcpy(uaddr, &sllc, sizeof(sllc)); rc = sizeof(sllc); out: release_sock(sk); return rc; } /** * llc_ui_ioctl - io controls for PF_LLC * @sock: Socket to get/set info * @cmd: command * @arg: optional argument for cmd * * get/set info on llc sockets */ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -ENOIOCTLCMD; } /** * llc_ui_setsockopt - set various connection specific parameters. * @sock: Socket to set options on. * @level: Socket level user is requesting operations on. * @optname: Operation name. * @optval: User provided operation data. * @optlen: Length of optval. * * Set various connection specific parameters. */ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); unsigned int opt; int rc = -EINVAL; lock_sock(sk); if (unlikely(level != SOL_LLC || optlen != sizeof(int))) goto out; rc = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (rc) goto out; rc = -EINVAL; switch (optname) { case LLC_OPT_RETRY: if (opt > LLC_OPT_MAX_RETRY) goto out; llc->n2 = opt; break; case LLC_OPT_SIZE: if (opt > LLC_OPT_MAX_SIZE) goto out; llc->n1 = opt; break; case LLC_OPT_ACK_TMR_EXP: if (opt > LLC_OPT_MAX_ACK_TMR_EXP) goto out; llc->ack_timer.expire = opt * HZ; break; case LLC_OPT_P_TMR_EXP: if (opt > LLC_OPT_MAX_P_TMR_EXP) goto out; llc->pf_cycle_timer.expire = opt * HZ; break; case LLC_OPT_REJ_TMR_EXP: if (opt > LLC_OPT_MAX_REJ_TMR_EXP) goto out; llc->rej_sent_timer.expire = opt * HZ; break; case LLC_OPT_BUSY_TMR_EXP: if (opt > LLC_OPT_MAX_BUSY_TMR_EXP) goto out; llc->busy_state_timer.expire = opt * HZ; break; case LLC_OPT_TX_WIN: if (opt > LLC_OPT_MAX_WIN) goto out; llc->k = opt; break; case LLC_OPT_RX_WIN: if (opt > LLC_OPT_MAX_WIN) goto out; llc->rw = opt; break; case LLC_OPT_PKTINFO: if (opt) llc->cmsg_flags |= LLC_CMSG_PKTINFO; else llc->cmsg_flags &= ~LLC_CMSG_PKTINFO; break; default: rc = -ENOPROTOOPT; goto out; } rc = 0; out: release_sock(sk); return rc; } /** * llc_ui_getsockopt - get connection specific socket info * @sock: Socket to get information from. * @level: Socket level user is requesting operations on. * @optname: Operation name. * @optval: Variable to return operation data in. * @optlen: Length of optval. * * Get connection specific socket information. */ static int llc_ui_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int val = 0, len = 0, rc = -EINVAL; lock_sock(sk); if (unlikely(level != SOL_LLC)) goto out; rc = get_user(len, optlen); if (rc) goto out; rc = -EINVAL; if (len != sizeof(int)) goto out; switch (optname) { case LLC_OPT_RETRY: val = llc->n2; break; case LLC_OPT_SIZE: val = llc->n1; break; case LLC_OPT_ACK_TMR_EXP: val = llc->ack_timer.expire / HZ; break; case LLC_OPT_P_TMR_EXP: val = llc->pf_cycle_timer.expire / HZ; break; case LLC_OPT_REJ_TMR_EXP: val = llc->rej_sent_timer.expire / HZ; break; case LLC_OPT_BUSY_TMR_EXP: val = llc->busy_state_timer.expire / HZ; break; case LLC_OPT_TX_WIN: val = llc->k; break; case LLC_OPT_RX_WIN: val = llc->rw; break; case LLC_OPT_PKTINFO: val = (llc->cmsg_flags & LLC_CMSG_PKTINFO) != 0; break; default: rc = -ENOPROTOOPT; goto out; } rc = 0; if (put_user(len, optlen) || copy_to_user(optval, &val, len)) rc = -EFAULT; out: release_sock(sk); return rc; } static const struct net_proto_family llc_ui_family_ops = { .family = PF_LLC, .create = llc_ui_create, .owner = THIS_MODULE, }; static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, .bind = llc_ui_bind, .connect = llc_ui_connect, .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, .poll = datagram_poll, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, .setsockopt = llc_ui_setsockopt, .getsockopt = llc_ui_getsockopt, .sendmsg = llc_ui_sendmsg, .recvmsg = llc_ui_recvmsg, .mmap = sock_no_mmap, }; static const char llc_proc_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the proc_fs entries\n"; static const char llc_sysctl_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the sysctl entries\n"; static const char llc_sock_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the network family\n"; static int __init llc2_init(void) { int rc = proto_register(&llc_proto, 0); if (rc != 0) goto out; llc_build_offset_table(); llc_station_init(); llc_ui_sap_last_autoport = LLC_SAP_DYN_START; rc = llc_proc_init(); if (rc != 0) { printk(llc_proc_err_msg); goto out_station; } rc = llc_sysctl_init(); if (rc) { printk(llc_sysctl_err_msg); goto out_proc; } rc = sock_register(&llc_ui_family_ops); if (rc) { printk(llc_sock_err_msg); goto out_sysctl; } llc_add_pack(LLC_DEST_SAP, llc_sap_handler); llc_add_pack(LLC_DEST_CONN, llc_conn_handler); out: return rc; out_sysctl: llc_sysctl_exit(); out_proc: llc_proc_exit(); out_station: llc_station_exit(); proto_unregister(&llc_proto); goto out; } static void __exit llc2_exit(void) { llc_station_exit(); llc_remove_pack(LLC_DEST_SAP); llc_remove_pack(LLC_DEST_CONN); sock_unregister(PF_LLC); llc_proc_exit(); llc_sysctl_exit(); proto_unregister(&llc_proto); } module_init(llc2_init); module_exit(llc2_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003"); MODULE_DESCRIPTION("IEEE 802.2 PF_LLC support"); MODULE_ALIAS_NETPROTO(PF_LLC); |
862 62 38 189 29 29 488 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 | // SPDX-License-Identifier: GPL-2.0 /* * device.h - generic, centralized driver model * * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2009 Novell Inc. * * See Documentation/driver-api/driver-model/ for more information. */ #ifndef _DEVICE_H_ #define _DEVICE_H_ #include <linux/dev_printk.h> #include <linux/energy_model.h> #include <linux/ioport.h> #include <linux/kobject.h> #include <linux/klist.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/compiler.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/gfp.h> #include <linux/overflow.h> #include <linux/device/bus.h> #include <linux/device/class.h> #include <linux/device/driver.h> #include <linux/cleanup.h> #include <asm/device.h> struct device; struct device_private; struct device_driver; struct driver_private; struct module; struct class; struct subsys_private; struct device_node; struct fwnode_handle; struct iommu_group; struct dev_pin_info; struct dev_iommu; struct msi_device_data; /** * struct subsys_interface - interfaces to device functions * @name: name of the device function * @subsys: subsystem of the devices to attach to * @node: the list of functions registered at the subsystem * @add_dev: device hookup to device function handler * @remove_dev: device hookup to device function handler * * Simple interfaces attached to a subsystem. Multiple interfaces can * attach to a subsystem and its devices. Unlike drivers, they do not * exclusively claim or control devices. Interfaces usually represent * a specific functionality of a subsystem/class of devices. */ struct subsys_interface { const char *name; const struct bus_type *subsys; struct list_head node; int (*add_dev)(struct device *dev, struct subsys_interface *sif); void (*remove_dev)(struct device *dev, struct subsys_interface *sif); }; int subsys_interface_register(struct subsys_interface *sif); void subsys_interface_unregister(struct subsys_interface *sif); int subsys_system_register(const struct bus_type *subsys, const struct attribute_group **groups); int subsys_virtual_register(const struct bus_type *subsys, const struct attribute_group **groups); /* * The type of device, "struct device" is embedded in. A class * or bus can contain devices of different types * like "partitions" and "disks", "mouse" and "event". * This identifies the device type and carries type-specific * information, equivalent to the kobj_type of a kobject. * If "name" is specified, the uevent will contain it in * the DEVTYPE variable. */ struct device_type { const char *name; const struct attribute_group **groups; int (*uevent)(const struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid); void (*release)(struct device *dev); const struct dev_pm_ops *pm; }; /** * struct device_attribute - Interface for exporting device attributes. * @attr: sysfs attribute definition. * @show: Show handler. * @store: Store handler. */ struct device_attribute { struct attribute attr; ssize_t (*show)(struct device *dev, struct device_attribute *attr, char *buf); ssize_t (*store)(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); }; /** * struct dev_ext_attribute - Exported device attribute with extra context. * @attr: Exported device attribute. * @var: Pointer to context. */ struct dev_ext_attribute { struct device_attribute attr; void *var; }; ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_int(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_int(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_bool(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_bool(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_string(struct device *dev, struct device_attribute *attr, char *buf); /** * DEVICE_ATTR - Define a device attribute. * @_name: Attribute name. * @_mode: File mode. * @_show: Show handler. Optional, but mandatory if attribute is readable. * @_store: Store handler. Optional, but mandatory if attribute is writable. * * Convenience macro for defining a struct device_attribute. * * For example, ``DEVICE_ATTR(foo, 0644, foo_show, foo_store);`` expands to: * * .. code-block:: c * * struct device_attribute dev_attr_foo = { * .attr = { .name = "foo", .mode = 0644 }, * .show = foo_show, * .store = foo_store, * }; */ #define DEVICE_ATTR(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) /** * DEVICE_ATTR_PREALLOC - Define a preallocated device attribute. * @_name: Attribute name. * @_mode: File mode. * @_show: Show handler. Optional, but mandatory if attribute is readable. * @_store: Store handler. Optional, but mandatory if attribute is writable. * * Like DEVICE_ATTR(), but ``SYSFS_PREALLOC`` is set on @_mode. */ #define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_PREALLOC(_name, _mode, _show, _store) /** * DEVICE_ATTR_RW - Define a read-write device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0644, @_show is <_name>_show, * and @_store is <_name>_store. */ #define DEVICE_ATTR_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW(_name) /** * DEVICE_ATTR_ADMIN_RW - Define an admin-only read-write device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR_RW(), but @_mode is 0600. */ #define DEVICE_ATTR_ADMIN_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600) /** * DEVICE_ATTR_RO - Define a readable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0444 and @_show is <_name>_show. */ #define DEVICE_ATTR_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO(_name) /** * DEVICE_ATTR_ADMIN_RO - Define an admin-only readable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR_RO(), but @_mode is 0400. */ #define DEVICE_ATTR_ADMIN_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400) /** * DEVICE_ATTR_WO - Define an admin-only writable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0200 and @_store is <_name>_store. */ #define DEVICE_ATTR_WO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_WO(_name) /** * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of unsigned long. * * Like DEVICE_ATTR(), but @_show and @_store are automatically provided * such that reads and writes to the attribute from userspace affect @_var. */ #define DEVICE_ULONG_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) } /** * DEVICE_INT_ATTR - Define a device attribute backed by an int. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of int. * * Like DEVICE_ULONG_ATTR(), but @_var is an int. */ #define DEVICE_INT_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) } /** * DEVICE_BOOL_ATTR - Define a device attribute backed by a bool. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of bool. * * Like DEVICE_ULONG_ATTR(), but @_var is a bool. */ #define DEVICE_BOOL_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) } /** * DEVICE_STRING_ATTR_RO - Define a device attribute backed by a r/o string. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of string. * * Like DEVICE_ULONG_ATTR(), but @_var is a string. Because the length of the * string allocation is unknown, the attribute must be read-only. */ #define DEVICE_STRING_ATTR_RO(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) } #define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) int device_create_file(struct device *device, const struct device_attribute *entry); void device_remove_file(struct device *dev, const struct device_attribute *attr); bool device_remove_file_self(struct device *dev, const struct device_attribute *attr); int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); /* device resource management */ typedef void (*dr_release_t)(struct device *dev, void *res); typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data); void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid, const char *name) __malloc; #define devres_alloc(release, size, gfp) \ __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release) #define devres_alloc_node(release, size, gfp, nid) \ __devres_alloc_node(release, size, gfp, nid, #release) void devres_for_each_res(struct device *dev, dr_release_t release, dr_match_t match, void *match_data, void (*fn)(struct device *, void *, void *), void *data); void devres_free(void *res); void devres_add(struct device *dev, void *res); void *devres_find(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); void *devres_get(struct device *dev, void *new_res, dr_match_t match, void *match_data); void *devres_remove(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); int devres_destroy(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); int devres_release(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); /* devres group */ void * __must_check devres_open_group(struct device *dev, void *id, gfp_t gfp); void devres_close_group(struct device *dev, void *id); void devres_remove_group(struct device *dev, void *id); int devres_release_group(struct device *dev, void *id); /* managed devm_k.alloc/kfree for device drivers */ void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __alloc_size(2); void *devm_krealloc(struct device *dev, void *ptr, size_t size, gfp_t gfp) __must_check __realloc_size(3); __printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) __malloc; __printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) __malloc; static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp) { return devm_kmalloc(dev, size, gfp | __GFP_ZERO); } static inline void *devm_kmalloc_array(struct device *dev, size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return devm_kmalloc(dev, bytes, flags); } static inline void *devm_kcalloc(struct device *dev, size_t n, size_t size, gfp_t flags) { return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO); } static inline __realloc_size(3, 4) void * __must_check devm_krealloc_array(struct device *dev, void *p, size_t new_n, size_t new_size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) return NULL; return devm_krealloc(dev, p, bytes, flags); } void devm_kfree(struct device *dev, const void *p); char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc; const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp); void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) __realloc_size(3); unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order); void devm_free_pages(struct device *dev, unsigned long addr); #ifdef CONFIG_HAS_IOMEM void __iomem *devm_ioremap_resource(struct device *dev, const struct resource *res); void __iomem *devm_ioremap_resource_wc(struct device *dev, const struct resource *res); void __iomem *devm_of_iomap(struct device *dev, struct device_node *node, int index, resource_size_t *size); #else static inline void __iomem *devm_ioremap_resource(struct device *dev, const struct resource *res) { return ERR_PTR(-EINVAL); } static inline void __iomem *devm_ioremap_resource_wc(struct device *dev, const struct resource *res) { return ERR_PTR(-EINVAL); } static inline void __iomem *devm_of_iomap(struct device *dev, struct device_node *node, int index, resource_size_t *size) { return ERR_PTR(-EINVAL); } #endif /* allows to add/remove a custom action to devres stack */ void devm_remove_action(struct device *dev, void (*action)(void *), void *data); void devm_release_action(struct device *dev, void (*action)(void *), void *data); int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name); #define devm_add_action(dev, action, data) \ __devm_add_action(dev, action, data, #action) static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(void *), void *data, const char *name) { int ret; ret = __devm_add_action(dev, action, data, name); if (ret) action(data); return ret; } #define devm_add_action_or_reset(dev, action, data) \ __devm_add_action_or_reset(dev, action, data, #action) /** * devm_alloc_percpu - Resource-managed alloc_percpu * @dev: Device to allocate per-cpu memory for * @type: Type to allocate per-cpu memory for * * Managed alloc_percpu. Per-cpu memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ #define devm_alloc_percpu(dev, type) \ ((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \ __alignof__(type))) void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align); void devm_free_percpu(struct device *dev, void __percpu *pdata); struct device_dma_parameters { /* * a low level driver may set these to teach IOMMU code about * sg limitations. */ unsigned int max_segment_size; unsigned int min_align_mask; unsigned long segment_boundary_mask; }; /** * enum device_link_state - Device link states. * @DL_STATE_NONE: The presence of the drivers is not being tracked. * @DL_STATE_DORMANT: None of the supplier/consumer drivers is present. * @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not. * @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present). * @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present. * @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding. */ enum device_link_state { DL_STATE_NONE = -1, DL_STATE_DORMANT = 0, DL_STATE_AVAILABLE, DL_STATE_CONSUMER_PROBE, DL_STATE_ACTIVE, DL_STATE_SUPPLIER_UNBIND, }; /* * Device link flags. * * STATELESS: The core will not remove this link automatically. * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind. * PM_RUNTIME: If set, the runtime PM framework will use this link. * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation. * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind. * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds. * MANAGED: The core tracks presence of supplier/consumer drivers (internal). * SYNC_STATE_ONLY: Link only affects sync_state() behavior. * INFERRED: Inferred from data (eg: firmware) and not from driver actions. */ #define DL_FLAG_STATELESS BIT(0) #define DL_FLAG_AUTOREMOVE_CONSUMER BIT(1) #define DL_FLAG_PM_RUNTIME BIT(2) #define DL_FLAG_RPM_ACTIVE BIT(3) #define DL_FLAG_AUTOREMOVE_SUPPLIER BIT(4) #define DL_FLAG_AUTOPROBE_CONSUMER BIT(5) #define DL_FLAG_MANAGED BIT(6) #define DL_FLAG_SYNC_STATE_ONLY BIT(7) #define DL_FLAG_INFERRED BIT(8) #define DL_FLAG_CYCLE BIT(9) /** * enum dl_dev_state - Device driver presence tracking information. * @DL_DEV_NO_DRIVER: There is no driver attached to the device. * @DL_DEV_PROBING: A driver is probing. * @DL_DEV_DRIVER_BOUND: The driver has been bound to the device. * @DL_DEV_UNBINDING: The driver is unbinding from the device. */ enum dl_dev_state { DL_DEV_NO_DRIVER = 0, DL_DEV_PROBING, DL_DEV_DRIVER_BOUND, DL_DEV_UNBINDING, }; /** * enum device_removable - Whether the device is removable. The criteria for a * device to be classified as removable is determined by its subsystem or bus. * @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this * device (default). * @DEVICE_REMOVABLE_UNKNOWN: Device location is Unknown. * @DEVICE_FIXED: Device is not removable by the user. * @DEVICE_REMOVABLE: Device is removable by the user. */ enum device_removable { DEVICE_REMOVABLE_NOT_SUPPORTED = 0, /* must be 0 */ DEVICE_REMOVABLE_UNKNOWN, DEVICE_FIXED, DEVICE_REMOVABLE, }; /** * struct dev_links_info - Device data related to device links. * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. * @defer_sync: Hook to global list of devices that have deferred sync_state. * @status: Driver status information. */ struct dev_links_info { struct list_head suppliers; struct list_head consumers; struct list_head defer_sync; enum dl_dev_state status; }; /** * struct dev_msi_info - Device data related to MSI * @domain: The MSI interrupt domain associated to the device * @data: Pointer to MSI device data */ struct dev_msi_info { #ifdef CONFIG_GENERIC_MSI_IRQ struct irq_domain *domain; struct msi_device_data *data; #endif }; /** * enum device_physical_location_panel - Describes which panel surface of the * system's housing the device connection point resides on. * @DEVICE_PANEL_TOP: Device connection point is on the top panel. * @DEVICE_PANEL_BOTTOM: Device connection point is on the bottom panel. * @DEVICE_PANEL_LEFT: Device connection point is on the left panel. * @DEVICE_PANEL_RIGHT: Device connection point is on the right panel. * @DEVICE_PANEL_FRONT: Device connection point is on the front panel. * @DEVICE_PANEL_BACK: Device connection point is on the back panel. * @DEVICE_PANEL_UNKNOWN: The panel with device connection point is unknown. */ enum device_physical_location_panel { DEVICE_PANEL_TOP, DEVICE_PANEL_BOTTOM, DEVICE_PANEL_LEFT, DEVICE_PANEL_RIGHT, DEVICE_PANEL_FRONT, DEVICE_PANEL_BACK, DEVICE_PANEL_UNKNOWN, }; /** * enum device_physical_location_vertical_position - Describes vertical * position of the device connection point on the panel surface. * @DEVICE_VERT_POS_UPPER: Device connection point is at upper part of panel. * @DEVICE_VERT_POS_CENTER: Device connection point is at center part of panel. * @DEVICE_VERT_POS_LOWER: Device connection point is at lower part of panel. */ enum device_physical_location_vertical_position { DEVICE_VERT_POS_UPPER, DEVICE_VERT_POS_CENTER, DEVICE_VERT_POS_LOWER, }; /** * enum device_physical_location_horizontal_position - Describes horizontal * position of the device connection point on the panel surface. * @DEVICE_HORI_POS_LEFT: Device connection point is at left part of panel. * @DEVICE_HORI_POS_CENTER: Device connection point is at center part of panel. * @DEVICE_HORI_POS_RIGHT: Device connection point is at right part of panel. */ enum device_physical_location_horizontal_position { DEVICE_HORI_POS_LEFT, DEVICE_HORI_POS_CENTER, DEVICE_HORI_POS_RIGHT, }; /** * struct device_physical_location - Device data related to physical location * of the device connection point. * @panel: Panel surface of the system's housing that the device connection * point resides on. * @vertical_position: Vertical position of the device connection point within * the panel. * @horizontal_position: Horizontal position of the device connection point * within the panel. * @dock: Set if the device connection point resides in a docking station or * port replicator. * @lid: Set if this device connection point resides on the lid of laptop * system. */ struct device_physical_location { enum device_physical_location_panel panel; enum device_physical_location_vertical_position vertical_position; enum device_physical_location_horizontal_position horizontal_position; bool dock; bool lid; }; /** * struct device - The basic device structure * @parent: The device's "parent" device, the device to which it is attached. * In most cases, a parent device is some sort of bus or host * controller. If parent is NULL, the device, is a top-level device, * which is not usually what you want. * @p: Holds the private data of the driver core portions of the device. * See the comment of the struct device_private for detail. * @kobj: A top-level, abstract class from which other classes are derived. * @init_name: Initial name of the device. * @type: The type of device. * This identifies the device type and carries type-specific * information. * @mutex: Mutex to synchronize calls to its driver. * @bus: Type of bus device is on. * @driver: Which driver has allocated this * @platform_data: Platform data specific to the device. * Example: For devices on custom boards, as typical of embedded * and SOC based hardware, Linux often uses platform_data to point * to board-specific structures describing devices and how they * are wired. That can include what ports are available, chip * variants, which GPIO pins act in what additional roles, and so * on. This shrinks the "Board Support Packages" (BSPs) and * minimizes board-specific #ifdefs in drivers. * @driver_data: Private pointer for driver specific info. * @links: Links to suppliers and consumers of this device. * @power: For device power management. * See Documentation/driver-api/pm/devices.rst for details. * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. * @msi: MSI related data * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. * @dma_mask: Dma mask (if dma'ble device). * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations * such descriptors. * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller * DMA limit than the device itself supports. * @dma_range_map: map for DMA memory ranges relative to that of RAM * @dma_parms: A low level driver may set these to teach IOMMU code about * segment limitations. * @dma_pools: Dma pools (if dma'ble device). * @dma_mem: Internal for coherent mem override. * @cma_area: Contiguous memory area for dma allocations * @dma_io_tlb_mem: Software IO TLB allocator. Not for driver use. * @dma_io_tlb_pools: List of transient swiotlb memory pools. * @dma_io_tlb_lock: Protects changes to the list of active pools. * @dma_uses_io_tlb: %true if device has used the software IO TLB. * @archdata: For arch-specific additions. * @of_node: Associated device tree node. * @fwnode: Associated device node supplied by platform firmware. * @devt: For creating the sysfs "dev". * @id: device instance * @devres_lock: Spinlock to protect the resource of the device. * @devres_head: The resources list of the device. * @class: The class of the device. * @groups: Optional attribute groups. * @release: Callback to free the device after all references have * gone away. This should be set by the allocator of the * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. * @iommu: Per device generic IOMMU runtime data * @physical_location: Describes physical location of the device connection * point in the system housing. * @removable: Whether the device can be removed from the system. This * should be set by the subsystem / bus driver that discovered * the device. * * @offline_disabled: If set, the device is permanently online. * @offline: Set after successful invocation of bus type's .offline(). * @of_node_reused: Set if the device-tree node is shared with an ancestor * device. * @state_synced: The hardware state of this device has been synced to match * the software state of this device by calling the driver/bus * sync_state() callback. * @can_match: The device has matched with a driver at least once or it is in * a bus (like AMBA) which can't check for matching drivers until * other devices probe successfully. * @dma_coherent: this particular device is dma coherent, even if the * architecture supports non-coherent devices. * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), * and optionall (if the coherent mask is large enough) also * for dma allocations. This flag is managed by the dma ops * instance from ->dma_supported. * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers. * @dma_iommu: Device is using default IOMMU implementation for DMA and * doesn't rely on dma_ops structure. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information * that the device model core needs to model the system. Most subsystems, * however, track additional information about the devices they host. As a * result, it is rare for devices to be represented by bare device structures; * instead, that structure, like kobject structures, is usually embedded within * a higher-level representation of the device. */ struct device { struct kobject kobj; struct device *parent; struct device_private *p; const char *init_name; /* initial name of the device */ const struct device_type *type; const struct bus_type *bus; /* type of bus device is on */ struct device_driver *driver; /* which driver has allocated this device */ void *platform_data; /* Platform specific data, device core doesn't touch it */ void *driver_data; /* Driver data, set and get with dev_set_drvdata/dev_get_drvdata */ struct mutex mutex; /* mutex to synchronize calls to * its driver. */ struct dev_links_info links; struct dev_pm_info power; struct dev_pm_domain *pm_domain; #ifdef CONFIG_ENERGY_MODEL struct em_perf_domain *em_pd; #endif #ifdef CONFIG_PINCTRL struct dev_pin_info *pins; #endif struct dev_msi_info msi; #ifdef CONFIG_ARCH_HAS_DMA_OPS const struct dma_map_ops *dma_ops; #endif u64 *dma_mask; /* dma mask (if dma'able device) */ u64 coherent_dma_mask;/* Like dma_mask, but for alloc_coherent mappings as not all hardware supports 64 bit addresses for consistent allocations such descriptors. */ u64 bus_dma_limit; /* upstream dma constraint */ const struct bus_dma_region *dma_range_map; struct device_dma_parameters *dma_parms; struct list_head dma_pools; /* dma pools (if dma'ble) */ #ifdef CONFIG_DMA_DECLARE_COHERENT struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */ #endif #ifdef CONFIG_DMA_CMA struct cma *cma_area; /* contiguous memory area for dma allocations */ #endif #ifdef CONFIG_SWIOTLB struct io_tlb_mem *dma_io_tlb_mem; #endif #ifdef CONFIG_SWIOTLB_DYNAMIC struct list_head dma_io_tlb_pools; spinlock_t dma_io_tlb_lock; bool dma_uses_io_tlb; #endif /* arch specific additions */ struct dev_archdata archdata; struct device_node *of_node; /* associated device tree node */ struct fwnode_handle *fwnode; /* firmware device node */ #ifdef CONFIG_NUMA int numa_node; /* NUMA node this device is close to */ #endif dev_t devt; /* dev_t, creates the sysfs "dev" */ u32 id; /* device instance */ spinlock_t devres_lock; struct list_head devres_head; const struct class *class; const struct attribute_group **groups; /* optional groups */ void (*release)(struct device *dev); struct iommu_group *iommu_group; struct dev_iommu *iommu; struct device_physical_location *physical_location; enum device_removable removable; bool offline_disabled:1; bool offline:1; bool of_node_reused:1; bool state_synced:1; bool can_match:1; #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_coherent:1; #endif #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif #ifdef CONFIG_DMA_NEED_SYNC bool dma_skip_sync:1; #endif #ifdef CONFIG_IOMMU_DMA bool dma_iommu:1; #endif }; /** * struct device_link - Device link representation. * @supplier: The device on the supplier end of the link. * @s_node: Hook to the supplier device's list of links to consumers. * @consumer: The device on the consumer end of the link. * @c_node: Hook to the consumer device's list of links to suppliers. * @link_dev: device used to expose link details in sysfs * @status: The state of the link (with respect to the presence of drivers). * @flags: Link flags. * @rpm_active: Whether or not the consumer device is runtime-PM-active. * @kref: Count repeated addition of the same link. * @rm_work: Work structure used for removing the link. * @supplier_preactivated: Supplier has been made active before consumer probe. */ struct device_link { struct device *supplier; struct list_head s_node; struct device *consumer; struct list_head c_node; struct device link_dev; enum device_link_state status; u32 flags; refcount_t rpm_active; struct kref kref; struct work_struct rm_work; bool supplier_preactivated; /* Owned by consumer probe. */ }; #define kobj_to_dev(__kobj) container_of_const(__kobj, struct device, kobj) /** * device_iommu_mapped - Returns true when the device DMA is translated * by an IOMMU * @dev: Device to perform the check on */ static inline bool device_iommu_mapped(struct device *dev) { return (dev->iommu_group != NULL); } /* Get the wakeup routines, which depend on struct device */ #include <linux/pm_wakeup.h> /** * dev_name - Return a device's name. * @dev: Device with name to get. * Return: The kobject name of the device, or its initial name if unavailable. */ static inline const char *dev_name(const struct device *dev) { /* Use the init name until the kobject becomes available */ if (dev->init_name) return dev->init_name; return kobject_name(&dev->kobj); } /** * dev_bus_name - Return a device's bus/class name, if at all possible * @dev: struct device to get the bus/class name of * * Will return the name of the bus/class the device is attached to. If it is * not attached to a bus/class, an empty string will be returned. */ static inline const char *dev_bus_name(const struct device *dev) { return dev->bus ? dev->bus->name : (dev->class ? dev->class->name : ""); } __printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...); #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) { return dev->numa_node; } static inline void set_dev_node(struct device *dev, int node) { dev->numa_node = node; } #else static inline int dev_to_node(struct device *dev) { return NUMA_NO_NODE; } static inline void set_dev_node(struct device *dev, int node) { } #endif static inline struct irq_domain *dev_get_msi_domain(const struct device *dev) { #ifdef CONFIG_GENERIC_MSI_IRQ return dev->msi.domain; #else return NULL; #endif } static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d) { #ifdef CONFIG_GENERIC_MSI_IRQ dev->msi.domain = d; #endif } static inline void *dev_get_drvdata(const struct device *dev) { return dev->driver_data; } static inline void dev_set_drvdata(struct device *dev, void *data) { dev->driver_data = data; } static inline struct pm_subsys_data *dev_to_psd(struct device *dev) { return dev ? dev->power.subsys_data : NULL; } static inline unsigned int dev_get_uevent_suppress(const struct device *dev) { return dev->kobj.uevent_suppress; } static inline void dev_set_uevent_suppress(struct device *dev, int val) { dev->kobj.uevent_suppress = val; } static inline int device_is_registered(struct device *dev) { return dev->kobj.state_in_sysfs; } static inline void device_enable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = true; } static inline void device_disable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = false; } static inline bool device_async_suspend_enabled(struct device *dev) { return !!dev->power.async_suspend; } static inline bool device_pm_not_required(struct device *dev) { return dev->power.no_pm; } static inline void device_set_pm_not_required(struct device *dev) { dev->power.no_pm = true; } static inline void dev_pm_syscore_device(struct device *dev, bool val) { #ifdef CONFIG_PM_SLEEP dev->power.syscore = val; #endif } static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags) { dev->power.driver_flags = flags; } static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags) { return !!(dev->power.driver_flags & flags); } static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); } static inline int device_lock_interruptible(struct device *dev) { return mutex_lock_interruptible(&dev->mutex); } static inline int device_trylock(struct device *dev) { return mutex_trylock(&dev->mutex); } static inline void device_unlock(struct device *dev) { mutex_unlock(&dev->mutex); } DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T)) static inline void device_lock_assert(struct device *dev) { lockdep_assert_held(&dev->mutex); } static inline bool dev_has_sync_state(struct device *dev) { if (!dev) return false; if (dev->driver && dev->driver->sync_state) return true; if (dev->bus && dev->bus->sync_state) return true; return false; } static inline void dev_set_removable(struct device *dev, enum device_removable removable) { dev->removable = removable; } static inline bool dev_is_removable(struct device *dev) { return dev->removable == DEVICE_REMOVABLE; } static inline bool dev_removable_is_valid(struct device *dev) { return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED; } /* * High level routines for use by the bus drivers */ int __must_check device_register(struct device *dev); void device_unregister(struct device *dev); void device_initialize(struct device *dev); int __must_check device_add(struct device *dev); void device_del(struct device *dev); DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T)) int device_for_each_child(struct device *dev, void *data, int (*fn)(struct device *dev, void *data)); int device_for_each_child_reverse(struct device *dev, void *data, int (*fn)(struct device *dev, void *data)); int device_for_each_child_reverse_from(struct device *parent, struct device *from, const void *data, int (*fn)(struct device *, const void *)); struct device *device_find_child(struct device *dev, void *data, int (*match)(struct device *dev, void *data)); struct device *device_find_child_by_name(struct device *parent, const char *name); struct device *device_find_any_child(struct device *parent); int device_rename(struct device *dev, const char *new_name); int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); static inline bool device_supports_offline(struct device *dev) { return dev->bus && dev->bus->offline && dev->bus->online; } #define __device_lock_set_class(dev, name, key) \ do { \ struct device *__d2 __maybe_unused = dev; \ lock_set_class(&__d2->mutex.dep_map, name, key, 0, _THIS_IP_); \ } while (0) /** * device_lock_set_class - Specify a temporary lock class while a device * is attached to a driver * @dev: device to modify * @key: lock class key data * * This must be called with the device_lock() already held, for example * from driver ->probe(). Take care to only override the default * lockdep_no_validate class. */ #ifdef CONFIG_LOCKDEP #define device_lock_set_class(dev, key) \ do { \ struct device *__d = dev; \ dev_WARN_ONCE(__d, !lockdep_match_class(&__d->mutex, \ &__lockdep_no_validate__), \ "overriding existing custom lock class\n"); \ __device_lock_set_class(__d, #key, key); \ } while (0) #else #define device_lock_set_class(dev, key) __device_lock_set_class(dev, #key, key) #endif /** * device_lock_reset_class - Return a device to the default lockdep novalidate state * @dev: device to modify * * This must be called with the device_lock() already held, for example * from driver ->remove(). */ #define device_lock_reset_class(dev) \ do { \ struct device *__d __maybe_unused = dev; \ lock_set_novalidate_class(&__d->mutex.dep_map, "&dev->mutex", \ _THIS_IP_); \ } while (0) void lock_device_hotplug(void); void unlock_device_hotplug(void); int lock_device_hotplug_sysfs(void); int device_offline(struct device *dev); int device_online(struct device *dev); void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void device_set_node(struct device *dev, struct fwnode_handle *fwnode); void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); static inline struct device_node *dev_of_node(struct device *dev) { if (!IS_ENABLED(CONFIG_OF) || !dev) return NULL; return dev->of_node; } static inline int dev_num_vf(struct device *dev) { if (dev->bus && dev->bus->num_vf) return dev->bus->num_vf(dev); return 0; } /* * Root device objects for grouping under /sys/devices */ struct device *__root_device_register(const char *name, struct module *owner); /* This is a macro to avoid include problems with THIS_MODULE */ #define root_device_register(name) \ __root_device_register(name, THIS_MODULE) void root_device_unregister(struct device *root); static inline void *dev_get_platdata(const struct device *dev) { return dev->platform_data; } /* * Manual binding of a device to driver. See drivers/base/bus.c * for information on use. */ int __must_check device_driver_attach(const struct device_driver *drv, struct device *dev); int __must_check device_bind_driver(struct device *dev); void device_release_driver(struct device *dev); int __must_check device_attach(struct device *dev); int __must_check driver_attach(const struct device_driver *drv); void device_initial_probe(struct device *dev); int __must_check device_reprobe(struct device *dev); bool device_is_bound(struct device *dev); /* * Easy functions for dynamically creating devices on the fly */ __printf(5, 6) struct device * device_create(const struct class *cls, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...); __printf(6, 7) struct device * device_create_with_groups(const struct class *cls, struct device *parent, dev_t devt, void *drvdata, const struct attribute_group **groups, const char *fmt, ...); void device_destroy(const struct class *cls, dev_t devt); int __must_check device_add_groups(struct device *dev, const struct attribute_group **groups); void device_remove_groups(struct device *dev, const struct attribute_group **groups); static inline int __must_check device_add_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; return device_add_groups(dev, groups); } static inline void device_remove_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; return device_remove_groups(dev, groups); } int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); /* * get_device - atomically increment the reference count for the device. * */ struct device *get_device(struct device *dev); void put_device(struct device *dev); DEFINE_FREE(put_device, struct device *, if (_T) put_device(_T)) bool kill_device(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_mount(void); #else static inline int devtmpfs_mount(void) { return 0; } #endif /* drivers/base/power/shutdown.c */ void device_shutdown(void); /* debugging and troubleshooting/diagnostic helpers. */ const char *dev_driver_string(const struct device *dev); /* Device links interface. */ struct device_link *device_link_add(struct device *consumer, struct device *supplier, u32 flags); void device_link_del(struct device_link *link); void device_link_remove(void *consumer, struct device *supplier); void device_links_supplier_sync_state_pause(void); void device_links_supplier_sync_state_resume(void); void device_link_wait_removal(void); /* Create alias, so I can be autoloaded. */ #define MODULE_ALIAS_CHARDEV(major,minor) \ MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_CHARDEV_MAJOR(major) \ MODULE_ALIAS("char-major-" __stringify(major) "-*") #endif /* _DEVICE_H_ */ |
3 4 3 3 3 3 1 1 1 1 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 | // SPDX-License-Identifier: GPL-2.0-only /* * vhost transport for vsock * * Copyright (C) 2013-2015 Red Hat, Inc. * Author: Asias He <asias@redhat.com> * Stefan Hajnoczi <stefanha@redhat.com> */ #include <linux/miscdevice.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <net/sock.h> #include <linux/virtio_vsock.h> #include <linux/vhost.h> #include <linux/hashtable.h> #include <net/af_vsock.h> #include "vhost.h" #define VHOST_VSOCK_DEFAULT_HOST_CID 2 /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_VSOCK_WEIGHT 0x80000 /* Max number of packets transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others with * small pkts. */ #define VHOST_VSOCK_PKT_WEIGHT 256 enum { VHOST_VSOCK_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_VSOCK_F_SEQPACKET) }; enum { VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; /* Used to track all the vhost_vsock instances on the system. */ static DEFINE_MUTEX(vhost_vsock_mutex); static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8); struct vhost_vsock { struct vhost_dev dev; struct vhost_virtqueue vqs[2]; /* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */ struct hlist_node hash; struct vhost_work send_pkt_work; struct sk_buff_head send_pkt_queue; /* host->guest pending packets */ atomic_t queued_replies; u32 guest_cid; bool seqpacket_allow; }; static u32 vhost_transport_get_local_cid(void) { return VHOST_VSOCK_DEFAULT_HOST_CID; } /* Callers that dereference the return value must hold vhost_vsock_mutex or the * RCU read lock. */ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid) { struct vhost_vsock *vsock; hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) { u32 other_cid = vsock->guest_cid; /* Skip instances that have no CID yet */ if (other_cid == 0) continue; if (other_cid == guest_cid) return vsock; } return NULL; } static void vhost_transport_do_send_pkt(struct vhost_vsock *vsock, struct vhost_virtqueue *vq) { struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; int pkts = 0, total_len = 0; bool added = false; bool restart_tx = false; mutex_lock(&vq->mutex); if (!vhost_vq_get_backend(vq)) goto out; if (!vq_meta_prefetch(vq)) goto out; /* Avoid further vmexits, we're already processing the virtqueue */ vhost_disable_notify(&vsock->dev, vq); do { struct virtio_vsock_hdr *hdr; size_t iov_len, payload_len; struct iov_iter iov_iter; u32 flags_to_restore = 0; struct sk_buff *skb; unsigned out, in; size_t nbytes; u32 offset; int head; skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); if (!skb) { vhost_enable_notify(&vsock->dev, vq); break; } head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); if (head < 0) { virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); break; } if (head == vq->num) { virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); /* We cannot finish yet if more buffers snuck in while * re-enabling notify. */ if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { vhost_disable_notify(&vsock->dev, vq); continue; } break; } if (out) { kfree_skb(skb); vq_err(vq, "Expected 0 output buffers, got %u\n", out); break; } iov_len = iov_length(&vq->iov[out], in); if (iov_len < sizeof(*hdr)) { kfree_skb(skb); vq_err(vq, "Buffer len [%zu] too small\n", iov_len); break; } iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len); offset = VIRTIO_VSOCK_SKB_CB(skb)->offset; payload_len = skb->len - offset; hdr = virtio_vsock_hdr(skb); /* If the packet is greater than the space available in the * buffer, we split it using multiple buffers. */ if (payload_len > iov_len - sizeof(*hdr)) { payload_len = iov_len - sizeof(*hdr); /* As we are copying pieces of large packet's buffer to * small rx buffers, headers of packets in rx queue are * created dynamically and are initialized with header * of current packet(except length). But in case of * SOCK_SEQPACKET, we also must clear message delimeter * bit (VIRTIO_VSOCK_SEQ_EOM) and MSG_EOR bit * (VIRTIO_VSOCK_SEQ_EOR) if set. Otherwise, * there will be sequence of packets with these * bits set. After initialized header will be copied to * rx buffer, these required bits will be restored. */ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM; if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) { hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR; } } } /* Set the correct length in the header */ hdr->len = cpu_to_le32(payload_len); nbytes = copy_to_iter(hdr, sizeof(*hdr), &iov_iter); if (nbytes != sizeof(*hdr)) { kfree_skb(skb); vq_err(vq, "Faulted on copying pkt hdr\n"); break; } if (skb_copy_datagram_iter(skb, offset, &iov_iter, payload_len)) { kfree_skb(skb); vq_err(vq, "Faulted on copying pkt buf\n"); break; } /* Deliver to monitoring devices all packets that we * will transmit. */ virtio_transport_deliver_tap_pkt(skb); vhost_add_used(vq, head, sizeof(*hdr) + payload_len); added = true; VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len; total_len += payload_len; /* If we didn't send all the payload we can requeue the packet * to send it with the next available buffer. */ if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) { hdr->flags |= cpu_to_le32(flags_to_restore); /* We are queueing the same skb to handle * the remaining bytes, and we want to deliver it * to monitoring devices in the next iteration. */ virtio_vsock_skb_clear_tap_delivered(skb); virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); } else { if (virtio_vsock_skb_reply(skb)) { int val; val = atomic_dec_return(&vsock->queued_replies); /* Do we have resources to resume tx * processing? */ if (val + 1 == tx_vq->num) restart_tx = true; } virtio_transport_consume_skb_sent(skb, true); } } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); if (added) vhost_signal(&vsock->dev, vq); out: mutex_unlock(&vq->mutex); if (restart_tx) vhost_poll_queue(&tx_vq->poll); } static void vhost_transport_send_pkt_work(struct vhost_work *work) { struct vhost_virtqueue *vq; struct vhost_vsock *vsock; vsock = container_of(work, struct vhost_vsock, send_pkt_work); vq = &vsock->vqs[VSOCK_VQ_RX]; vhost_transport_do_send_pkt(vsock, vq); } static int vhost_transport_send_pkt(struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vhost_vsock *vsock; int len = skb->len; rcu_read_lock(); /* Find the vhost_vsock according to guest context id */ vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid)); if (!vsock) { rcu_read_unlock(); kfree_skb(skb); return -ENODEV; } if (virtio_vsock_skb_reply(skb)) atomic_inc(&vsock->queued_replies); virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work); rcu_read_unlock(); return len; } static int vhost_transport_cancel_pkt(struct vsock_sock *vsk) { struct vhost_vsock *vsock; int cnt = 0; int ret = -ENODEV; rcu_read_lock(); /* Find the vhost_vsock according to guest context id */ vsock = vhost_vsock_get(vsk->remote_addr.svm_cid); if (!vsock) goto out; cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue); if (cnt) { struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; int new_cnt; new_cnt = atomic_sub_return(cnt, &vsock->queued_replies); if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num) vhost_poll_queue(&tx_vq->poll); } ret = 0; out: rcu_read_unlock(); return ret; } static struct sk_buff * vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, unsigned int out, unsigned int in) { struct virtio_vsock_hdr *hdr; struct iov_iter iov_iter; struct sk_buff *skb; size_t payload_len; size_t nbytes; size_t len; if (in != 0) { vq_err(vq, "Expected 0 input buffers, got %u\n", in); return NULL; } len = iov_length(vq->iov, out); /* len contains both payload and hdr */ skb = virtio_vsock_alloc_skb(len, GFP_KERNEL); if (!skb) return NULL; iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len); hdr = virtio_vsock_hdr(skb); nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter); if (nbytes != sizeof(*hdr)) { vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n", sizeof(*hdr), nbytes); kfree_skb(skb); return NULL; } payload_len = le32_to_cpu(hdr->len); /* No payload */ if (!payload_len) return skb; /* The pkt is too big or the length in the header is invalid */ if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || payload_len + sizeof(*hdr) > len) { kfree_skb(skb); return NULL; } virtio_vsock_skb_rx_put(skb); nbytes = copy_from_iter(skb->data, payload_len, &iov_iter); if (nbytes != payload_len) { vq_err(vq, "Expected %zu byte payload, got %zu bytes\n", payload_len, nbytes); kfree_skb(skb); return NULL; } return skb; } /* Is there space left for replies to rx packets? */ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock) { struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX]; int val; smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */ val = atomic_read(&vsock->queued_replies); return val < vq->num; } static bool vhost_transport_msgzerocopy_allow(void) { return true; } static bool vhost_transport_seqpacket_allow(u32 remote_cid); static struct virtio_transport vhost_transport = { .transport = { .module = THIS_MODULE, .get_local_cid = vhost_transport_get_local_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, .cancel_pkt = vhost_transport_cancel_pkt, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_dequeue = virtio_transport_dgram_dequeue, .dgram_bind = virtio_transport_dgram_bind, .dgram_allow = virtio_transport_dgram_allow, .stream_enqueue = virtio_transport_stream_enqueue, .stream_dequeue = virtio_transport_stream_dequeue, .stream_has_data = virtio_transport_stream_has_data, .stream_has_space = virtio_transport_stream_has_space, .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, .seqpacket_allow = vhost_transport_seqpacket_allow, .seqpacket_has_data = virtio_transport_seqpacket_has_data, .msgzerocopy_allow = vhost_transport_msgzerocopy_allow, .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, .notify_send_init = virtio_transport_notify_send_init, .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .unsent_bytes = virtio_transport_unsent_bytes, .read_skb = virtio_transport_read_skb, }, .send_pkt = vhost_transport_send_pkt, }; static bool vhost_transport_seqpacket_allow(u32 remote_cid) { struct vhost_vsock *vsock; bool seqpacket_allow = false; rcu_read_lock(); vsock = vhost_vsock_get(remote_cid); if (vsock) seqpacket_allow = vsock->seqpacket_allow; rcu_read_unlock(); return seqpacket_allow; } static void vhost_vsock_handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, dev); int head, pkts = 0, total_len = 0; unsigned int out, in; struct sk_buff *skb; bool added = false; mutex_lock(&vq->mutex); if (!vhost_vq_get_backend(vq)) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&vsock->dev, vq); do { struct virtio_vsock_hdr *hdr; if (!vhost_vsock_more_replies(vsock)) { /* Stop tx until the device processes already * pending replies. Leave tx virtqueue * callbacks disabled. */ goto no_more_replies; } head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); if (head < 0) break; if (head == vq->num) { if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { vhost_disable_notify(&vsock->dev, vq); continue; } break; } skb = vhost_vsock_alloc_skb(vq, out, in); if (!skb) { vq_err(vq, "Faulted on pkt\n"); continue; } total_len += sizeof(*hdr) + skb->len; /* Deliver to monitoring devices all received packets */ virtio_transport_deliver_tap_pkt(skb); hdr = virtio_vsock_hdr(skb); /* Only accept correctly addressed packets */ if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid && le64_to_cpu(hdr->dst_cid) == vhost_transport_get_local_cid()) virtio_transport_recv_pkt(&vhost_transport, skb); else kfree_skb(skb); vhost_add_used(vq, head, 0); added = true; } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); no_more_replies: if (added) vhost_signal(&vsock->dev, vq); out: mutex_unlock(&vq->mutex); } static void vhost_vsock_handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, dev); vhost_transport_do_send_pkt(vsock, vq); } static int vhost_vsock_start(struct vhost_vsock *vsock) { struct vhost_virtqueue *vq; size_t i; int ret; mutex_lock(&vsock->dev.mutex); ret = vhost_dev_check_owner(&vsock->dev); if (ret) goto err; for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { vq = &vsock->vqs[i]; mutex_lock(&vq->mutex); if (!vhost_vq_access_ok(vq)) { ret = -EFAULT; goto err_vq; } if (!vhost_vq_get_backend(vq)) { vhost_vq_set_backend(vq, vsock); ret = vhost_vq_init_access(vq); if (ret) goto err_vq; } mutex_unlock(&vq->mutex); } /* Some packets may have been queued before the device was started, * let's kick the send worker to send them. */ vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work); mutex_unlock(&vsock->dev.mutex); return 0; err_vq: vhost_vq_set_backend(vq, NULL); mutex_unlock(&vq->mutex); for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { vq = &vsock->vqs[i]; mutex_lock(&vq->mutex); vhost_vq_set_backend(vq, NULL); mutex_unlock(&vq->mutex); } err: mutex_unlock(&vsock->dev.mutex); return ret; } static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner) { size_t i; int ret = 0; mutex_lock(&vsock->dev.mutex); if (check_owner) { ret = vhost_dev_check_owner(&vsock->dev); if (ret) goto err; } for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { struct vhost_virtqueue *vq = &vsock->vqs[i]; mutex_lock(&vq->mutex); vhost_vq_set_backend(vq, NULL); mutex_unlock(&vq->mutex); } err: mutex_unlock(&vsock->dev.mutex); return ret; } static void vhost_vsock_free(struct vhost_vsock *vsock) { kvfree(vsock); } static int vhost_vsock_dev_open(struct inode *inode, struct file *file) { struct vhost_virtqueue **vqs; struct vhost_vsock *vsock; int ret; /* This struct is large and allocation could fail, fall back to vmalloc * if there is no other way. */ vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!vsock) return -ENOMEM; vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL); if (!vqs) { ret = -ENOMEM; goto out; } vsock->guest_cid = 0; /* no CID assigned yet */ vsock->seqpacket_allow = false; atomic_set(&vsock->queued_replies, 0); vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX]; vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX]; vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick; vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick; vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs), UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT, VHOST_VSOCK_WEIGHT, true, NULL); file->private_data = vsock; skb_queue_head_init(&vsock->send_pkt_queue); vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work); return 0; out: vhost_vsock_free(vsock); return ret; } static void vhost_vsock_flush(struct vhost_vsock *vsock) { vhost_dev_flush(&vsock->dev); } static void vhost_vsock_reset_orphans(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); /* vmci_transport.c doesn't take sk_lock here either. At least we're * under vsock_table_lock so the sock cannot disappear while we're * executing. */ /* If the peer is still valid, no need to reset connection */ if (vhost_vsock_get(vsk->remote_addr.svm_cid)) return; /* If the close timeout is pending, let it expire. This avoids races * with the timeout callback. */ if (vsk->close_work_scheduled) return; sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; sk->sk_state = SS_UNCONNECTED; sk->sk_err = ECONNRESET; sk_error_report(sk); } static int vhost_vsock_dev_release(struct inode *inode, struct file *file) { struct vhost_vsock *vsock = file->private_data; mutex_lock(&vhost_vsock_mutex); if (vsock->guest_cid) hash_del_rcu(&vsock->hash); mutex_unlock(&vhost_vsock_mutex); /* Wait for other CPUs to finish using vsock */ synchronize_rcu(); /* Iterating over all connections for all CIDs to find orphans is * inefficient. Room for improvement here. */ vsock_for_each_connected_socket(&vhost_transport.transport, vhost_vsock_reset_orphans); /* Don't check the owner, because we are in the release path, so we * need to stop the vsock device in any case. * vhost_vsock_stop() can not fail in this case, so we don't need to * check the return code. */ vhost_vsock_stop(vsock, false); vhost_vsock_flush(vsock); vhost_dev_stop(&vsock->dev); virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue); vhost_dev_cleanup(&vsock->dev); kfree(vsock->dev.vqs); vhost_vsock_free(vsock); return 0; } static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid) { struct vhost_vsock *other; /* Refuse reserved CIDs */ if (guest_cid <= VMADDR_CID_HOST || guest_cid == U32_MAX) return -EINVAL; /* 64-bit CIDs are not yet supported */ if (guest_cid > U32_MAX) return -EINVAL; /* Refuse if CID is assigned to the guest->host transport (i.e. nested * VM), to make the loopback work. */ if (vsock_find_cid(guest_cid)) return -EADDRINUSE; /* Refuse if CID is already in use */ mutex_lock(&vhost_vsock_mutex); other = vhost_vsock_get(guest_cid); if (other && other != vsock) { mutex_unlock(&vhost_vsock_mutex); return -EADDRINUSE; } if (vsock->guest_cid) hash_del_rcu(&vsock->hash); vsock->guest_cid = guest_cid; hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid); mutex_unlock(&vhost_vsock_mutex); return 0; } static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) { struct vhost_virtqueue *vq; int i; if (features & ~VHOST_VSOCK_FEATURES) return -EOPNOTSUPP; mutex_lock(&vsock->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(&vsock->dev)) { goto err; } if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { if (vhost_init_device_iotlb(&vsock->dev)) goto err; } vsock->seqpacket_allow = features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET); for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { vq = &vsock->vqs[i]; mutex_lock(&vq->mutex); vq->acked_features = features; mutex_unlock(&vq->mutex); } mutex_unlock(&vsock->dev.mutex); return 0; err: mutex_unlock(&vsock->dev.mutex); return -EFAULT; } static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { struct vhost_vsock *vsock = f->private_data; void __user *argp = (void __user *)arg; u64 guest_cid; u64 features; int start; int r; switch (ioctl) { case VHOST_VSOCK_SET_GUEST_CID: if (copy_from_user(&guest_cid, argp, sizeof(guest_cid))) return -EFAULT; return vhost_vsock_set_cid(vsock, guest_cid); case VHOST_VSOCK_SET_RUNNING: if (copy_from_user(&start, argp, sizeof(start))) return -EFAULT; if (start) return vhost_vsock_start(vsock); else return vhost_vsock_stop(vsock, true); case VHOST_GET_FEATURES: features = VHOST_VSOCK_FEATURES; if (copy_to_user(argp, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, argp, sizeof(features))) return -EFAULT; return vhost_vsock_set_features(vsock, features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_VSOCK_BACKEND_FEATURES; if (copy_to_user(argp, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_BACKEND_FEATURES: if (copy_from_user(&features, argp, sizeof(features))) return -EFAULT; if (features & ~VHOST_VSOCK_BACKEND_FEATURES) return -EOPNOTSUPP; vhost_set_backend_features(&vsock->dev, features); return 0; default: mutex_lock(&vsock->dev.mutex); r = vhost_dev_ioctl(&vsock->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&vsock->dev, ioctl, argp); else vhost_vsock_flush(vsock); mutex_unlock(&vsock->dev.mutex); return r; } } static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct vhost_vsock *vsock = file->private_data; struct vhost_dev *dev = &vsock->dev; int noblock = file->f_flags & O_NONBLOCK; return vhost_chr_read_iter(dev, to, noblock); } static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhost_vsock *vsock = file->private_data; struct vhost_dev *dev = &vsock->dev; return vhost_chr_write_iter(dev, from); } static __poll_t vhost_vsock_chr_poll(struct file *file, poll_table *wait) { struct vhost_vsock *vsock = file->private_data; struct vhost_dev *dev = &vsock->dev; return vhost_chr_poll(file, dev, wait); } static const struct file_operations vhost_vsock_fops = { .owner = THIS_MODULE, .open = vhost_vsock_dev_open, .release = vhost_vsock_dev_release, .llseek = noop_llseek, .unlocked_ioctl = vhost_vsock_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, .read_iter = vhost_vsock_chr_read_iter, .write_iter = vhost_vsock_chr_write_iter, .poll = vhost_vsock_chr_poll, }; static struct miscdevice vhost_vsock_misc = { .minor = VHOST_VSOCK_MINOR, .name = "vhost-vsock", .fops = &vhost_vsock_fops, }; static int __init vhost_vsock_init(void) { int ret; ret = vsock_core_register(&vhost_transport.transport, VSOCK_TRANSPORT_F_H2G); if (ret < 0) return ret; ret = misc_register(&vhost_vsock_misc); if (ret) { vsock_core_unregister(&vhost_transport.transport); return ret; } return 0; }; static void __exit vhost_vsock_exit(void) { misc_deregister(&vhost_vsock_misc); vsock_core_unregister(&vhost_transport.transport); }; module_init(vhost_vsock_init); module_exit(vhost_vsock_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); MODULE_DESCRIPTION("vhost transport for vsock "); MODULE_ALIAS_MISCDEV(VHOST_VSOCK_MINOR); MODULE_ALIAS("devname:vhost-vsock"); |
27 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Helpers for initial module or kernel cmdline parsing * Copyright (C) 2001 Rusty Russell. */ #include <linux/ctype.h> #include <linux/device.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/kstrtox.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/overflow.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/string.h> #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ static DEFINE_MUTEX(param_lock); /* Use the module's mutex, or if built-in use the built-in mutex */ #ifdef CONFIG_MODULES #define KPARAM_MUTEX(mod) ((mod) ? &(mod)->param_lock : ¶m_lock) #else #define KPARAM_MUTEX(mod) (¶m_lock) #endif static inline void check_kparam_locked(struct module *mod) { BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod))); } #else static inline void check_kparam_locked(struct module *mod) { } #endif /* !CONFIG_SYSFS */ /* This just allows us to keep track of which parameters are kmalloced. */ struct kmalloced_param { struct list_head list; char val[]; }; static LIST_HEAD(kmalloced_params); static DEFINE_SPINLOCK(kmalloced_params_lock); static void *kmalloc_parameter(unsigned int size) { struct kmalloced_param *p; p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL); if (!p) return NULL; spin_lock(&kmalloced_params_lock); list_add(&p->list, &kmalloced_params); spin_unlock(&kmalloced_params_lock); return p->val; } /* Does nothing if parameter wasn't kmalloced above. */ static void maybe_kfree_parameter(void *param) { struct kmalloced_param *p; spin_lock(&kmalloced_params_lock); list_for_each_entry(p, &kmalloced_params, list) { if (p->val == param) { list_del(&p->list); kfree(p); break; } } spin_unlock(&kmalloced_params_lock); } static char dash2underscore(char c) { if (c == '-') return '_'; return c; } bool parameqn(const char *a, const char *b, size_t n) { size_t i; for (i = 0; i < n; i++) { if (dash2underscore(a[i]) != dash2underscore(b[i])) return false; } return true; } bool parameq(const char *a, const char *b) { return parameqn(a, b, strlen(a)+1); } static bool param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_HWPARAM && security_locked_down(LOCKDOWN_MODULE_PARAMETERS)) return false; if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { pr_notice("Setting dangerous option %s - tainting kernel\n", kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } return true; } static int parse_one(char *param, char *val, const char *doing, const struct kernel_param *params, unsigned num_params, s16 min_level, s16 max_level, void *arg, parse_unknown_fn handle_unknown) { unsigned int i; int err; /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { if (params[i].level < min_level || params[i].level > max_level) return 0; /* No one handled NULL, so do it here. */ if (!val && !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); kernel_param_lock(params[i].mod); if (param_check_unsafe(¶ms[i])) err = params[i].ops->set(val, ¶ms[i]); else err = -EPERM; kernel_param_unlock(params[i].mod); return err; } } if (handle_unknown) { pr_debug("doing %s: %s='%s'\n", doing, param, val); return handle_unknown(param, val, doing, arg); } pr_debug("Unknown argument '%s'\n", param); return -ENOENT; } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ char *parse_args(const char *doing, char *args, const struct kernel_param *params, unsigned num, s16 min_level, s16 max_level, void *arg, parse_unknown_fn unknown) { char *param, *val, *err = NULL; /* Chew leading spaces */ args = skip_spaces(args); if (*args) pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); while (*args) { int ret; int irq_was_disabled; args = next_arg(args, ¶m, &val); /* Stop at -- */ if (!val && strcmp(param, "--") == 0) return err ?: args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, arg, unknown); if (irq_was_disabled && !irqs_disabled()) pr_warn("%s: option '%s' enabled irq's!\n", doing, param); switch (ret) { case 0: continue; case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); break; case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); break; } err = ERR_PTR(ret); } return err; } /* Lazy bastard, eh? */ #define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ return strtolfn(val, 0, (type *)kp->arg); \ } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ return scnprintf(buffer, PAGE_SIZE, format "\n", \ *((type *)kp->arg)); \ } \ const struct kernel_param_ops param_ops_##name = { \ .set = param_set_##name, \ .get = param_get_##name, \ }; \ EXPORT_SYMBOL(param_set_##name); \ EXPORT_SYMBOL(param_get_##name); \ EXPORT_SYMBOL(param_ops_##name) STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); STANDARD_PARAM_DEF(long, long, "%li", kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); int param_set_uint_minmax(const char *val, const struct kernel_param *kp, unsigned int min, unsigned int max) { unsigned int num; int ret; if (!val) return -EINVAL; ret = kstrtouint(val, 0, &num); if (ret) return ret; if (num < min || num > max) return -EINVAL; *((unsigned int *)kp->arg) = num; return 0; } EXPORT_SYMBOL_GPL(param_set_uint_minmax); int param_set_charp(const char *val, const struct kernel_param *kp) { size_t len, maxlen = 1024; len = strnlen(val, maxlen + 1); if (len == maxlen + 1) { pr_err("%s: string parameter too long\n", kp->name); return -ENOSPC; } maybe_kfree_parameter(*(char **)kp->arg); /* * This is a hack. We can't kmalloc() in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { *(char **)kp->arg = kmalloc_parameter(len + 1); if (!*(char **)kp->arg) return -ENOMEM; strcpy(*(char **)kp->arg, val); } else *(const char **)kp->arg = val; return 0; } EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, .get = param_get_charp, .free = param_free_charp, }; EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, const struct kernel_param *kp) { /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ return kstrtobool(val, kp->arg); } EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { /* Y and N chosen as being relatively non-coder friendly */ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); const struct kernel_param_ops param_ops_bool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; EXPORT_SYMBOL(param_ops_bool); int param_set_bool_enable_only(const char *val, const struct kernel_param *kp) { int err; bool new_value; bool orig_value = *(bool *)kp->arg; struct kernel_param dummy_kp = *kp; dummy_kp.arg = &new_value; err = param_set_bool(val, &dummy_kp); if (err) return err; /* Don't let them unset it once it's set! */ if (!new_value && orig_value) return -EROFS; if (new_value) err = param_set_bool(val, kp); return err; } EXPORT_SYMBOL_GPL(param_set_bool_enable_only); const struct kernel_param_ops param_ops_bool_enable_only = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; EXPORT_SYMBOL_GPL(param_ops_bool_enable_only); /* This one must be bool. */ int param_set_invbool(const char *val, const struct kernel_param *kp) { int ret; bool boolval; struct kernel_param dummy; dummy.arg = &boolval; ret = param_set_bool(val, &dummy); if (ret == 0) *(bool *)kp->arg = !boolval; return ret; } EXPORT_SYMBOL(param_set_invbool); int param_get_invbool(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); } EXPORT_SYMBOL(param_get_invbool); const struct kernel_param_ops param_ops_invbool = { .set = param_set_invbool, .get = param_get_invbool, }; EXPORT_SYMBOL(param_ops_invbool); int param_set_bint(const char *val, const struct kernel_param *kp) { /* Match bool exactly, by re-using it. */ struct kernel_param boolkp = *kp; bool v; int ret; boolkp.arg = &v; ret = param_set_bool(val, &boolkp); if (ret == 0) *(int *)kp->arg = v; return ret; } EXPORT_SYMBOL(param_set_bint); const struct kernel_param_ops param_ops_bint = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; EXPORT_SYMBOL(param_ops_bint); /* We break the rule and mangle the string. */ static int param_array(struct module *mod, const char *name, const char *val, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, const struct kernel_param *kp), s16 level, unsigned int *num) { int ret; struct kernel_param kp; char save; /* Get the name right for errors. */ kp.name = name; kp.arg = elem; kp.level = level; *num = 0; /* We expect a comma-separated list of values. */ do { int len; if (*num == max) { pr_err("%s: can only take %i arguments\n", name, max); return -EINVAL; } len = strcspn(val, ","); /* nul-terminate and parse */ save = val[len]; ((char *)val)[len] = '\0'; check_kparam_locked(mod); ret = set(val, &kp); if (ret != 0) return ret; kp.arg += elemsize; val += len+1; (*num)++; } while (save == ','); if (*num < min) { pr_err("%s: needs at least %i arguments\n", name, min); return -EINVAL; } return 0; } static int param_array_set(const char *val, const struct kernel_param *kp) { const struct kparam_array *arr = kp->arr; unsigned int temp_num; return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem, arr->elemsize, arr->ops->set, kp->level, arr->num ?: &temp_num); } static int param_array_get(char *buffer, const struct kernel_param *kp) { int i, off, ret; const struct kparam_array *arr = kp->arr; struct kernel_param p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { /* Replace \n with comma */ if (i) buffer[off - 1] = ','; p.arg = arr->elem + arr->elemsize * i; check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); if (ret < 0) return ret; off += ret; } buffer[off] = '\0'; return off; } static void param_array_free(void *arg) { unsigned int i; const struct kparam_array *arr = arg; if (arr->ops->free) for (i = 0; i < (arr->num ? *arr->num : arr->max); i++) arr->ops->free(arr->elem + arr->elemsize * i); } const struct kernel_param_ops param_array_ops = { .set = param_array_set, .get = param_array_get, .free = param_array_free, }; EXPORT_SYMBOL(param_array_ops); int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; if (strnlen(val, kps->maxlen) == kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; } strcpy(kps->string, val); return 0; } EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); } EXPORT_SYMBOL(param_get_string); const struct kernel_param_ops param_ops_string = { .set = param_set_copystring, .get = param_get_string, }; EXPORT_SYMBOL(param_ops_string); /* sysfs output in /sys/modules/XYZ/parameters/ */ #define to_module_attr(n) container_of(n, struct module_attribute, attr) #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) struct param_attribute { struct module_attribute mattr; const struct kernel_param *param; }; struct module_param_attrs { unsigned int num; struct attribute_group grp; struct param_attribute attrs[]; }; #ifdef CONFIG_SYSFS #define to_param_attr(n) container_of(n, struct param_attribute, mattr) static ssize_t param_attr_show(struct module_attribute *mattr, struct module_kobject *mk, char *buf) { int count; struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->get) return -EPERM; kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); kernel_param_unlock(mk->mod); return count; } /* sysfs always hands a nul-terminated string in buf. We rely on that. */ static ssize_t param_attr_store(struct module_attribute *mattr, struct module_kobject *mk, const char *buf, size_t len) { int err; struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->set) return -EPERM; kernel_param_lock(mk->mod); if (param_check_unsafe(attribute->param)) err = attribute->param->ops->set(buf, attribute->param); else err = -EPERM; kernel_param_unlock(mk->mod); if (!err) return len; return err; } #endif #ifdef CONFIG_MODULES #define __modinit #else #define __modinit __init #endif #ifdef CONFIG_SYSFS void kernel_param_lock(struct module *mod) { mutex_lock(KPARAM_MUTEX(mod)); } void kernel_param_unlock(struct module *mod) { mutex_unlock(KPARAM_MUTEX(mod)); } EXPORT_SYMBOL(kernel_param_lock); EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject * @kp: the actual parameter definition to add to sysfs * @name: name of parameter * * Create a kobject if for a (per-module) parameter if mp NULL, and * create file in sysfs. Returns an error on out of memory. Always cleans up * if there's an error. */ static __modinit int add_sysfs_param(struct module_kobject *mk, const struct kernel_param *kp, const char *name) { struct module_param_attrs *new_mp; struct attribute **new_attrs; unsigned int i; /* We don't bother calling this with invisible parameters. */ BUG_ON(!kp->perm); if (!mk->mp) { /* First allocation. */ mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL); if (!mk->mp) return -ENOMEM; mk->mp->grp.name = "parameters"; /* NULL-terminated attribute array. */ mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL); /* Caller will cleanup via free_module_param_attrs */ if (!mk->mp->grp.attrs) return -ENOMEM; } /* Enlarge allocations. */ new_mp = krealloc(mk->mp, sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), GFP_KERNEL); if (!new_mp) return -ENOMEM; mk->mp = new_mp; /* Extra pointer for NULL terminator */ new_attrs = krealloc(mk->mp->grp.attrs, sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), GFP_KERNEL); if (!new_attrs) return -ENOMEM; mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); mk->mp->attrs[mk->mp->num].param = kp; mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. */ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; else mk->mp->attrs[mk->mp->num].mattr.store = NULL; mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; mk->mp->num++; /* Fix up all the pointers, since krealloc can move us */ for (i = 0; i < mk->mp->num; i++) mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; mk->mp->grp.attrs[mk->mp->num] = NULL; return 0; } #ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { if (mk->mp) kfree(mk->mp->grp.attrs); kfree(mk->mp); mk->mp = NULL; } /* * module_param_sysfs_setup - setup sysfs support for one module * @mod: module * @kparam: module parameters (array) * @num_params: number of module parameters * * Adds sysfs entries for module parameters under * /sys/module/[mod->name]/parameters/ */ int module_param_sysfs_setup(struct module *mod, const struct kernel_param *kparam, unsigned int num_params) { int i, err; bool params = false; for (i = 0; i < num_params; i++) { if (kparam[i].perm == 0) continue; err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); if (err) { free_module_param_attrs(&mod->mkobj); return err; } params = true; } if (!params) return 0; /* Create the param group. */ err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); if (err) free_module_param_attrs(&mod->mkobj); return err; } /* * module_param_sysfs_remove - remove sysfs support for one module * @mod: module * * Remove sysfs entries for module parameters and the corresponding * kobject. */ void module_param_sysfs_remove(struct module *mod) { if (mod->mkobj.mp) { sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); /* * We are positive that no one is using any param * attrs at this point. Deallocate immediately. */ free_module_param_attrs(&mod->mkobj); } } #endif void destroy_params(const struct kernel_param *params, unsigned num) { unsigned int i; for (i = 0; i < num; i++) if (params[i].ops->free) params[i].ops->free(params[i].arg); } static struct module_kobject * __init locate_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; int err; kobj = kset_find_obj(module_kset, name); if (kobj) { mk = to_module_kobject(kobj); } else { mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); mk->mod = THIS_MODULE; mk->kobj.kset = module_kset; err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); #ifdef CONFIG_MODULES if (!err) err = sysfs_create_file(&mk->kobj, &module_uevent.attr); #endif if (err) { kobject_put(&mk->kobj); pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", name, err); return NULL; } /* So that we hold reference in both cases. */ kobject_get(&mk->kobj); } return mk; } static void __init kernel_add_sysfs_param(const char *name, const struct kernel_param *kparam, unsigned int name_skip) { struct module_kobject *mk; int err; mk = locate_module_kobject(name); if (!mk) return; /* We need to remove old parameters before adding more. */ if (mk->mp) sysfs_remove_group(&mk->kobj, &mk->mp->grp); /* These should not fail at boot. */ err = add_sysfs_param(mk, kparam, kparam->name + name_skip); BUG_ON(err); err = sysfs_create_group(&mk->kobj, &mk->mp->grp); BUG_ON(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } /* * param_sysfs_builtin - add sysfs parameters for built-in modules * * Add module_parameters to sysfs for "modules" built into the kernel. * * The "module" name (KBUILD_MODNAME) is stored before a dot, the * "parameter" name is stored behind a dot in kernel_param->name. So, * extract the "module" name for all built-in kernel_param-eters, * and for all who have the same, call kernel_add_sysfs_param. */ static void __init param_sysfs_builtin(void) { const struct kernel_param *kp; unsigned int name_len; char modname[MODULE_NAME_LEN]; for (kp = __start___param; kp < __stop___param; kp++) { char *dot; if (kp->perm == 0) continue; dot = strchr(kp->name, '.'); if (!dot) { /* This happens for core_param() */ strcpy(modname, "kernel"); name_len = 0; } else { name_len = dot - kp->name + 1; strscpy(modname, kp->name, name_len); } kernel_add_sysfs_param(modname, kp, name_len); } } ssize_t __modver_version_show(struct module_attribute *mattr, struct module_kobject *mk, char *buf) { struct module_version_attribute *vattr = container_of(mattr, struct module_version_attribute, mattr); return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } extern const struct module_version_attribute __start___modver[]; extern const struct module_version_attribute __stop___modver[]; static void __init version_sysfs_builtin(void) { const struct module_version_attribute *vattr; struct module_kobject *mk; int err; for (vattr = __start___modver; vattr < __stop___modver; vattr++) { mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); WARN_ON_ONCE(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } } } /* module-related sysfs stuff */ static ssize_t module_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct module_attribute *attribute; struct module_kobject *mk; int ret; attribute = to_module_attr(attr); mk = to_module_kobject(kobj); if (!attribute->show) return -EIO; ret = attribute->show(attribute, mk, buf); return ret; } static ssize_t module_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct module_attribute *attribute; struct module_kobject *mk; int ret; attribute = to_module_attr(attr); mk = to_module_kobject(kobj); if (!attribute->store) return -EIO; ret = attribute->store(attribute, mk, buf, len); return ret; } static const struct sysfs_ops module_sysfs_ops = { .show = module_attr_show, .store = module_attr_store, }; static int uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &module_ktype) return 1; return 0; } static const struct kset_uevent_ops module_uevent_ops = { .filter = uevent_filter, }; struct kset *module_kset; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); complete(mk->kobj_completion); } const struct kobj_type module_ktype = { .release = module_kobj_release, .sysfs_ops = &module_sysfs_ops, }; /* * param_sysfs_init - create "module" kset * * This must be done before the initramfs is unpacked and * request_module() thus becomes possible, because otherwise the * module load would fail in mod_sysfs_init. */ static int __init param_sysfs_init(void) { module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); if (!module_kset) { printk(KERN_WARNING "%s (%d): error creating kset\n", __FILE__, __LINE__); return -ENOMEM; } return 0; } subsys_initcall(param_sysfs_init); /* * param_sysfs_builtin_init - add sysfs version and parameter * attributes for built-in modules */ static int __init param_sysfs_builtin_init(void) { if (!module_kset) return -ENOMEM; version_sysfs_builtin(); param_sysfs_builtin(); return 0; } late_initcall(param_sysfs_builtin_init); #endif /* CONFIG_SYSFS */ |
80 1960 2021 2009 2020 252 1937 51 49 2037 1887 1906 51 51 50 51 51 51 27 51 51 49 4 110 51 51 185 187 184 95 72 25 7 102 1 2010 2037 2019 2016 1899 1884 11 1889 26 2012 1885 2015 2021 1507 1402 2035 25 2021 2020 2037 188 2142 2166 1862 420 425 1074 1074 1074 1075 1031 1338 2152 1874 1653 50 50 1 4 4 4 4 3 1 1 2 4 2 2 6 1 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * * In contrast to the low-resolution timeout API, aka timer wheel, * hrtimers provide finer resolution and accuracy depending on system * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. */ #include <linux/cpu.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <trace/events/timer.h> #include "tick-internal.h" /* * Masks for selecting the soft and hard context timers from * cpu_base->active */ #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) /* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { /* Make sure we catch unsupported clockids */ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { .clock_base = { { .cpu_base = &migration_cpu_base, .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, &migration_cpu_base.lock), }, }, }; #define migration_base migration_cpu_base.clock_base[0] static inline bool is_migration_base(struct hrtimer_clock_base *base) { return base == &migration_base; } /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->lock) { struct hrtimer_clock_base *base; for (;;) { base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } } /* * We do not migrate the timer when it is expiring before the next * event on the target cpu. When high resolution is enabled, we cannot * reprogram the target cpu hardware and we would cause it to fire * late. To keep it simple, we handle the high resolution enabled and * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires < new_base->cpu_base->expires_next; } static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); #endif return base; } /* * We switch the timer base to a power-optimized selected CPU target, * if: * - NO_HZ_COMMON is enabled * - timer migration is enabled * - the timer callback is not running * - the timer is not the first expiring timer on the new target * * If one of the above requirements is not fulfilled we move the timer * to the current CPU or leave it on the previously assigned CPU if * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; this_cpu_base = this_cpu_ptr(&hrtimer_bases); new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq * code will take care of this when the timer function has * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_hrtimer_base() */ WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; WRITE_ONCE(timer->base, base); goto again; } WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { new_cpu_base = this_cpu_base; goto again; } } return new_base; } #else /* CONFIG_SMP */ static inline bool is_migration_base(struct hrtimer_clock_base *base) { return false; } static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } # define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ /* * Functions for the union type storage format of ktime_t which are * too large for inlining: */ #if BITS_PER_LONG < 64 /* * Divide a ktime value by a nanosecond value */ s64 __ktime_divns(const ktime_t kt, s64 div) { int sft = 0; s64 dclc; u64 tmp; dclc = ktime_to_ns(kt); tmp = dclc < 0 ? -dclc : dclc; /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } tmp >>= sft; do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* * Add two ktime values and do a safety check for overflow: */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; } EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { return ((struct hrtimer *) addr)->function; } /* * fixup_init is called when: * - an active object is initialized */ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); return true; default: return false; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); return true; default: return false; } } static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, }; static inline void debug_hrtimer_init(struct hrtimer *timer) { debug_object_init(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { debug_object_init_on_stack(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { debug_object_deactivate(timer, &hrtimer_debug_descr); } void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif static inline void debug_init(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) { debug_hrtimer_deactivate(timer); trace_hrtimer_cancel(timer); } static struct hrtimer_clock_base * __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { unsigned int idx; if (!*active) return NULL; idx = __ffs(*active); *active &= ~(1U << idx); return &cpu_base->clock_base[idx]; } #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ next = timerqueue_iterate_next(next); if (!next) continue; timer = container_of(next, struct hrtimer, node); } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; /* Skip cpu_base update if a timer is being excluded. */ if (exclude) continue; if (timer->is_soft) cpu_base->softirq_next_timer = timer; else cpu_base->next_timer = timer; } } /* * clock_was_set() might have changed base->offset of any of * the clock bases so the result might be negative. Fix it up * to prevent a false positive in clockevents_program_event(). */ if (expires_next < 0) expires_next = 0; return expires_next; } /* * Recomputes cpu_base::*next_timer and returns the earliest expires_next * but does not set cpu_base::*expires_next, that is done by * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating * cpu_base::*expires_next right away, reprogramming logic would no longer * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. * * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. * * @active_mask must be one of: * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, expires_next); } return expires_next; } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) { ktime_t expires_next, soft = KTIME_MAX; /* * If the soft interrupt has already been activated, ignore the * soft bases. They will be handled in the already raised soft * interrupt. */ if (!cpu_base->softirq_activated) { soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Update the soft expiry time. clock_settime() might have * affected it. */ cpu_base->softirq_expires_next = soft; } expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * If a softirq timer is expiring first, update cpu_base->next_timer * and program the hardware with the soft expiry time. */ if (expires_next > soft) { cpu_base->next_timer = cpu_base->softirq_next_timer; expires_next = soft; } return expires_next; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; } /* * Is the high resolution mode active ? */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; /* * If hres is not active, hardware does not have to be * reprogrammed yet. * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following * scenario: * T1 expires 50ms from now * T2 expires 5s from now * * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being * set. So we'd effectively block all timers until the T2 event * fires. */ if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); } /* * Reprogram the event source with checking both queues for the * next event * Called with interrupts disabled and base->lock held */ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer enabled ? */ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); /* * hrtimer_high_res_enabled - query, if the highres mode is enabled */ static inline int hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } static void retrigger_next_event(void *arg); /* * Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); } #else static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level * resume code. * * This is only invoked when: * - CONFIG_HIGH_RES_TIMERS is enabled. * - CONFIG_NOHZ_COMMON is enabled * * For the other cases this function is empty and because the call sites * are optimized out it vanishes as well, i.e. no need for lots of * #ifdeffery. */ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); /* * When high resolution mode or nohz is active, then the offsets of * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the * next tick will take care of that. * * If high resolution mode is active then the next expiring timer * must be reevaluated and the clock event device reprogrammed if * necessary. * * In the NOHZ case the update of the offset and the reevaluation * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. */ if (!hrtimer_hres_active(base) && !tick_nohz_active) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); raw_spin_unlock(&base->lock); } /* * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * * Called with interrupts disabled and base->cpu_base.lock held */ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ if (expires < 0) expires = 0; if (timer->is_soft) { /* * soft hrtimer could be started on a remote CPU. In this * case softirq_expires_next needs to be updated on the * remote CPU. The soft hrtimer will not expire before the * first hard hrtimer on the remote CPU - * hrtimer_check_target() prevents this case. */ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; if (timer_cpu_base->softirq_activated) return; if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) return; timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; if (expires >= cpu_base->expires_next) return; /* * If the hrtimer interrupt is running, then it will reevaluate the * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; cpu_base->next_timer = timer; __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; ktime_t expires; /* * Update the base offsets unconditionally so the following * checks whether the SMP function call is required works. * * The update is safe even when the remote CPU is in the hrtimer * interrupt or the hrtimer soft interrupt and expiring affected * bases. Either it will see the update before handling a base or * it will see it when it finishes the processing and reevaluates * the next expiring timer. */ seq = cpu_base->clock_was_set_seq; hrtimer_update_base(cpu_base); /* * If the sequence did not change over the update then the * remote CPU already handled it. */ if (seq == cpu_base->clock_was_set_seq) return false; /* * If the remote CPU is currently handling an hrtimer interrupt, it * will reevaluate the first expiring timer of all clock bases * before reprogramming. Nothing to do here. */ if (cpu_base->in_hrtirq) return false; /* * Walk the affected clock bases and check whether the first expiring * timer in a clock base is moving ahead of the first expiring timer of * @cpu_base. If so, the IPI must be invoked because per CPU clock * event devices cannot be remotely reprogrammed. */ active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; next = timerqueue_getnext(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; /* Extra check for softirq clock bases */ if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) continue; if (cpu_base->softirq_activated) continue; if (expires < cpu_base->softirq_expires_next) return true; } return false; } /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). * * This requires to update the offsets for these clocks * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this * also requires to eventually reprogram the per CPU clock event devices * when the change moves an affected timer ahead of the first expiring * timer on that CPU. Obviously remote per CPU clock event devices cannot * be reprogrammed. The other reason why an IPI has to be sent is when the * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. */ void clock_was_set(unsigned int bases) { struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { on_each_cpu(retrigger_next_event, NULL, 1); goto out_timerfd; } /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); raw_spin_lock_irqsave(&cpu_base->lock, flags); if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } preempt_disable(); smp_call_function_many(mask, retrigger_next_event, NULL, 1); preempt_enable(); cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* * Called from timekeeping code to reprogram the hrtimer interrupt device * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } /* * Called during resume either directly from via timekeeping_resume() * or in the case of s2idle from tick_unfreeze() to ensure that the * hrtimers are up to date. */ void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); } /* * Counterpart to lock_hrtimer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** * hrtimer_forward() - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * * .. note:: * This only updates the timer expiry value and does not requeue the timer. * * There is also a variant of the function hrtimer_forward_now(). * * Context: Can be safely called from the callback function of @timer. If called * from other contexts @timer must neither be enqueued nor running the * callback and the caller needs to take care of serialization. * * Return: The number of overruns are returned. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { u64 orun = 1; ktime_t delta; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) return 0; if (interval < hrtimer_resolution) interval = hrtimer_resolution; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); if (hrtimer_get_expires_tv64(timer) > now) return orun; /* * This (and the ktime_add() below) is the * correction for exact: */ orun++; } hrtimer_add_expires(timer, interval); return orun; } EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * * Returns 1 when the new timer is the leftmost timer in the tree. */ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); } /* * remove hrtimer, called with base lock held */ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { bool reprogram; /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current * CPU. If we remove a timer on another CPU, reprogramming is * skipped. The interrupt event on this CPU is fired and * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * If the timer is not restarted then reprogramming is * required if the timer is local. If it is local and about * to be restarted, avoid programming it twice (on removal * and a moment later when it's requeued). */ if (!restart) state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { #ifdef CONFIG_TIME_LOW_RES /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { ktime_t expires; /* * Find the next SOFT expiration. */ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * reprogramming needs to be triggered, even if the next soft * hrtimer expires at the same time than the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() * cpu_base->*expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; bool force_local, first; /* * If the timer is on the local cpu base and is the first expiring * timer then this might end up reprogramming the hardware twice * (on removal and on enqueue). To avoid that by prevent the * reprogram on removal, keep the timer local to the current CPU * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); force_local &= base->cpu_base->next_timer == timer; /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. * * If it's on the current CPU and the first expiring timer, then * skip reprogramming, keep the timer local and enforce * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). */ remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ if (!force_local) { new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); } else { new_base = base; } first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) return first; /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the * hardware by evaluating the new first expiring timer. */ hrtimer_force_reprogram(new_base->cpu_base, 1); return 0; } /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; if (WARN_ON_ONCE(!timer->function)) return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard * expiry mode because unmarked timers are moved to softirq expiry. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); else WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; /* * Check lockless first. If the timer is not active (neither * enqueued nor running the callback, nothing to do here. The * base lock does not serialize against a concurrent enqueue, * so we can avoid taking it. */ if (!hrtimer_active(timer)) return 0; base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); return ret; } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); #ifdef CONFIG_PREEMPT_RT static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { spin_lock_init(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) __acquires(&base->softirq_expiry_lock) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) __releases(&base->softirq_expiry_lock) { spin_unlock(&base->softirq_expiry_lock); } /* * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); spin_unlock(&cpu_base->softirq_expiry_lock); spin_lock(&cpu_base->softirq_expiry_lock); raw_spin_lock_irq(&cpu_base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted * in the middle of a timer callback, then calling del_timer_sync() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer * handler to complete. This can result in unbound priority inversion. * * - If the caller originates from the task which preempted the timer * handler on the same CPU, then spin waiting for the timer handler to * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); /* * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } /* * Mark the base as contended and grab the expiry lock, which is * held by the softirq across the timer callback. Drop the lock * immediately so the softirq can expire the next timer. In theory * the timer could already be running again, but that's more than * unlikely and just causes another wait loop. */ atomic_inc(&base->cpu_base->timer_waiters); spin_lock_bh(&base->cpu_base->softirq_expiry_lock); atomic_dec(&base->cpu_base->timer_waiters); spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long flags) { } #endif /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled * * Returns: * 0 when the timer was not active * 1 when the timer was active */ int hrtimer_cancel(struct hrtimer *timer) { int ret; do { ret = hrtimer_try_to_cancel(timer); if (ret < 0) hrtimer_cancel_wait_running(timer); } while (ret < 0); return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) rem = hrtimer_expires_remaining_adjusted(timer); else rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } /** * hrtimer_next_event_without - time until next expiry event w/o one timer * @exclude: timer to exclude * * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; expires = __hrtimer_next_event_base(cpu_base, exclude, active, KTIME_MAX); } active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; expires = __hrtimer_next_event_base(cpu_base, exclude, active, expires); } raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { if (likely(clock_id < MAX_CLOCKS)) { int base = hrtimer_clock_to_base_table[clock_id]; if (likely(base != HRTIMER_MAX_CLOCK_BASES)) return base; } WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); return HRTIMER_BASE_MONOTONIC; } static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) { return HRTIMER_NORESTART; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); /* * POSIX magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they needs to become CLOCK_MONOTONIC to * ensure POSIX compliance. */ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { __hrtimer_init(timer, clock_id, mode); if (WARN_ON_ONCE(!function)) timer->function = hrtimer_dummy_timeout; else timer->function = function; } /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized * @function: the callback function * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); /** * hrtimer_setup_on_stack - initialize a timer on stack memory * @timer: The timer to be initialized * @function: the callback function * @clock_id: The clock to be used * @mode: The timer mode * * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack * memory. */ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_init_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated * to another cpu. * * It is important for this function to not return a false negative. */ bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned int seq; do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } EXPORT_SYMBOL_GPL(hrtimer_active); /* * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 * distinct sections: * * - queued: the timer is queued * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * * On the read side we ensure we observe timer->state and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. * * The sequence numbers are required because otherwise we could still observe * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the * timer is restarted with a period then it becomes an absolute * timer. If its not restarted it does not matter. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES)) timer->is_rel = false; /* * The timer is marked as running in the CPU base, so it is * protected against migration to a different CPU even if the lock * is dropped. */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); WARN_ON_ONCE(base->running != timer); base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ if (basenow < hrtimer_get_softexpires_tv64(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } } } static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); cpu_base->softirq_activated = 0; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via * the migration code. This does not affect enqueueing of * timers which run their callback and need to be requeued on * this CPU. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the [soft] next expiry */ expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. * * Acquire base lock for updating the offsets and retrieving * the current time. */ raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; /* * Give the system a chance to do something else than looping * here. We stored the entry time, so we know exactly how long * we spent here. We schedule the next event this amount of * time away. */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. */ if (delta > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; if (hrtimer_hres_active(cpu_base)) return; /* * This _is_ ugly: We have to check periodically, whether we * can switch to highres and / or nohz mode. The clocksource * switch happens with xtime_lock held. Notification from * there only sets the check bit in the tick_oneshot code, * otherwise we might deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); return; } raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* * Sleep related functions: */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; if (task) wake_up_process(task); return HRTIMER_NORESTART; } /** * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer * @sl: sleeper to be started * @mode: timer mode abs/rel * * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because * __hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other * functions which are not suitable for hard interrupt context on * PREEMPT_RT. * * The hrtimer_sleeper callback is RT compatible in hard interrupt * context, but there is a latency concern: Untrusted userspace can * spawn many threads which arm timers for the same expiry time on * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. * * OTOH, privileged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } /** * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_init_on_stack(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: BUG(); } return -ERESTART_RESTARTBLOCK; } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; do { set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); __set_current_state(TASK_RUNNING); if (!t->task) return 0; restart = ¤t->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); struct timespec64 rmt; if (rem <= 0) return 0; rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; hrtimer_setup_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_MODE_ABS) { ret = -ERESTARTNOHAND; goto out; } restart = ¤t->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); return ret; } #ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; if (get_timespec64(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif /* * Functions related to boot-time initialization: */ int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct timerqueue_node *node; while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not * reprogram the event device in case the timer * expires before the earliest on this CPU, but we run * hrtimer_interrupt after we migrated everything to * sort out already expired timers and reprogram the * event device. */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; old_base = this_cpu_ptr(&hrtimer_bases); new_base = &per_cpu(hrtimer_bases, ncpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } /* * The migration might have changed the first expiring softirq * timer on this CPU. Update it. */ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; } #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } |
1165 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ipi #if !defined(_TRACE_IPI_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IPI_H #include <linux/tracepoint.h> /** * ipi_raise - called when a smp cross call is made * * @mask: mask of recipient CPUs for the IPI * @reason: string identifying the IPI purpose * * It is necessary for @reason to be a static string declared with * __tracepoint_string. */ TRACE_EVENT(ipi_raise, TP_PROTO(const struct cpumask *mask, const char *reason), TP_ARGS(mask, reason), TP_STRUCT__entry( __bitmask(target_cpus, nr_cpumask_bits) __field(const char *, reason) ), TP_fast_assign( __assign_bitmask(target_cpus, cpumask_bits(mask), nr_cpumask_bits); __entry->reason = reason; ), TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason) ); TRACE_EVENT(ipi_send_cpu, TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback), TP_ARGS(cpu, callsite, callback), TP_STRUCT__entry( __field(unsigned int, cpu) __field(void *, callsite) __field(void *, callback) ), TP_fast_assign( __entry->cpu = cpu; __entry->callsite = (void *)callsite; __entry->callback = callback; ), TP_printk("cpu=%u callsite=%pS callback=%pS", __entry->cpu, __entry->callsite, __entry->callback) ); TRACE_EVENT(ipi_send_cpumask, TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback), TP_ARGS(cpumask, callsite, callback), TP_STRUCT__entry( __cpumask(cpumask) __field(void *, callsite) __field(void *, callback) ), TP_fast_assign( __assign_cpumask(cpumask, cpumask_bits(cpumask)); __entry->callsite = (void *)callsite; __entry->callback = callback; ), TP_printk("cpumask=%s callsite=%pS callback=%pS", __get_cpumask(cpumask), __entry->callsite, __entry->callback) ); DECLARE_EVENT_CLASS(ipi_handler, TP_PROTO(const char *reason), TP_ARGS(reason), TP_STRUCT__entry( __field(const char *, reason) ), TP_fast_assign( __entry->reason = reason; ), TP_printk("(%s)", __entry->reason) ); /** * ipi_entry - called immediately before the IPI handler * * @reason: string identifying the IPI purpose * * It is necessary for @reason to be a static string declared with * __tracepoint_string, ideally the same as used with trace_ipi_raise * for that IPI. */ DEFINE_EVENT(ipi_handler, ipi_entry, TP_PROTO(const char *reason), TP_ARGS(reason) ); /** * ipi_exit - called immediately after the IPI handler returns * * @reason: string identifying the IPI purpose * * It is necessary for @reason to be a static string declared with * __tracepoint_string, ideally the same as used with trace_ipi_raise for * that IPI. */ DEFINE_EVENT(ipi_handler, ipi_exit, TP_PROTO(const char *reason), TP_ARGS(reason) ); #endif /* _TRACE_IPI_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
5 5 5 5 5 5 17 17 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 | // SPDX-License-Identifier: GPL-2.0-or-later /* SHA-512 code by Jean-Luc Cooke <jlcooke@certainkey.com> * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> */ #include <crypto/internal/hash.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/crypto.h> #include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <linux/percpu.h> #include <asm/byteorder.h> #include <linux/unaligned.h> const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE] = { 0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38, 0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a, 0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43, 0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda, 0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb, 0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b }; EXPORT_SYMBOL_GPL(sha384_zero_message_hash); const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE] = { 0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd, 0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07, 0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc, 0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce, 0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0, 0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f, 0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81, 0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e }; EXPORT_SYMBOL_GPL(sha512_zero_message_hash); static inline u64 Ch(u64 x, u64 y, u64 z) { return z ^ (x & (y ^ z)); } static inline u64 Maj(u64 x, u64 y, u64 z) { return (x & y) | (z & (x | y)); } static const u64 sha512_K[80] = { 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, }; #define e0(x) (ror64(x,28) ^ ror64(x,34) ^ ror64(x,39)) #define e1(x) (ror64(x,14) ^ ror64(x,18) ^ ror64(x,41)) #define s0(x) (ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7)) #define s1(x) (ror64(x,19) ^ ror64(x,61) ^ (x >> 6)) static inline void LOAD_OP(int I, u64 *W, const u8 *input) { W[I] = get_unaligned_be64((__u64 *)input + I); } static inline void BLEND_OP(int I, u64 *W) { W[I & 15] += s1(W[(I-2) & 15]) + W[(I-7) & 15] + s0(W[(I-15) & 15]); } static void sha512_transform(u64 *state, const u8 *input) { u64 a, b, c, d, e, f, g, h, t1, t2; int i; u64 W[16]; /* load the state into our registers */ a=state[0]; b=state[1]; c=state[2]; d=state[3]; e=state[4]; f=state[5]; g=state[6]; h=state[7]; /* now iterate */ for (i=0; i<80; i+=8) { if (!(i & 8)) { int j; if (i < 16) { /* load the input */ for (j = 0; j < 16; j++) LOAD_OP(i + j, W, input); } else { for (j = 0; j < 16; j++) { BLEND_OP(i + j, W); } } } t1 = h + e1(e) + Ch(e,f,g) + sha512_K[i ] + W[(i & 15)]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + sha512_K[i+1] + W[(i & 15) + 1]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + sha512_K[i+2] + W[(i & 15) + 2]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + e1(b) + Ch(b,c,d) + sha512_K[i+3] + W[(i & 15) + 3]; t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + sha512_K[i+4] + W[(i & 15) + 4]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + sha512_K[i+5] + W[(i & 15) + 5]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + sha512_K[i+6] + W[(i & 15) + 6]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + sha512_K[i+7] + W[(i & 15) + 7]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; } state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; state[5] += f; state[6] += g; state[7] += h; } static void sha512_generic_block_fn(struct sha512_state *sst, u8 const *src, int blocks) { while (blocks--) { sha512_transform(sst->state, src); src += SHA512_BLOCK_SIZE; } } int crypto_sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_base_do_update(desc, data, len, sha512_generic_block_fn); } EXPORT_SYMBOL(crypto_sha512_update); static int sha512_final(struct shash_desc *desc, u8 *hash) { sha512_base_do_finalize(desc, sha512_generic_block_fn); return sha512_base_finish(desc, hash); } int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash) { sha512_base_do_update(desc, data, len, sha512_generic_block_fn); return sha512_final(desc, hash); } EXPORT_SYMBOL(crypto_sha512_finup); static struct shash_alg sha512_algs[2] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = crypto_sha512_update, .final = sha512_final, .finup = crypto_sha512_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-generic", .cra_priority = 100, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = crypto_sha512_update, .final = sha512_final, .finup = crypto_sha512_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-generic", .cra_priority = 100, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int __init sha512_generic_mod_init(void) { return crypto_register_shashes(sha512_algs, ARRAY_SIZE(sha512_algs)); } static void __exit sha512_generic_mod_fini(void) { crypto_unregister_shashes(sha512_algs, ARRAY_SIZE(sha512_algs)); } subsys_initcall(sha512_generic_mod_init); module_exit(sha512_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA-512 and SHA-384 Secure Hash Algorithms"); MODULE_ALIAS_CRYPTO("sha384"); MODULE_ALIAS_CRYPTO("sha384-generic"); MODULE_ALIAS_CRYPTO("sha512"); MODULE_ALIAS_CRYPTO("sha512-generic"); |
19 207 213 184 210 19 9 9 3 9 2 4 2 1 9 5 7 4 172 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2023 Isovalent */ #ifndef __NET_TCX_H #define __NET_TCX_H #include <linux/bpf.h> #include <linux/bpf_mprog.h> #include <net/sch_generic.h> struct mini_Qdisc; struct tcx_entry { struct mini_Qdisc __rcu *miniq; struct bpf_mprog_bundle bundle; u32 miniq_active; struct rcu_head rcu; }; struct tcx_link { struct bpf_link link; struct net_device *dev; u32 location; }; static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) { #ifdef CONFIG_NET_XGRESS skb->tc_at_ingress = ingress; #endif } #ifdef CONFIG_NET_XGRESS static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry) { struct bpf_mprog_bundle *bundle = entry->parent; return container_of(bundle, struct tcx_entry, bundle); } static inline struct tcx_link *tcx_link(const struct bpf_link *link) { return container_of(link, struct tcx_link, link); } void tcx_inc(void); void tcx_dec(void); static inline void tcx_entry_sync(void) { /* bpf_mprog_entry got a/b swapped, therefore ensure that * there are no inflight users on the old one anymore. */ synchronize_rcu(); } static inline void tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry, bool ingress) { ASSERT_RTNL(); if (ingress) rcu_assign_pointer(dev->tcx_ingress, entry); else rcu_assign_pointer(dev->tcx_egress, entry); } static inline struct bpf_mprog_entry * tcx_entry_fetch(struct net_device *dev, bool ingress) { ASSERT_RTNL(); if (ingress) return rcu_dereference_rtnl(dev->tcx_ingress); else return rcu_dereference_rtnl(dev->tcx_egress); } static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void) { struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL); if (tcx) { bpf_mprog_bundle_init(&tcx->bundle); return &tcx->bundle.a; } return NULL; } #define tcx_entry_create(...) alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__)) static inline void tcx_entry_free(struct bpf_mprog_entry *entry) { kfree_rcu(tcx_entry(entry), rcu); } static inline struct bpf_mprog_entry * tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created) { struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress); *created = false; if (!entry) { entry = tcx_entry_create(); if (!entry) return NULL; *created = true; } return entry; } static inline void tcx_skeys_inc(bool ingress) { tcx_inc(); if (ingress) net_inc_ingress_queue(); else net_inc_egress_queue(); } static inline void tcx_skeys_dec(bool ingress) { if (ingress) net_dec_ingress_queue(); else net_dec_egress_queue(); tcx_dec(); } static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); tcx_entry(entry)->miniq_active++; } static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); tcx_entry(entry)->miniq_active--; } static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active; } static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb, int code) { switch (code) { case TCX_PASS: skb->tc_index = qdisc_skb_cb(skb)->tc_classid; fallthrough; case TCX_DROP: case TCX_REDIRECT: return code; case TCX_NEXT: default: return TCX_NEXT; } } #endif /* CONFIG_NET_XGRESS */ #if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL) int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); void tcx_uninstall(struct net_device *dev, bool ingress); int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); static inline void dev_tcx_uninstall(struct net_device *dev) { ASSERT_RTNL(); tcx_uninstall(dev, true); tcx_uninstall(dev, false); } #else static inline int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline void dev_tcx_uninstall(struct net_device *dev) { } #endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */ #endif /* __NET_TCX_H */ |
6 6 6 6 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 * * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * * Robust futex support started by Ingo Molnar * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes. * * PI-futex support started by Ingo Molnar and Thomas Gleixner * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> * Copyright (C) IBM Corporation, 2009 * Thanks to Thomas Gleixner for conceptual design and careful reviews. * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. * * "The futexes are also cursed." * "But they come in a choice of three flavours!" */ #include <linux/compat.h> #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/plist.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they * reside in the same cacheline. */ static struct { struct futex_hash_bucket *queues; unsigned long hashsize; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_queues (__futex_data.queues) #define futex_hashsize (__futex_data.hashsize) /* * Fault injections for futexes. */ #ifdef CONFIG_FAIL_FUTEX static struct { struct fault_attr attr; bool ignore_private; } fail_futex = { .attr = FAULT_ATTR_INITIALIZER, .ignore_private = false, }; static int __init setup_fail_futex(char *str) { return setup_fault_attr(&fail_futex.attr, str); } __setup("fail_futex=", setup_fail_futex); bool should_fail_futex(bool fshared) { if (fail_futex.ignore_private && !fshared) return false; return should_fail(&fail_futex.attr, 1); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_futex_debugfs(void) { umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; dir = fault_create_debugfs_attr("fail_futex", NULL, &fail_futex.attr); if (IS_ERR(dir)) return PTR_ERR(dir); debugfs_create_bool("ignore-private", mode, dir, &fail_futex.ignore_private); return 0; } late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ #endif /* CONFIG_FAIL_FUTEX */ /** * futex_hash - Return the hash bucket in the global hash * @key: Pointer to the futex key for which the hash is calculated * * We hash on the keys returned from get_futex_key (see below) and return the * corresponding hash bucket in the global hash. */ struct futex_hash_bucket *futex_hash(union futex_key *key) { u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); return &futex_queues[hash & (futex_hashsize - 1)]; } /** * futex_setup_timer - set up the sleeping hrtimer. * @time: ptr to the given timeout value * @timeout: the hrtimer_sleeper structure to be set up * @flags: futex flags * @range_ns: optional range in ns * * Return: Initialized hrtimer_sleeper structure or NULL if no timeout * value given */ struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns) { if (!time) return NULL; hrtimer_setup_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC, HRTIMER_MODE_ABS); /* * If range_ns is 0, calling hrtimer_set_expires_range_ns() is * effectively the same as calling hrtimer_set_expires(). */ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); return timeout; } /* * Generate a machine wide unique identifier for this inode. * * This relies on u64 not wrapping in the life-time of the machine; which with * 1ns resolution means almost 585 years. * * This further relies on the fact that a well formed program will not unmap * the file while it has a (shared) futex waiting on it. This mapping will have * a file reference which pins the mount and inode. * * If for some reason an inode gets evicted and read back in again, it will get * a new sequence number and will _NOT_ match, even though it is the exact same * file. * * It is important that futex_match() will never have a false-positive, esp. * for PI futexes that can mess up the state. The above argues that false-negatives * are only possible for malformed programs. */ static u64 get_inode_sequence_number(struct inode *inode) { static atomic64_t i_seq; u64 old; /* Does the inode already have a sequence number? */ old = atomic64_read(&inode->i_sequence); if (likely(old)) return old; for (;;) { u64 new = atomic64_inc_return(&i_seq); if (WARN_ON_ONCE(!new)) continue; old = 0; if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new)) return old; return new; } } /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) * * Return: a negative error code or 0 * * The key words are stored in @key on success. * * For shared mappings (when @fshared), the key is: * * ( inode->i_sequence, page->index, offset_within_page ) * * [ also see get_inode_sequence_number() ] * * For private mappings (or when !@fshared), the key is: * * ( current->mm, address, 0 ) * * This allows (cross process, where applicable) identification of the futex * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct page *page; struct folio *folio; struct address_space *mapping; int err, ro = 0; bool fshared; fshared = flags & FLAGS_SHARED; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; if (unlikely((address % sizeof(u32)) != 0)) return -EINVAL; address -= key->both.offset; if (unlikely(!access_ok(uaddr, sizeof(u32)))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs * virtual address, we dont even have to find the underlying vma. * Note : We do have to check 'uaddr' is a valid user address, * but access_ok() should be faster than find_vma() */ if (!fshared) { /* * On no-MMU, shared futexes are treated as private, therefore * we must not include the current process in the key. Since * there is only one address space, the address is a unique key * on its own. */ if (IS_ENABLED(CONFIG_MMU)) key->private.mm = mm; else key->private.mm = NULL; key->private.address = address; return 0; } again: /* Ignore any VERIFY_READ mapping (futex common case) */ if (unlikely(should_fail_futex(true))) return -EFAULT; err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } if (err < 0) return err; else err = 0; /* * The treatment of mapping from this point on is critical. The folio * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * * Strictly speaking the folio lock is not needed in all cases being * considered here and folio lock forces unnecessarily serialization. * From this point on, mapping will be re-verified if necessary and * folio lock will be acquired only if it is unavoidable * * Mapping checks require the folio so it is looked up now. For * anonymous pages, it does not matter if the folio is split * in the future as the key is based on the address. For * filesystem-backed pages, the precise page is required as the * index of the page determines the key. */ folio = page_folio(page); mapping = READ_ONCE(folio->mapping); /* * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for folio->mapping. */ if (unlikely(!mapping)) { int shmem_swizzled; /* * Folio lock is required to identify which special case above * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ folio_lock(folio); shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; folio_unlock(folio); folio_put(folio); if (shmem_swizzled) goto again; return -EFAULT; } /* * Private mappings are handled in a simple way. * * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. */ if (unlikely(should_fail_futex(true)) || ro) { err = -EFAULT; goto out; } key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { struct inode *inode; /* * The associated futex object in this case is the inode and * the folio->mapping must be traversed. Ordinarily this should * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. If the mapping still matches expectations then the * mapping->host can be safely accessed as being a valid inode. */ rcu_read_lock(); if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); folio_put(folio); goto again; } inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = page_pgoff(folio, page); rcu_read_unlock(); } out: folio_put(folio); return err; } /** * fault_in_user_writeable() - Fault in user address and verify RW access * @uaddr: pointer to faulting user space address * * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. */ int fault_in_user_writeable(u32 __user *uaddr) { struct mm_struct *mm = current->mm; int ret; mmap_read_lock(mm); ret = fixup_user_fault(mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); mmap_read_unlock(mm); return ret < 0 ? ret : 0; } /** * futex_top_waiter() - Return the highest priority waiter on a futex * @hb: the hash bucket the futex_q's reside in * @key: the futex key (to distinguish it from other futex futex_q's) * * Must be called with the hb lock held. */ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) { struct futex_q *this; plist_for_each_entry(this, &hb->chain, list) { if (futex_match(&this->key, key)) return this; } return NULL; } /** * wait_for_owner_exiting - Block until the owner has exited * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. */ void wait_for_owner_exiting(int ret, struct task_struct *exiting) { if (ret != -EBUSY) { WARN_ON_ONCE(exiting); return; } if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) return; mutex_lock(&exiting->futex_exit_mutex); /* * No point in doing state checking here. If the waiter got here * while the task was in exec()->exec_futex_release() then it can * have any FUTEX_STATE_* value when the waiter has acquired the * mutex. OK, if running, EXITING or DEAD if it reached exit() * already. Highly unlikely and not a problem. Just one more round * through the futex maze. */ mutex_unlock(&exiting->futex_exit_mutex); put_task_struct(exiting); } /** * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be NULL and must be held by the caller. */ void __futex_unqueue(struct futex_q *q) { struct futex_hash_bucket *hb; if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) return; lockdep_assert_held(q->lock_ptr); hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); futex_hb_waiters_dec(hb); } /* The key must be already stored in q->key. */ struct futex_hash_bucket *futex_q_lock(struct futex_q *q) __acquires(&hb->lock) { struct futex_hash_bucket *hb; hb = futex_hash(&q->key); /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is * waiting for the spinlock. This is safe as all futex_q_lock() * users end up calling futex_queue(). Similarly, for housekeeping, * decrement the counter at futex_q_unlock() when some error has * occurred and we don't end up adding the task to the list. */ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ q->lock_ptr = &hb->lock; spin_lock(&hb->lock); return hb; } void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); futex_hb_waiters_dec(hb); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; /* * The priority used to register this element is * - either the real thread-priority for the real-time threads * (i.e. threads with a priority lower than MAX_RT_PRIO) * - or MAX_RT_PRIO for non-RT threads. * Thus, all RT-threads are woken first in priority order, and * the others are woken last, in FIFO order. */ prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = current; } /** * futex_unqueue() - Remove the futex_q from its futex_hash_bucket * @q: The futex_q to unqueue * * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must * be paired with exactly one earlier call to futex_queue(). * * Return: * - 1 - if the futex_q was still queued (and we removed unqueued it); * - 0 - if the futex_q was already removed by the waking thread */ int futex_unqueue(struct futex_q *q) { spinlock_t *lock_ptr; int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ retry: /* * q->lock_ptr can change between this read and the following spin_lock. * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and * optimizing lock_ptr out of the logic below. */ lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* * q->lock_ptr can change between reading it and * spin_lock(), causing us to take the wrong lock. This * corrects the race condition. * * Reasoning goes like this: if we have the wrong lock, * q->lock_ptr must have changed (maybe several times) * between reading it and the spin_lock(). It can * change again after the spin_lock() but only if it was * already changed before the spin_lock(). It cannot, * however, change back to the original value. Therefore * we can detect whether we acquired the correct lock. */ if (unlikely(lock_ptr != q->lock_ptr)) { spin_unlock(lock_ptr); goto retry; } __futex_unqueue(q); BUG_ON(q->pi_state); spin_unlock(lock_ptr); ret = 1; } return ret; } /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. */ void futex_unqueue_pi(struct futex_q *q) { /* * If the lock was not acquired (due to timeout or signal) then the * rt_waiter is removed before futex_q is. If this is observed by * an unlocker after dropping the rtmutex wait lock and before * acquiring the hash bucket lock, then the unlocker dequeues the * futex_q from the hash bucket list to guarantee consistent state * vs. userspace. Therefore the dequeue here must be conditional. */ if (!plist_node_empty(&q->list)) __futex_unqueue(q); BUG_ON(!q->pi_state); put_pi_state(q->pi_state); q->pi_state = NULL; } /* Constants for the pending_op argument of handle_futex_death */ #define HANDLE_DEATH_PENDING true #define HANDLE_DEATH_LIST false /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; pid_t owner; int err; /* Futex address must be 32bit aligned */ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) return -1; retry: if (get_user(uval, uaddr)) return -1; /* * Special case for regular (non PI) futexes. The unlock path in * user space has two race scenarios: * * 1. The unlock path releases the user space futex value and * before it can execute the futex() syscall to wake up * waiters it is killed. * * 2. A woken up waiter is killed before it can acquire the * futex in user space. * * In the second case, the wake up notification could be generated * by the unlock path in user space after setting the futex value * to zero or by the kernel after setting the OWNER_DIED bit below. * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. * * In both cases the following conditions are met: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and * trying to set the OWNER_DIED bit. If the futex value is zero, * the rest of the user space mutex state is consistent, so a woken * waiter will just take over the uncontended futex. Setting the * OWNER_DIED bit would create inconsistent state and malfunction * of the user space owner died handling. Otherwise, the OWNER_DIED * bit is already set, and the woken waiter is expected to deal with * this. */ owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); return 0; } if (owner != task_pid_vnr(curr)) return 0; /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically * via cmpxchg, and if the value had FUTEX_WAITERS * set, wake up a waiter (if any). (We have to do a * futex_wake() even if OWNER_DIED is already set - * to handle the rare but possible case of recursive * thread-death.) The rest of the cleanup is done in * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; /* * We are not holding a lock here, but we want to have * the pagefault_disable/enable() protection because * we want to handle the fault gracefully. If the * access fails we try to fault in the futex with R/W * verification via get_user_pages. get_user() above * does not guarantee R/W access. If that fails we * give up and leave the futex locked. */ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { switch (err) { case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; case -EAGAIN: cond_resched(); goto retry; default: WARN_ON_ONCE(1); return err; } } if (nval != uval) goto retry; /* * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) { futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, unsigned int *pi) { unsigned long uentry; if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; *entry = (void __user *)(uentry & ~1UL); *pi = uentry & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; unsigned long futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); /* * A pending lock might already be on the list, so * don't process it twice: */ if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { handle_futex_death((void __user *)pending + futex_offset, curr, pip, HANDLE_DEATH_PENDING); } } #ifdef CONFIG_COMPAT static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); void __user *uaddr = compat_ptr(base + futex_offset); return uaddr; } /* * Fetch a robust-list pointer. Bit 0 signals PI futexes: */ static inline int compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; *entry = compat_ptr((*uentry) & ~1); *pi = (unsigned int)(*uentry) & 1; return 0; } /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. */ static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: */ if (get_user(futex_offset, &head->futex_offset)) return; /* * Fetch any possibly pending lock-add first, and handle it * if it exists: */ if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; next_entry = NULL; /* avoid warning with gcc */ while (entry != (struct robust_list __user *) &head->list) { /* * Fetch the next entry in the list before calling * handle_futex_death: */ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: */ if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); if (handle_futex_death(uaddr, curr, pi, HANDLE_DEATH_LIST)) return; } if (rc) return; uentry = next_uentry; entry = next_entry; pi = next_pi; /* * Avoid excessively long or circular lists: */ if (!--limit) break; cond_resched(); } if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } #endif #ifdef CONFIG_FUTEX_PI /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = futex_hash(&key); /* * We can race against put_pi_state() removing itself from the * list (a waiter going away). put_pi_state() will first * decrement the reference count and then modify the list, so * its possible to see the list entry but fail this reference * acquire. * * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. */ if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); continue; } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: */ if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } WARN_ON(pi_state->owner != curr); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); } #else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif static void futex_cleanup(struct task_struct *tsk) { if (unlikely(tsk->robust_list)) { exit_robust_list(tsk); tsk->robust_list = NULL; } #ifdef CONFIG_COMPAT if (unlikely(tsk->compat_robust_list)) { compat_exit_robust_list(tsk); tsk->compat_robust_list = NULL; } #endif if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); } /** * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD * @tsk: task to set the state on * * Set the futex exit state of the task lockless. The futex waiter code * observes that state when a task is exiting and loops until the task has * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can * take it over. If not, the problem is pushed back to user space. If the * futex exit code did not run yet, then an already queued waiter might * block forever, but there is nothing which can be done about that. */ void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ if (tsk->futex_state == FUTEX_STATE_EXITING) mutex_unlock(&tsk->futex_exit_mutex); tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) { /* * Prevent various race issues against a concurrent incoming waiter * including live locks by forcing the waiter to block on * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ mutex_lock(&tsk->futex_exit_mutex); /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. * * This ensures that all subsequent checks of tsk->futex_state in * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with * tsk->pi_lock held. * * It guarantees also that a pi_state which was queued right before * the state change under tsk->pi_lock by a concurrent waiter must * be observed in exit_pi_state_list(). */ raw_spin_lock_irq(&tsk->pi_lock); tsk->futex_state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); } static void futex_cleanup_end(struct task_struct *tsk, int state) { /* * Lockless store. The only side effect is that an observer might * take another loop until it becomes visible. */ tsk->futex_state = state; /* * Drop the exit protection. This unblocks waiters which observed * FUTEX_STATE_EXITING to reevaluate the state. */ mutex_unlock(&tsk->futex_exit_mutex); } void futex_exec_release(struct task_struct *tsk) { /* * The state handling is done for consistency, but in the case of * exec() there is no way to prevent further damage as the PID stays * the same. But for the unlikely and arguably buggy case that a * futex is held on exec(), this provides at least as much state * consistency protection which is possible. */ futex_cleanup_begin(tsk); futex_cleanup(tsk); /* * Reset the state to FUTEX_STATE_OK. The task is alive and about * exec a new binary. */ futex_cleanup_end(tsk, FUTEX_STATE_OK); } void futex_exit_release(struct task_struct *tsk) { futex_cleanup_begin(tsk); futex_cleanup(tsk); futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } static int __init futex_init(void) { unsigned int futex_shift; unsigned long i; #ifdef CONFIG_BASE_SMALL futex_hashsize = 16; #else futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), futex_hashsize, 0, 0, &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; for (i = 0; i < futex_hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } return 0; } core_initcall(futex_init); |
2 1 1 2 2 1 2 1 1 1 1 1 1 3 3 1 1 1 1 1 3 2 1 2 1 5 1 1 3 3 4 3 1 1 1 1 1 4 1 1 2 2 2 2 2 2 5 1 1 1 2 1 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 | // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> #include <linux/bpf-netns.h> #include <linux/filter.h> #include <net/net_namespace.h> /* * Functions to manage BPF programs attached to netns */ struct bpf_netns_link { struct bpf_link link; enum bpf_attach_type type; enum netns_bpf_attach_type netns_type; /* We don't hold a ref to net in order to auto-detach the link * when netns is going away. Instead we rely on pernet * pre_exit callback to clear this pointer. Must be accessed * with netns_bpf_mutex held. */ struct net *net; struct list_head node; /* node in list of links attached to net */ }; /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type) { switch (type) { #ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_dec(&bpf_sk_lookup_enabled); break; #endif default: break; } } static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type) { switch (type) { #ifdef CONFIG_INET case NETNS_BPF_SK_LOOKUP: static_branch_inc(&bpf_sk_lookup_enabled); break; #endif default: break; } } /* Must be called with netns_bpf_mutex held. */ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) { struct bpf_prog_array *run_array; run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); } static int link_index(struct net *net, enum netns_bpf_attach_type type, struct bpf_netns_link *link) { struct bpf_netns_link *pos; int i = 0; list_for_each_entry(pos, &net->bpf.links[type], node) { if (pos == link) return i; i++; } return -ENOENT; } static int link_count(struct net *net, enum netns_bpf_attach_type type) { struct list_head *pos; int i = 0; list_for_each(pos, &net->bpf.links[type]) i++; return i; } static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type, struct bpf_prog_array *prog_array) { struct bpf_netns_link *pos; unsigned int i = 0; list_for_each_entry(pos, &net->bpf.links[type], node) { prog_array->items[i].prog = pos->link.prog; i++; } } static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *old_array, *new_array; struct net *net; int cnt, idx; mutex_lock(&netns_bpf_mutex); /* We can race with cleanup_net, but if we see a non-NULL * struct net pointer, pre_exit has not run yet and wait for * netns_bpf_mutex. */ net = net_link->net; if (!net) goto out_unlock; /* Mark attach point as unused */ netns_bpf_attach_type_unneed(type); /* Remember link position in case of safe delete */ idx = link_index(net, type, net_link); list_del(&net_link->node); cnt = link_count(net, type); if (!cnt) { netns_bpf_run_array_detach(net, type); goto out_unlock; } old_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL); if (!new_array) { WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx)); goto out_unlock; } fill_prog_array(net, type, new_array); rcu_assign_pointer(net->bpf.run_array[type], new_array); bpf_prog_array_free(old_array); out_unlock: net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } static int bpf_netns_link_detach(struct bpf_link *link) { bpf_netns_link_release(link); return 0; } static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); kfree(net_link); } static int bpf_netns_link_update_prog(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; struct bpf_prog_array *run_array; struct net *net; int idx, ret; if (old_prog && old_prog != link->prog) return -EPERM; if (new_prog->type != link->prog->type) return -EINVAL; mutex_lock(&netns_bpf_mutex); net = net_link->net; if (!net || !check_net(net)) { /* Link auto-detached or netns dying */ ret = -ENOLINK; goto out_unlock; } run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); idx = link_index(net, type, net_link); ret = bpf_prog_array_update_at(run_array, idx, new_prog); if (ret) goto out_unlock; old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; } static int bpf_netns_link_fill_info(const struct bpf_link *link, struct bpf_link_info *info) { const struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); unsigned int inum = 0; struct net *net; mutex_lock(&netns_bpf_mutex); net = net_link->net; if (net && check_net(net)) inum = net->ns.inum; mutex_unlock(&netns_bpf_mutex); info->netns.netns_ino = inum; info->netns.attach_type = net_link->type; return 0; } static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_link_info info = {}; bpf_netns_link_fill_info(link, &info); seq_printf(seq, "netns_ino:\t%u\n" "attach_type:\t%u\n", info.netns.netns_ino, info.netns.attach_type); } static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, }; /* Must be called with netns_bpf_mutex held. */ static int __netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct net *net, enum netns_bpf_attach_type type) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); struct bpf_prog_array *run_array; u32 prog_cnt = 0, flags = 0; run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); if (run_array) prog_cnt = bpf_prog_array_length(run_array); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) return -EFAULT; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) return 0; return bpf_prog_array_copy_to_user(run_array, prog_ids, attr->query.prog_cnt); } int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { enum netns_bpf_attach_type type; struct net *net; int ret; if (attr->query.query_flags) return -EINVAL; type = to_netns_bpf_attach_type(attr->query.attach_type); if (type < 0) return -EINVAL; net = get_net_ns_by_fd(attr->query.target_fd); if (IS_ERR(net)) return PTR_ERR(net); mutex_lock(&netns_bpf_mutex); ret = __netns_bpf_prog_query(attr, uattr, net, type); mutex_unlock(&netns_bpf_mutex); put_net(net); return ret; } int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_prog_array *run_array; enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; int ret; if (attr->target_fd || attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; net = current->nsproxy->net_ns; mutex_lock(&netns_bpf_mutex); /* Attaching prog directly is not compatible with links */ if (!list_empty(&net->bpf.links[type])) { ret = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: ret = flow_dissector_bpf_prog_attach_check(net, prog); break; default: ret = -EINVAL; break; } if (ret) goto out_unlock; attached = net->bpf.progs[type]; if (attached == prog) { /* The same program cannot be attached twice */ ret = -EINVAL; goto out_unlock; } run_array = rcu_dereference_protected(net->bpf.run_array[type], lockdep_is_held(&netns_bpf_mutex)); if (run_array) { WRITE_ONCE(run_array->items[0].prog, prog); } else { run_array = bpf_prog_array_alloc(1, GFP_KERNEL); if (!run_array) { ret = -ENOMEM; goto out_unlock; } run_array->items[0].prog = prog; rcu_assign_pointer(net->bpf.run_array[type], run_array); } net->bpf.progs[type] = prog; if (attached) bpf_prog_put(attached); out_unlock: mutex_unlock(&netns_bpf_mutex); return ret; } /* Must be called with netns_bpf_mutex held. */ static int __netns_bpf_prog_detach(struct net *net, enum netns_bpf_attach_type type, struct bpf_prog *old) { struct bpf_prog *attached; /* Progs attached via links cannot be detached */ if (!list_empty(&net->bpf.links[type])) return -EINVAL; attached = net->bpf.progs[type]; if (!attached || attached != old) return -ENOENT; netns_bpf_run_array_detach(net, type); net->bpf.progs[type] = NULL; bpf_prog_put(attached); return 0; } int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; struct bpf_prog *prog; int ret; if (attr->target_fd) return -EINVAL; type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); mutex_lock(&netns_bpf_mutex); ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); bpf_prog_put(prog); return ret; } static int netns_bpf_max_progs(enum netns_bpf_attach_type type) { switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; case NETNS_BPF_SK_LOOKUP: return 64; default: return 0; } } static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; int cnt, err; mutex_lock(&netns_bpf_mutex); cnt = link_count(net, type); if (cnt >= netns_bpf_max_progs(type)) { err = -E2BIG; goto out_unlock; } /* Links are not compatible with attaching prog directly */ if (net->bpf.progs[type]) { err = -EEXIST; goto out_unlock; } switch (type) { case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; case NETNS_BPF_SK_LOOKUP: err = 0; /* nothing to check */ break; default: err = -EINVAL; break; } if (err) goto out_unlock; run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL); if (!run_array) { err = -ENOMEM; goto out_unlock; } list_add_tail(&net_link->node, &net->bpf.links[type]); fill_prog_array(net, type, run_array); run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array, lockdep_is_held(&netns_bpf_mutex)); bpf_prog_array_free(run_array); /* Mark attach point as used */ netns_bpf_attach_type_need(type); out_unlock: mutex_unlock(&netns_bpf_mutex); return err; } int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) { enum netns_bpf_attach_type netns_type; struct bpf_link_primer link_primer; struct bpf_netns_link *net_link; enum bpf_attach_type type; struct net *net; int err; if (attr->link_create.flags) return -EINVAL; type = attr->link_create.attach_type; netns_type = to_netns_bpf_attach_type(type); if (netns_type < 0) return -EINVAL; net = get_net_ns_by_fd(attr->link_create.target_fd); if (IS_ERR(net)) return PTR_ERR(net); net_link = kzalloc(sizeof(*net_link), GFP_USER); if (!net_link) { err = -ENOMEM; goto out_put_net; } bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, &bpf_netns_link_ops, prog); net_link->net = net; net_link->type = type; net_link->netns_type = netns_type; err = bpf_link_prime(&net_link->link, &link_primer); if (err) { kfree(net_link); goto out_put_net; } err = netns_bpf_link_attach(net, &net_link->link, netns_type); if (err) { bpf_link_cleanup(&link_primer); goto out_put_net; } put_net(net); return bpf_link_settle(&link_primer); out_put_net: put_net(net); return err; } static int __net_init netns_bpf_pernet_init(struct net *net) { int type; for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) INIT_LIST_HEAD(&net->bpf.links[type]); return 0; } static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; struct bpf_netns_link *net_link; mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); list_for_each_entry(net_link, &net->bpf.links[type], node) { net_link->net = NULL; /* auto-detach link */ netns_bpf_attach_type_unneed(type); } if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { .init = netns_bpf_pernet_init, .pre_exit = netns_bpf_pernet_pre_exit, }; static int __init netns_bpf_init(void) { return register_pernet_subsys(&netns_bpf_pernet_ops); } subsys_initcall(netns_bpf_init); |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | // SPDX-License-Identifier: GPL-2.0-only /* * Handshake request lifetime events * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2023, Oracle and/or its affiliates. */ #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet.h> #include <linux/rhashtable.h> #include <net/sock.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <kunit/visibility.h> #include <uapi/linux/handshake.h> #include "handshake.h" #include <trace/events/handshake.h> /* * We need both a handshake_req -> sock mapping, and a sock -> * handshake_req mapping. Both are one-to-one. * * To avoid adding another pointer field to struct sock, net/handshake * maintains a hash table, indexed by the memory address of @sock, to * find the struct handshake_req outstanding for that socket. The * reverse direction uses a simple pointer field in the handshake_req * struct. */ static struct rhashtable handshake_rhashtbl ____cacheline_aligned_in_smp; static const struct rhashtable_params handshake_rhash_params = { .key_len = sizeof_field(struct handshake_req, hr_sk), .key_offset = offsetof(struct handshake_req, hr_sk), .head_offset = offsetof(struct handshake_req, hr_rhash), .automatic_shrinking = true, }; int handshake_req_hash_init(void) { return rhashtable_init(&handshake_rhashtbl, &handshake_rhash_params); } void handshake_req_hash_destroy(void) { rhashtable_destroy(&handshake_rhashtbl); } struct handshake_req *handshake_req_hash_lookup(struct sock *sk) { return rhashtable_lookup_fast(&handshake_rhashtbl, &sk, handshake_rhash_params); } EXPORT_SYMBOL_IF_KUNIT(handshake_req_hash_lookup); static bool handshake_req_hash_add(struct handshake_req *req) { int ret; ret = rhashtable_lookup_insert_fast(&handshake_rhashtbl, &req->hr_rhash, handshake_rhash_params); return ret == 0; } static void handshake_req_destroy(struct handshake_req *req) { if (req->hr_proto->hp_destroy) req->hr_proto->hp_destroy(req); rhashtable_remove_fast(&handshake_rhashtbl, &req->hr_rhash, handshake_rhash_params); kfree(req); } static void handshake_sk_destruct(struct sock *sk) { void (*sk_destruct)(struct sock *sk); struct handshake_req *req; req = handshake_req_hash_lookup(sk); if (!req) return; trace_handshake_destruct(sock_net(sk), req, sk); sk_destruct = req->hr_odestruct; handshake_req_destroy(req); if (sk_destruct) sk_destruct(sk); } /** * handshake_req_alloc - Allocate a handshake request * @proto: security protocol * @flags: memory allocation flags * * Returns an initialized handshake_req or NULL. */ struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto, gfp_t flags) { struct handshake_req *req; if (!proto) return NULL; if (proto->hp_handler_class <= HANDSHAKE_HANDLER_CLASS_NONE) return NULL; if (proto->hp_handler_class >= HANDSHAKE_HANDLER_CLASS_MAX) return NULL; if (!proto->hp_accept || !proto->hp_done) return NULL; req = kzalloc(struct_size(req, hr_priv, proto->hp_privsize), flags); if (!req) return NULL; INIT_LIST_HEAD(&req->hr_list); req->hr_proto = proto; return req; } EXPORT_SYMBOL(handshake_req_alloc); /** * handshake_req_private - Get per-handshake private data * @req: handshake arguments * */ void *handshake_req_private(struct handshake_req *req) { return (void *)&req->hr_priv; } EXPORT_SYMBOL(handshake_req_private); static bool __add_pending_locked(struct handshake_net *hn, struct handshake_req *req) { if (WARN_ON_ONCE(!list_empty(&req->hr_list))) return false; hn->hn_pending++; list_add_tail(&req->hr_list, &hn->hn_requests); return true; } static void __remove_pending_locked(struct handshake_net *hn, struct handshake_req *req) { hn->hn_pending--; list_del_init(&req->hr_list); } /* * Returns %true if the request was found on @net's pending list, * otherwise %false. * * If @req was on a pending list, it has not yet been accepted. */ static bool remove_pending(struct handshake_net *hn, struct handshake_req *req) { bool ret = false; spin_lock(&hn->hn_lock); if (!list_empty(&req->hr_list)) { __remove_pending_locked(hn, req); ret = true; } spin_unlock(&hn->hn_lock); return ret; } struct handshake_req *handshake_req_next(struct handshake_net *hn, int class) { struct handshake_req *req, *pos; req = NULL; spin_lock(&hn->hn_lock); list_for_each_entry(pos, &hn->hn_requests, hr_list) { if (pos->hr_proto->hp_handler_class != class) continue; __remove_pending_locked(hn, pos); req = pos; break; } spin_unlock(&hn->hn_lock); return req; } EXPORT_SYMBOL_IF_KUNIT(handshake_req_next); /** * handshake_req_submit - Submit a handshake request * @sock: open socket on which to perform the handshake * @req: handshake arguments * @flags: memory allocation flags * * Return values: * %0: Request queued * %-EINVAL: Invalid argument * %-EBUSY: A handshake is already under way for this socket * %-ESRCH: No handshake agent is available * %-EAGAIN: Too many pending handshake requests * %-ENOMEM: Failed to allocate memory * %-EMSGSIZE: Failed to construct notification message * %-EOPNOTSUPP: Handshake module not initialized * * A zero return value from handshake_req_submit() means that * exactly one subsequent completion callback is guaranteed. * * A negative return value from handshake_req_submit() means that * no completion callback will be done and that @req has been * destroyed. */ int handshake_req_submit(struct socket *sock, struct handshake_req *req, gfp_t flags) { struct handshake_net *hn; struct net *net; int ret; if (!sock || !req || !sock->file) { kfree(req); return -EINVAL; } req->hr_sk = sock->sk; if (!req->hr_sk) { kfree(req); return -EINVAL; } req->hr_odestruct = req->hr_sk->sk_destruct; req->hr_sk->sk_destruct = handshake_sk_destruct; ret = -EOPNOTSUPP; net = sock_net(req->hr_sk); hn = handshake_pernet(net); if (!hn) goto out_err; ret = -EAGAIN; if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max) goto out_err; spin_lock(&hn->hn_lock); ret = -EOPNOTSUPP; if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags)) goto out_unlock; ret = -EBUSY; if (!handshake_req_hash_add(req)) goto out_unlock; if (!__add_pending_locked(hn, req)) goto out_unlock; spin_unlock(&hn->hn_lock); ret = handshake_genl_notify(net, req->hr_proto, flags); if (ret) { trace_handshake_notify_err(net, req, req->hr_sk, ret); if (remove_pending(hn, req)) goto out_err; } /* Prevent socket release while a handshake request is pending */ sock_hold(req->hr_sk); trace_handshake_submit(net, req, req->hr_sk); return 0; out_unlock: spin_unlock(&hn->hn_lock); out_err: trace_handshake_submit_err(net, req, req->hr_sk, ret); handshake_req_destroy(req); return ret; } EXPORT_SYMBOL(handshake_req_submit); void handshake_complete(struct handshake_req *req, unsigned int status, struct genl_info *info) { struct sock *sk = req->hr_sk; struct net *net = sock_net(sk); if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { trace_handshake_complete(net, req, sk, status); req->hr_proto->hp_done(req, status, info); /* Handshake request is no longer pending */ sock_put(sk); } } EXPORT_SYMBOL_IF_KUNIT(handshake_complete); /** * handshake_req_cancel - Cancel an in-progress handshake * @sk: socket on which there is an ongoing handshake * * Request cancellation races with request completion. To determine * who won, callers examine the return value from this function. * * Return values: * %true - Uncompleted handshake request was canceled * %false - Handshake request already completed or not found */ bool handshake_req_cancel(struct sock *sk) { struct handshake_req *req; struct handshake_net *hn; struct net *net; net = sock_net(sk); req = handshake_req_hash_lookup(sk); if (!req) { trace_handshake_cancel_none(net, req, sk); return false; } hn = handshake_pernet(net); if (hn && remove_pending(hn, req)) { /* Request hadn't been accepted */ goto out_true; } if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { /* Request already completed */ trace_handshake_cancel_busy(net, req, sk); return false; } out_true: trace_handshake_cancel(net, req, sk); /* Handshake request is no longer pending */ sock_put(sk); return true; } EXPORT_SYMBOL(handshake_req_cancel); |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mmap #if !defined(_TRACE_MMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_H #include <linux/tracepoint.h> TRACE_EVENT(vm_unmapped_area, TP_PROTO(unsigned long addr, struct vm_unmapped_area_info *info), TP_ARGS(addr, info), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, total_vm) __field(unsigned long, flags) __field(unsigned long, length) __field(unsigned long, low_limit) __field(unsigned long, high_limit) __field(unsigned long, align_mask) __field(unsigned long, align_offset) ), TP_fast_assign( __entry->addr = addr; __entry->total_vm = current->mm->total_vm; __entry->flags = info->flags; __entry->length = info->length; __entry->low_limit = info->low_limit; __entry->high_limit = info->high_limit; __entry->align_mask = info->align_mask; __entry->align_offset = info->align_offset; ), TP_printk("addr=0x%lx err=%ld total_vm=0x%lx flags=0x%lx len=0x%lx lo=0x%lx hi=0x%lx mask=0x%lx ofs=0x%lx", IS_ERR_VALUE(__entry->addr) ? 0 : __entry->addr, IS_ERR_VALUE(__entry->addr) ? __entry->addr : 0, __entry->total_vm, __entry->flags, __entry->length, __entry->low_limit, __entry->high_limit, __entry->align_mask, __entry->align_offset) ); TRACE_EVENT(vma_mas_szero, TP_PROTO(struct maple_tree *mt, unsigned long start, unsigned long end), TP_ARGS(mt, start, end), TP_STRUCT__entry( __field(struct maple_tree *, mt) __field(unsigned long, start) __field(unsigned long, end) ), TP_fast_assign( __entry->mt = mt; __entry->start = start; __entry->end = end; ), TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,", __entry->mt, (unsigned long) __entry->start, (unsigned long) __entry->end ) ); TRACE_EVENT(vma_store, TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma), TP_ARGS(mt, vma), TP_STRUCT__entry( __field(struct maple_tree *, mt) __field(struct vm_area_struct *, vma) __field(unsigned long, vm_start) __field(unsigned long, vm_end) ), TP_fast_assign( __entry->mt = mt; __entry->vma = vma; __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end - 1; ), TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,", __entry->mt, __entry->vma, (unsigned long) __entry->vm_start, (unsigned long) __entry->vm_end ) ); TRACE_EVENT(exit_mmap, TP_PROTO(struct mm_struct *mm), TP_ARGS(mm), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(struct maple_tree *, mt) ), TP_fast_assign( __entry->mm = mm; __entry->mt = &mm->mm_mt; ), TP_printk("mt_mod %p, DESTROY", __entry->mt ) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h> |
27 27 102 25 2 25 25 25 25 1 1 1 1 1 1 1 1 1 8 8 1 25 25 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp tsn mapping array. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Jon Grimm <jgrimm@us.ibm.com> * Karl Knutson <karl@athena.chicago.il.us> * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/bitmap.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static void sctp_tsnmap_update(struct sctp_tsnmap *map); static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off, __u16 len, __u16 *start, __u16 *end); static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size); /* Initialize a block of memory as a tsnmap. */ struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len, __u32 initial_tsn, gfp_t gfp) { if (!map->tsn_map) { map->tsn_map = kzalloc(len>>3, gfp); if (map->tsn_map == NULL) return NULL; map->len = len; } else { bitmap_zero(map->tsn_map, map->len); } /* Keep track of TSNs represented by tsn_map. */ map->base_tsn = initial_tsn; map->cumulative_tsn_ack_point = initial_tsn - 1; map->max_tsn_seen = map->cumulative_tsn_ack_point; map->num_dup_tsns = 0; return map; } void sctp_tsnmap_free(struct sctp_tsnmap *map) { map->len = 0; kfree(map->tsn_map); } /* Test the tracking state of this TSN. * Returns: * 0 if the TSN has not yet been seen * >0 if the TSN has been seen (duplicate) * <0 if the TSN is invalid (too large to track) */ int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn) { u32 gap; /* Check to see if this is an old TSN */ if (TSN_lte(tsn, map->cumulative_tsn_ack_point)) return 1; /* Verify that we can hold this TSN and that it will not * overflow our map */ if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE)) return -1; /* Calculate the index into the mapping arrays. */ gap = tsn - map->base_tsn; /* Check to see if TSN has already been recorded. */ if (gap < map->len && test_bit(gap, map->tsn_map)) return 1; else return 0; } /* Mark this TSN as seen. */ int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn, struct sctp_transport *trans) { u16 gap; if (TSN_lt(tsn, map->base_tsn)) return 0; gap = tsn - map->base_tsn; if (gap >= map->len && !sctp_tsnmap_grow(map, gap + 1)) return -ENOMEM; if (!sctp_tsnmap_has_gap(map) && gap == 0) { /* In this case the map has no gaps and the tsn we are * recording is the next expected tsn. We don't touch * the map but simply bump the values. */ map->max_tsn_seen++; map->cumulative_tsn_ack_point++; if (trans) trans->sack_generation = trans->asoc->peer.sack_generation; map->base_tsn++; } else { /* Either we already have a gap, or about to record a gap, so * have work to do. * * Bump the max. */ if (TSN_lt(map->max_tsn_seen, tsn)) map->max_tsn_seen = tsn; /* Mark the TSN as received. */ set_bit(gap, map->tsn_map); /* Go fixup any internal TSN mapping variables including * cumulative_tsn_ack_point. */ sctp_tsnmap_update(map); } return 0; } /* Initialize a Gap Ack Block iterator from memory being provided. */ static void sctp_tsnmap_iter_init(const struct sctp_tsnmap *map, struct sctp_tsnmap_iter *iter) { /* Only start looking one past the Cumulative TSN Ack Point. */ iter->start = map->cumulative_tsn_ack_point + 1; } /* Get the next Gap Ack Blocks. Returns 0 if there was not another block * to get. */ static int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map, struct sctp_tsnmap_iter *iter, __u16 *start, __u16 *end) { int ended = 0; __u16 start_ = 0, end_ = 0, offset; /* If there are no more gap acks possible, get out fast. */ if (TSN_lte(map->max_tsn_seen, iter->start)) return 0; offset = iter->start - map->base_tsn; sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len, &start_, &end_); /* The Gap Ack Block happens to end at the end of the map. */ if (start_ && !end_) end_ = map->len - 1; /* If we found a Gap Ack Block, return the start and end and * bump the iterator forward. */ if (end_) { /* Fix up the start and end based on the * Cumulative TSN Ack which is always 1 behind base. */ *start = start_ + 1; *end = end_ + 1; /* Move the iterator forward. */ iter->start = map->cumulative_tsn_ack_point + *end + 1; ended = 1; } return ended; } /* Mark this and any lower TSN as seen. */ void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn) { u32 gap; if (TSN_lt(tsn, map->base_tsn)) return; if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE)) return; /* Bump the max. */ if (TSN_lt(map->max_tsn_seen, tsn)) map->max_tsn_seen = tsn; gap = tsn - map->base_tsn + 1; map->base_tsn += gap; map->cumulative_tsn_ack_point += gap; if (gap >= map->len) { /* If our gap is larger then the map size, just * zero out the map. */ bitmap_zero(map->tsn_map, map->len); } else { /* If the gap is smaller than the map size, * shift the map by 'gap' bits and update further. */ bitmap_shift_right(map->tsn_map, map->tsn_map, gap, map->len); sctp_tsnmap_update(map); } } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This private helper function updates the tsnmap buffers and * the Cumulative TSN Ack Point. */ static void sctp_tsnmap_update(struct sctp_tsnmap *map) { u16 len; unsigned long zero_bit; len = map->max_tsn_seen - map->cumulative_tsn_ack_point; zero_bit = find_first_zero_bit(map->tsn_map, len); if (!zero_bit) return; /* The first 0-bit is bit 0. nothing to do */ map->base_tsn += zero_bit; map->cumulative_tsn_ack_point += zero_bit; bitmap_shift_right(map->tsn_map, map->tsn_map, zero_bit, map->len); } /* How many data chunks are we missing from our peer? */ __u16 sctp_tsnmap_pending(struct sctp_tsnmap *map) { __u32 cum_tsn = map->cumulative_tsn_ack_point; __u32 max_tsn = map->max_tsn_seen; __u32 base_tsn = map->base_tsn; __u16 pending_data; u32 gap; pending_data = max_tsn - cum_tsn; gap = max_tsn - base_tsn; if (gap == 0 || gap >= map->len) goto out; pending_data -= bitmap_weight(map->tsn_map, gap + 1); out: return pending_data; } /* This is a private helper for finding Gap Ack Blocks. It searches a * single array for the start and end of a Gap Ack Block. * * The flags "started" and "ended" tell is if we found the beginning * or (respectively) the end of a Gap Ack Block. */ static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off, __u16 len, __u16 *start, __u16 *end) { int i = off; /* Look through the entire array, but break out * early if we have found the end of the Gap Ack Block. */ /* Also, stop looking past the maximum TSN seen. */ /* Look for the start. */ i = find_next_bit(map, len, off); if (i < len) *start = i; /* Look for the end. */ if (*start) { /* We have found the start, let's find the * end. If we find the end, break out. */ i = find_next_zero_bit(map, len, i); if (i < len) *end = i - 1; } } /* Renege that we have seen a TSN. */ void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn) { u32 gap; if (TSN_lt(tsn, map->base_tsn)) return; /* Assert: TSN is in range. */ if (!TSN_lt(tsn, map->base_tsn + map->len)) return; gap = tsn - map->base_tsn; /* Pretend we never saw the TSN. */ clear_bit(gap, map->tsn_map); } /* How many gap ack blocks do we have recorded? */ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map, struct sctp_gap_ack_block *gabs) { struct sctp_tsnmap_iter iter; int ngaps = 0; /* Refresh the gap ack information. */ if (sctp_tsnmap_has_gap(map)) { __u16 start = 0, end = 0; sctp_tsnmap_iter_init(map, &iter); while (sctp_tsnmap_next_gap_ack(map, &iter, &start, &end)) { gabs[ngaps].start = htons(start); gabs[ngaps].end = htons(end); ngaps++; if (ngaps >= SCTP_MAX_GABS) break; } } return ngaps; } static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size) { unsigned long *new; unsigned long inc; u16 len; if (size > SCTP_TSN_MAP_SIZE) return 0; inc = ALIGN((size - map->len), BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT; len = min_t(u16, map->len + inc, SCTP_TSN_MAP_SIZE); new = kzalloc(len>>3, GFP_ATOMIC); if (!new) return 0; bitmap_copy(new, map->tsn_map, map->max_tsn_seen - map->cumulative_tsn_ack_point); kfree(map->tsn_map); map->tsn_map = new; map->len = len; return 1; } |
3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define _BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> #include "blk.h" struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pdata_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_bio_issue_init(struct bio *bio) { bio_issue_init(&bio->bi_issue, bio_sectors(bio)); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. */ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */ |
1 10 10 43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/fs.h> #include <linux/path.h> #include <linux/slab.h> #include <linux/fs_struct.h> #include "internal.h" /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_root(struct fs_struct *fs, const struct path *path) { struct path old_root; path_get(path); spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); old_root = fs->root; fs->root = *path; write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_root.dentry) path_put(&old_root); } /* * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_pwd(struct fs_struct *fs, const struct path *path) { struct path old_pwd; path_get(path); spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_pwd.dentry) path_put(&old_pwd); } static inline int replace_path(struct path *p, const struct path *old, const struct path *new) { if (likely(p->dentry != old->dentry || p->mnt != old->mnt)) return 0; *p = *new; return 1; } void chroot_fs_refs(const struct path *old_root, const struct path *new_root) { struct task_struct *g, *p; struct fs_struct *fs; int count = 0; read_lock(&tasklist_lock); for_each_process_thread(g, p) { task_lock(p); fs = p->fs; if (fs) { int hits = 0; spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); hits += replace_path(&fs->root, old_root, new_root); hits += replace_path(&fs->pwd, old_root, new_root); write_seqcount_end(&fs->seq); while (hits--) { count++; path_get(new_root); } spin_unlock(&fs->lock); } task_unlock(p); } read_unlock(&tasklist_lock); while (count--) path_put(old_root); } void free_fs_struct(struct fs_struct *fs) { path_put(&fs->root); path_put(&fs->pwd); kmem_cache_free(fs_cachep, fs); } void exit_fs(struct task_struct *tsk) { struct fs_struct *fs = tsk->fs; if (fs) { int kill; task_lock(tsk); spin_lock(&fs->lock); tsk->fs = NULL; kill = !--fs->users; spin_unlock(&fs->lock); task_unlock(tsk); if (kill) free_fs_struct(fs); } } struct fs_struct *copy_fs_struct(struct fs_struct *old) { struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { fs->users = 1; fs->in_exec = 0; spin_lock_init(&fs->lock); seqcount_spinlock_init(&fs->seq, &fs->lock); fs->umask = old->umask; spin_lock(&old->lock); fs->root = old->root; path_get(&fs->root); fs->pwd = old->pwd; path_get(&fs->pwd); spin_unlock(&old->lock); } return fs; } int unshare_fs_struct(void) { struct fs_struct *fs = current->fs; struct fs_struct *new_fs = copy_fs_struct(fs); int kill; if (!new_fs) return -ENOMEM; task_lock(current); spin_lock(&fs->lock); kill = !--fs->users; current->fs = new_fs; spin_unlock(&fs->lock); task_unlock(current); if (kill) free_fs_struct(fs); return 0; } EXPORT_SYMBOL_GPL(unshare_fs_struct); int current_umask(void) { return current->fs->umask; } EXPORT_SYMBOL(current_umask); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock), .umask = 0022, }; |
2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 | // SPDX-License-Identifier: GPL-2.0 /* * Functions to sequence PREFLUSH and FUA writes. * * Copyright (C) 2011 Max Planck Institute for Gravitational Physics * Copyright (C) 2011 Tejun Heo <tj@kernel.org> * * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request * properties and hardware capability. * * If a request doesn't have data, only REQ_PREFLUSH makes sense, which * indicates a simple flush request. If there is data, REQ_PREFLUSH indicates * that the device cache should be flushed before the data is executed, and * REQ_FUA means that the data must be on non-volatile media on request * completion. * * If the device doesn't have writeback cache, PREFLUSH and FUA don't make any * difference. The requests are either completed immediately if there's no data * or executed as normal requests otherwise. * * If the device has writeback cache and supports FUA, REQ_PREFLUSH is * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. * * If the device has writeback cache and doesn't support FUA, REQ_PREFLUSH * is translated to PREFLUSH and REQ_FUA to POSTFLUSH. * * The actual execution of flush is double buffered. Whenever a request * needs to execute PRE or POSTFLUSH, it queues at * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a * REQ_OP_FLUSH is issued and the pending_idx is toggled. When the flush * completes, all the requests which were pending are proceeded to the next * step. This allows arbitrary merging of different types of PREFLUSH/FUA * requests. * * Currently, the following conditions are used to determine when to issue * flush. * * C1. At any given time, only one flush shall be in progress. This makes * double buffering sufficient. * * C2. Flush is deferred if any request is executing DATA of its sequence. * This avoids issuing separate POSTFLUSHes for requests which shared * PREFLUSH. * * C3. The second condition is ignored if there is a request which has * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid * starvation in the unlikely case where there are continuous stream of * FUA (without PREFLUSH) requests. * * For devices which support FUA, it isn't clear whether C2 (and thus C3) * is beneficial. * * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice. * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). * * The above peculiarity requires that each PREFLUSH/FUA request has only one * bio attached to it, which is guaranteed as they aren't allowed to be * merged in the usual way. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* PREFLUSH/FUA sequences */ enum { REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ REQ_FSEQ_DONE = (1 << 3), REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH, /* * If flush has been pending longer than the following timeout, * it's issued even if flush_data requests are still in flight. */ FLUSH_PENDING_TIMEOUT = 5 * HZ, }; static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) { return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; } static unsigned int blk_flush_cur_seq(struct request *rq) { return 1 << ffz(rq->flush.seq); } static void blk_flush_restore_request(struct request *rq) { /* * After flush data completion, @rq->bio is %NULL but we need to * complete the bio again. @rq->biotail is guaranteed to equal the * original @rq->bio. Restore it. */ rq->bio = rq->biotail; if (rq->bio) rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; } static void blk_account_io_flush(struct request *rq) { struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); } /** * blk_flush_complete_seq - complete flush sequence * @rq: PREFLUSH/FUA request being sequenced * @fq: flush queue * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) * @error: whether an error occurred * * @rq just completed @seq part of its flush sequence, record the * completion and trigger the next step. * * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) */ static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, unsigned int seq, blk_status_t error) { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; blk_opf_t cmd_flags; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; cmd_flags = rq->cmd_flags; if (likely(!error)) seq = blk_flush_cur_seq(rq); else seq = REQ_FSEQ_DONE; switch (seq) { case REQ_FSEQ_PREFLUSH: case REQ_FSEQ_POSTFLUSH: /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; list_add_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; case REQ_FSEQ_DONE: /* * @rq was previously adjusted by blk_insert_flush() for * flush sequencing and may already have gone through the * flush data request completion path. Restore @rq for * normal completion and end it. */ list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; default: BUG(); } blk_kick_flush(q, fq, cmd_flags); } static enum rq_end_io_ret flush_end_io(struct request *flush_rq, blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } blk_account_io_flush(flush_rq); /* * Flush request has to be marked as IDLE when it is really ended * because its .end_io() is called from timeout code path too for * avoiding use-after-free. */ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; fq->rq_status = BLK_STS_OK; } if (!q->elevator) { flush_rq->tag = BLK_MQ_NO_TAG; } else { blk_mq_put_driver_tag(flush_rq); flush_rq->internal_tag = BLK_MQ_NO_TAG; } running = &fq->flush_queue[fq->flush_running_idx]; BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); /* account completion of the flush request */ fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); list_del_init(&rq->queuelist); blk_flush_complete_seq(rq, fq, seq, error); } spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } bool is_flush_rq(struct request *rq) { return rq->end_io == flush_end_io; } /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked * @fq: flush queue * @flags: cmd_flags of the original request * * Flush related states of @q have changed, consider issuing flush request. * Please read the comment at the top of this file for more info. * * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) * */ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags) { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return; /* C2 and C3 */ if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; /* * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); /* * In case of none scheduler, borrow tag from the first request * since they can't be in flight at the same time. And acquire * the tag's ownership for flush req. * * In case of IO scheduler, flush rq need to borrow scheduler tag * just for cheating put/get driver tag. */ flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; if (!q->elevator) flush_rq->tag = first_rq->tag; else flush_rq->internal_tag = first_rq->internal_tag; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref * and READ flush_rq->end_io */ smp_wmb(); req_ref_set(flush_rq, 1); spin_lock(&q->requeue_lock); list_add_tail(&flush_rq->queuelist, &q->flush_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); if (q->elevator) { WARN_ON(rq->tag < 0); blk_mq_put_driver_tag(rq); } /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; /* * May have been corrupted by rq->rq_next reuse, we need to * re-initialize rq->queuelist before reusing it here. */ INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); blk_mq_sched_restart(hctx); return RQ_END_IO_NONE; } static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; } /* * Insert a PREFLUSH/FUA request into the flush state machine. * Returns true if the request has been consumed by the flush state machine, * or false if the caller should continue to process it. */ bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); bool supports_fua = q->limits.features & BLK_FEAT_FUA; unsigned int policy = 0; /* FLUSH/FUA request must never be merged */ WARN_ON_ONCE(rq->bio != rq->biotail); if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; /* * Check which flushes we need to sequence for this operation. */ if (blk_queue_write_cache(q)) { if (rq->cmd_flags & REQ_PREFLUSH) policy |= REQ_FSEQ_PREFLUSH; if ((rq->cmd_flags & REQ_FUA) && !supports_fua) policy |= REQ_FSEQ_POSTFLUSH; } /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. */ rq->cmd_flags &= ~REQ_PREFLUSH; if (!supports_fua) rq->cmd_flags &= ~REQ_FUA; /* * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any * of those flags, we have to set REQ_SYNC to avoid skewing * the request accounting. */ rq->cmd_flags |= REQ_SYNC; switch (policy) { case 0: /* * An empty flush handed down from a stacking driver may * translate into nothing if the underlying device does not * advertise a write-back cache. In this case, simply * complete the request. */ blk_mq_end_request(rq, 0); return true; case REQ_FSEQ_DATA: /* * If there's data, but no flush is necessary, the request can * be processed directly without going through flush machinery. * Queue for normal execution. */ return false; case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: /* * Initialize the flush fields and completion handler to trigger * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: /* * Mark the request as part of a flush sequence and submit it * for further processing to the flush state machine. */ blk_rq_init_flush(rq); spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); return true; } } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for * * Description: * Issue a flush for the block device in question. */ int blkdev_issue_flush(struct block_device *bdev) { struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH); return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); fq = kzalloc_node(sizeof(*fq), flags, node); if (!fq) goto fail; spin_lock_init(&fq->mq_flush_lock); rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); fq->flush_rq = kzalloc_node(rq_sz, flags, node); if (!fq->flush_rq) goto fail_rq; INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); return fq; fail_rq: kfree(fq); fail: return NULL; } void blk_free_flush_queue(struct blk_flush_queue *fq) { /* bio based request queue hasn't flush queue */ if (!fq) return; kfree(fq->flush_rq); kfree(fq); } /* * Allow driver to set its own lock class to fq->mq_flush_lock for * avoiding lockdep complaint. * * flush_end_io() may be called recursively from some driver, such as * nvme-loop, so lockdep may complain 'possible recursive locking' because * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class * key. We need to assign different lock class for these driver's * fq->mq_flush_lock for avoiding the lockdep warning. * * Use dynamically allocated lock class key for each 'blk_flush_queue' * instance is over-kill, and more worse it introduces horrible boot delay * issue because synchronize_rcu() is implied in lockdep_unregister_key which * is called for each hctx release. SCSI probing may synchronously create and * destroy lots of MQ request_queues for non-existent devices, and some robot * test kernel always enable lockdep option. It is observed that more than half * an hour is taken during SCSI MQ probe with per-fq lock class. */ void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key) { lockdep_set_class(&hctx->fq->mq_flush_lock, key); } EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class); |
2 1 1 3 1 2 2 1 2 1 1 3 2 1 4 4 3 3 3 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct pause_req_info { struct ethnl_req_info base; enum ethtool_mac_stats_src src; }; #define PAUSE_REQINFO(__req_base) \ container_of(__req_base, struct pause_req_info, base) struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pause_reply_data, base) const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), [ETHTOOL_A_PAUSE_STATS_SRC] = NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int pause_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct pause_req_info *req_info = PAUSE_REQINFO(req_base); if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) { if (!(req_base->flags & ETHTOOL_FLAG_STATS)) { NL_SET_ERR_MSG_MOD(extack, "ETHTOOL_FLAG_STATS must be set when requesting a source of stats"); return -EINVAL; } src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]); } req_info->src = src; return 0; } static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); struct pause_reply_data *data = PAUSE_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); data->pausestat.src = src; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; } dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); ethnl_ops_complete(dev); return 0; } static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, u16 padtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, padtype)) return -EMSGSIZE; return 0; } static int pause_put_stats(struct sk_buff *skb, const struct ethtool_pause_stats *pause_stats) { const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src)) return -EMSGSIZE; nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pause_reply_data *data = PAUSE_REPDATA(reply_base); const struct ethtool_pauseparam *pauseparam = &data->pauseparam; if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) || nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) || nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && pause_put_stats(skb, &data->pausestat)) return -EMSGSIZE; return 0; } /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, }; static int ethnl_set_pause_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_pauseparam && ops->set_pauseparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct ethtool_pauseparam params = {}; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_pauseparam(dev, ¶ms); ethnl_update_bool32(¶ms.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(¶ms.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(¶ms.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_pauseparam(dev, ¶ms); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_pause_request_ops = { .request_cmd = ETHTOOL_MSG_PAUSE_GET, .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PAUSE_HEADER, .req_info_size = sizeof(struct pause_req_info), .reply_data_size = sizeof(struct pause_reply_data), .parse_request = pause_parse_request, .prepare_data = pause_prepare_data, .reply_size = pause_reply_size, .fill_reply = pause_fill_reply, .set_validate = ethnl_set_pause_validate, .set = ethnl_set_pause, .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF, }; |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | /* * net/tipc/ib_media.c: Infiniband bearer support for TIPC * * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> * * Based on eth_media.c, which carries the following copyright notice: * * Copyright (c) 2001-2007, Ericsson AB * Copyright (c) 2005-2008, 2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/if_infiniband.h> #include "core.h" #include "bearer.h" #define TIPC_MAX_IB_LINK_WIN 500 /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) { if (str_size < 60) /* 60 = 19 * strlen("xx:") + strlen("xx\0") */ return 1; sprintf(str_buf, "%20phC", a->value); return 0; } /* Convert from media address format to discovery message addr format */ static int tipc_ib_addr2msg(char *msg, struct tipc_media_addr *addr) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); memcpy(msg, addr->value, INFINIBAND_ALEN); return 0; } /* Convert raw InfiniBand address format to media addr format */ static int tipc_ib_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *msg) { memset(addr, 0, sizeof(*addr)); memcpy(addr->value, msg, INFINIBAND_ALEN); addr->media_id = TIPC_MEDIA_TYPE_IB; addr->broadcast = !memcmp(msg, b->bcast_addr.value, INFINIBAND_ALEN); return 0; } /* Convert discovery msg addr format to InfiniBand media addr format */ static int tipc_ib_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { return tipc_ib_raw2addr(b, addr, msg); } /* InfiniBand media registration info */ struct tipc_media ib_media_info = { .send_msg = tipc_l2_send_msg, .enable_media = tipc_enable_l2_media, .disable_media = tipc_disable_l2_media, .addr2str = tipc_ib_addr2str, .addr2msg = tipc_ib_addr2msg, .msg2addr = tipc_ib_msg2addr, .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" }; |
6 6 6 6 6 6 7 1 5 1 1 1 4 4 25 25 1 17 1 1 8 22 21 22 21 1 1 21 21 17 8 2 10 5 6 1 1 12 28 17 11 28 28 26 22 11 12 22 7 6 1 22 22 3 19 1 1 1 1 1 1 2 2 2 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 | // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* af_can.c - Protocol family CAN core module * (used by different CAN protocol modules) * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/stddef.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/uaccess.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/can-ml.h> #include <linux/ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> #include "af_can.h" MODULE_DESCRIPTION("Controller Area Network PF_CAN core"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, " "Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS_NETPROTO(PF_CAN); static int stats_timer __read_mostly = 1; module_param(stats_timer, int, 0444); MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); static struct kmem_cache *rcv_cache __read_mostly; /* table of registered CAN protocols */ static const struct can_proto __rcu *proto_tab[CAN_NPROTO] __read_mostly; static DEFINE_MUTEX(proto_tab_lock); static atomic_t skbcounter = ATOMIC_INIT(0); /* af_can socket functions */ void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); } EXPORT_SYMBOL(can_sock_destruct); static const struct can_proto *can_get_proto(int protocol) { const struct can_proto *cp; rcu_read_lock(); cp = rcu_dereference(proto_tab[protocol]); if (cp && !try_module_get(cp->prot->owner)) cp = NULL; rcu_read_unlock(); return cp; } static inline void can_put_proto(const struct can_proto *cp) { module_put(cp->prot->owner); } static int can_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; const struct can_proto *cp; int err = 0; sock->state = SS_UNCONNECTED; if (protocol < 0 || protocol >= CAN_NPROTO) return -EINVAL; cp = can_get_proto(protocol); #ifdef CONFIG_MODULES if (!cp) { /* try to load protocol module if kernel is modular */ err = request_module("can-proto-%d", protocol); /* In case of error we only print a message but don't * return the error code immediately. Below we will * return -EPROTONOSUPPORT */ if (err) pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n", protocol); cp = can_get_proto(protocol); } #endif /* check for available protocol and correct usage */ if (!cp) return -EPROTONOSUPPORT; if (cp->type != sock->type) { err = -EPROTOTYPE; goto errout; } sock->ops = cp->ops; sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; } sock_init_data(sock, sk); sk->sk_destruct = can_sock_destruct; if (sk->sk_prot->init) err = sk->sk_prot->init(sk); if (err) { /* release sk on errors */ sock_orphan(sk); sock_put(sk); sock->sk = NULL; } errout: can_put_proto(cp); return err; } /* af_can tx path */ /** * can_send - transmit a CAN frame (optional with local loopback) * @skb: pointer to socket buffer with CAN frame in data section * @loop: loopback for listeners on local CAN sockets (recommended default!) * * Due to the loopback this routine must not be called from hardirq context. * * Return: * 0 on success * -ENETDOWN when the selected interface is down * -ENOBUFS on full driver queue (see net_xmit_errno()) * -ENOMEM when local loopback failed at calling skb_clone() * -EPERM when trying to send on a non-CAN interface * -EMSGSIZE CAN frame size is bigger than CAN interface MTU * -EINVAL when the skb->data does not contain a valid CAN frame */ int can_send(struct sk_buff *skb, int loop) { struct sk_buff *newskb = NULL; struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats; int err = -EINVAL; if (can_is_canxl_skb(skb)) { skb->protocol = htons(ETH_P_CANXL); } else if (can_is_can_skb(skb)) { skb->protocol = htons(ETH_P_CAN); } else if (can_is_canfd_skb(skb)) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; skb->protocol = htons(ETH_P_CANFD); /* set CAN FD flag for CAN FD frames by default */ cfd->flags |= CANFD_FDF; } else { goto inval_skb; } /* Make sure the CAN frame can pass the selected CAN netdevice. */ if (unlikely(skb->len > skb->dev->mtu)) { err = -EMSGSIZE; goto inval_skb; } if (unlikely(skb->dev->type != ARPHRD_CAN)) { err = -EPERM; goto inval_skb; } if (unlikely(!(skb->dev->flags & IFF_UP))) { err = -ENETDOWN; goto inval_skb; } skb->ip_summed = CHECKSUM_UNNECESSARY; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); if (loop) { /* local loopback of sent CAN frames */ /* indication for the CAN driver: do loopback */ skb->pkt_type = PACKET_LOOPBACK; /* The reference to the originating sock may be required * by the receiving socket to check whether the frame is * its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS * Therefore we have to ensure that skb->sk remains the * reference to the originating sock by restoring skb->sk * after each skb_clone() or skb_orphan() usage. */ if (!(skb->dev->flags & IFF_ECHO)) { /* If the interface is not capable to do loopback * itself, we do it here. */ newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) { kfree_skb(skb); return -ENOMEM; } can_skb_set_owner(newskb, skb->sk); newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; } } else { /* indication for the CAN driver: no loopback required */ skb->pkt_type = PACKET_HOST; } /* send to netdevice */ err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); if (err) { kfree_skb(newskb); return err; } if (newskb) netif_rx(newskb); /* update statistics */ pkg_stats->tx_frames++; pkg_stats->tx_frames_delta++; return 0; inval_skb: kfree_skb(skb); return err; } EXPORT_SYMBOL(can_send); /* af_can rx path */ static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net, struct net_device *dev) { if (dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); return &can_ml->dev_rcv_lists; } else { return net->can.rx_alldev_list; } } /** * effhash - hash function for 29 bit CAN identifier reduction * @can_id: 29 bit CAN identifier * * Description: * To reduce the linear traversal in one linked list of _single_ EFF CAN * frame subscriptions the 29 bit identifier is mapped to 10 bits. * (see CAN_EFF_RCV_HASH_BITS definition) * * Return: * Hash value from 0x000 - 0x3FF ( enforced by CAN_EFF_RCV_HASH_BITS mask ) */ static unsigned int effhash(canid_t can_id) { unsigned int hash; hash = can_id; hash ^= can_id >> CAN_EFF_RCV_HASH_BITS; hash ^= can_id >> (2 * CAN_EFF_RCV_HASH_BITS); return hash & ((1 << CAN_EFF_RCV_HASH_BITS) - 1); } /** * can_rcv_list_find - determine optimal filterlist inside device filter struct * @can_id: pointer to CAN identifier of a given can_filter * @mask: pointer to CAN mask of a given can_filter * @dev_rcv_lists: pointer to the device filter struct * * Description: * Returns the optimal filterlist to reduce the filter handling in the * receive path. This function is called by service functions that need * to register or unregister a can_filter in the filter lists. * * A filter matches in general, when * * <received_can_id> & mask == can_id & mask * * so every bit set in the mask (even CAN_EFF_FLAG, CAN_RTR_FLAG) describe * relevant bits for the filter. * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error messages (CAN_ERR_FLAG bit set in mask). For error msg * frames there is a special filterlist and a special rx path filter handling. * * Return: * Pointer to optimal filterlist for the given can_id/mask pair. * Consistency checked mask. * Reduced can_id to have a preprocessed filter compare value. */ static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask, struct can_dev_rcv_lists *dev_rcv_lists) { canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */ /* filter for error message frames in extra filterlist */ if (*mask & CAN_ERR_FLAG) { /* clear CAN_ERR_FLAG in filter entry */ *mask &= CAN_ERR_MASK; return &dev_rcv_lists->rx[RX_ERR]; } /* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */ #define CAN_EFF_RTR_FLAGS (CAN_EFF_FLAG | CAN_RTR_FLAG) /* ensure valid values in can_mask for 'SFF only' frame filtering */ if ((*mask & CAN_EFF_FLAG) && !(*can_id & CAN_EFF_FLAG)) *mask &= (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS); /* reduce condition testing at receive time */ *can_id &= *mask; /* inverse can_id/can_mask filter */ if (inv) return &dev_rcv_lists->rx[RX_INV]; /* mask == 0 => no condition testing at receive time */ if (!(*mask)) return &dev_rcv_lists->rx[RX_ALL]; /* extra filterlists for the subscription of a single non-RTR can_id */ if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) && !(*can_id & CAN_RTR_FLAG)) { if (*can_id & CAN_EFF_FLAG) { if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_eff[effhash(*can_id)]; } else { if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS)) return &dev_rcv_lists->rx_sff[*can_id]; } } /* default: filter via can_id/can_mask */ return &dev_rcv_lists->rx[RX_FIL]; } /** * can_rx_register - subscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list) * @can_id: CAN identifier (see description) * @mask: CAN mask (see description) * @func: callback function on filter match * @data: returned parameter for callback function * @ident: string for calling module identification * @sk: socket pointer (might be NULL) * * Description: * Invokes the callback function with the received sk_buff and the given * parameter 'data' on a matching receive filter. A filter matches, when * * <received_can_id> & mask == can_id & mask * * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can * filter for error message frames (CAN_ERR_FLAG bit set in mask). * * The provided pointer to the sk_buff is guaranteed to be valid as long as * the callback function is running. The callback function must *not* free * the given sk_buff while processing it's task. When the given sk_buff is * needed after the end of the callback function it must be cloned inside * the callback function with skb_clone(). * * Return: * 0 on success * -ENOMEM on missing cache mem to create subscription entry * -ENODEV unknown device */ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, char *ident, struct sock *sk) { struct receiver *rcv; struct hlist_head *rcv_list; struct can_dev_rcv_lists *dev_rcv_lists; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; /* insert new receiver (dev,canid,mask) -> (func,data) */ if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev))) return -ENODEV; if (dev && !net_eq(net, dev_net(dev))) return -ENODEV; rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL); if (!rcv) return -ENOMEM; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); rcv->can_id = can_id; rcv->mask = mask; rcv->matches = 0; rcv->func = func; rcv->data = data; rcv->ident = ident; rcv->sk = sk; hlist_add_head_rcu(&rcv->list, rcv_list); dev_rcv_lists->entries++; rcv_lists_stats->rcv_entries++; rcv_lists_stats->rcv_entries_max = max(rcv_lists_stats->rcv_entries_max, rcv_lists_stats->rcv_entries); spin_unlock_bh(&net->can.rcvlists_lock); return 0; } EXPORT_SYMBOL(can_rx_register); /* can_rx_delete_receiver - rcu callback for single receiver entry removal */ static void can_rx_delete_receiver(struct rcu_head *rp) { struct receiver *rcv = container_of(rp, struct receiver, rcu); struct sock *sk = rcv->sk; kmem_cache_free(rcv_cache, rcv); if (sk) sock_put(sk); } /** * can_rx_unregister - unsubscribe CAN frames from a specific interface * @net: the applicable net namespace * @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list) * @can_id: CAN identifier * @mask: CAN mask * @func: callback function on filter match * @data: returned parameter for callback function * * Description: * Removes subscription entry depending on given (subscription) values. */ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data) { struct receiver *rcv = NULL; struct hlist_head *rcv_list; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; struct can_dev_rcv_lists *dev_rcv_lists; if (dev && dev->type != ARPHRD_CAN) return; if (dev && !net_eq(net, dev_net(dev))) return; spin_lock_bh(&net->can.rcvlists_lock); dev_rcv_lists = can_dev_rcv_lists_find(net, dev); rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists); /* Search the receiver list for the item to delete. This should * exist, since no receiver may be unregistered that hasn't * been registered before. */ hlist_for_each_entry_rcu(rcv, rcv_list, list) { if (rcv->can_id == can_id && rcv->mask == mask && rcv->func == func && rcv->data == data) break; } /* Check for bugs in CAN protocol implementations using af_can.c: * 'rcv' will be NULL if no matching list item was found for removal. * As this case may potentially happen when closing a socket while * the notifier for removing the CAN netdev is running we just print * a warning here. */ if (!rcv) { pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n", DNAME(dev), can_id, mask); goto out; } hlist_del_rcu(&rcv->list); dev_rcv_lists->entries--; if (rcv_lists_stats->rcv_entries > 0) rcv_lists_stats->rcv_entries--; out: spin_unlock_bh(&net->can.rcvlists_lock); /* schedule the receiver item for deletion */ if (rcv) { if (rcv->sk) sock_hold(rcv->sk); call_rcu(&rcv->rcu, can_rx_delete_receiver); } } EXPORT_SYMBOL(can_rx_unregister); static inline void deliver(struct sk_buff *skb, struct receiver *rcv) { rcv->func(skb, rcv->data); rcv->matches++; } static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb) { struct receiver *rcv; int matches = 0; struct can_frame *cf = (struct can_frame *)skb->data; canid_t can_id = cf->can_id; if (dev_rcv_lists->entries == 0) return 0; if (can_id & CAN_ERR_FLAG) { /* check for error message frame entries only */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) { if (can_id & rcv->mask) { deliver(skb, rcv); matches++; } } return matches; } /* check for unfiltered entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) { deliver(skb, rcv); matches++; } /* check for can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) { if ((can_id & rcv->mask) == rcv->can_id) { deliver(skb, rcv); matches++; } } /* check for inverted can_id/mask entries */ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) { if ((can_id & rcv->mask) != rcv->can_id) { deliver(skb, rcv); matches++; } } /* check filterlists for single non-RTR can_ids */ if (can_id & CAN_RTR_FLAG) return matches; if (can_id & CAN_EFF_FLAG) { hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) { if (rcv->can_id == can_id) { deliver(skb, rcv); matches++; } } } else { can_id &= CAN_SFF_MASK; hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) { deliver(skb, rcv); matches++; } } return matches; } static void can_receive(struct sk_buff *skb, struct net_device *dev) { struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = dev_net(dev); struct can_pkg_stats *pkg_stats = net->can.pkg_stats; int matches; /* update statistics */ pkg_stats->rx_frames++; pkg_stats->rx_frames_delta++; /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter); rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ matches = can_rcv_filter(net->can.rx_alldev_list, skb); /* find receive list for this device */ dev_rcv_lists = can_dev_rcv_lists_find(net, dev); matches += can_rcv_filter(dev_rcv_lists, skb); rcu_read_unlock(); /* consume the skbuff allocated by the netdevice driver */ consume_skb(skb); if (matches > 0) { pkg_stats->matches++; pkg_stats->matches_delta++; } } static int can_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb(skb); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb(skb); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", dev->type, skb->len); kfree_skb(skb); return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; } /* af_can protocol functions */ /** * can_proto_register - register CAN transport protocol * @cp: pointer to CAN protocol structure * * Return: * 0 on success * -EINVAL invalid (out of range) protocol number * -EBUSY protocol already in use * -ENOBUF if proto_register() fails */ int can_proto_register(const struct can_proto *cp) { int proto = cp->protocol; int err = 0; if (proto < 0 || proto >= CAN_NPROTO) { pr_err("can: protocol number %d out of range\n", proto); return -EINVAL; } err = proto_register(cp->prot, 0); if (err < 0) return err; mutex_lock(&proto_tab_lock); if (rcu_access_pointer(proto_tab[proto])) { pr_err("can: protocol %d already registered\n", proto); err = -EBUSY; } else { RCU_INIT_POINTER(proto_tab[proto], cp); } mutex_unlock(&proto_tab_lock); if (err < 0) proto_unregister(cp->prot); return err; } EXPORT_SYMBOL(can_proto_register); /** * can_proto_unregister - unregister CAN transport protocol * @cp: pointer to CAN protocol structure */ void can_proto_unregister(const struct can_proto *cp) { int proto = cp->protocol; mutex_lock(&proto_tab_lock); BUG_ON(rcu_access_pointer(proto_tab[proto]) != cp); RCU_INIT_POINTER(proto_tab[proto], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); proto_unregister(cp->prot); } EXPORT_SYMBOL(can_proto_unregister); static int can_pernet_init(struct net *net) { spin_lock_init(&net->can.rcvlists_lock); net->can.rx_alldev_list = kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL); if (!net->can.rx_alldev_list) goto out; net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL); if (!net->can.pkg_stats) goto out_free_rx_alldev_list; net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL); if (!net->can.rcv_lists_stats) goto out_free_pkg_stats; if (IS_ENABLED(CONFIG_PROC_FS)) { /* the statistics are updated every second (timer triggered) */ if (stats_timer) { timer_setup(&net->can.stattimer, can_stat_update, 0); mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); } net->can.pkg_stats->jiffies_init = jiffies; can_init_proc(net); } return 0; out_free_pkg_stats: kfree(net->can.pkg_stats); out_free_rx_alldev_list: kfree(net->can.rx_alldev_list); out: return -ENOMEM; } static void can_pernet_exit(struct net *net) { if (IS_ENABLED(CONFIG_PROC_FS)) { can_remove_proc(net); if (stats_timer) del_timer_sync(&net->can.stattimer); } kfree(net->can.rx_alldev_list); kfree(net->can.pkg_stats); kfree(net->can.rcv_lists_stats); } /* af_can module init/exit functions */ static struct packet_type can_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CAN), .func = can_rcv, }; static struct packet_type canfd_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CANFD), .func = canfd_rcv, }; static struct packet_type canxl_packet __read_mostly = { .type = cpu_to_be16(ETH_P_CANXL), .func = canxl_rcv, }; static const struct net_proto_family can_family_ops = { .family = PF_CAN, .create = can_create, .owner = THIS_MODULE, }; static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, }; static __init int can_init(void) { int err; /* check for correct padding to be able to use the structs similarly */ BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || offsetof(struct can_frame, len) != offsetof(struct canxl_frame, flags) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); pr_info("can: controller area network core\n"); rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver), 0, 0, NULL); if (!rcv_cache) return -ENOMEM; err = register_pernet_subsys(&can_pernet_ops); if (err) goto out_pernet; /* protocol register */ err = sock_register(&can_family_ops); if (err) goto out_sock; dev_add_pack(&can_packet); dev_add_pack(&canfd_packet); dev_add_pack(&canxl_packet); return 0; out_sock: unregister_pernet_subsys(&can_pernet_ops); out_pernet: kmem_cache_destroy(rcv_cache); return err; } static __exit void can_exit(void) { /* protocol unregister */ dev_remove_pack(&canxl_packet); dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); sock_unregister(PF_CAN); unregister_pernet_subsys(&can_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(rcv_cache); } module_init(can_init); module_exit(can_exit); |
8956 781 8374 12 19 150 7600 127 16 138 34 34 34 15 19 4 4 4 1 3 4 84 83 84 84 117 115 1 116 117 4 3 4 4 21 8781 1 8531 8582 7655 7657 151 7502 7708 7653 97 8742 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 | // SPDX-License-Identifier: GPL-2.0-only /* * Generic pidhash and scalable, time-bounded PID allocator * * (C) 2002-2003 Nadia Yvette Chambers, IBM * (C) 2004 Nadia Yvette Chambers, Oracle * (C) 2002-2004 Ingo Molnar, Red Hat * * pid-structures are backing objects for tasks sharing a given ID to chain * against. There is very little to them aside from hashing them and * parking tasks using given ID's on a list. * * The hash is always changed with the tasklist_lock write-acquired, * and the hash is only accessed with the tasklist_lock at least * read-acquired, so there's no additional SMP locking needed here. * * We have a list of bitmap pages, which bitmaps represent the PID space. * Allocating and freeing PIDs is completely lockless. The worst-case * allocation scenario when all but one out of 1 million PIDs possible are * allocated already: the scanning of 32 list entries and at most PAGE_SIZE * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). * * Pid namespaces: * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM * Many thanks to Oleg Nesterov for comments and help * */ #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/memblock.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/anon_inodes.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> #include <linux/pidfs.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, { .first = NULL }, }, .level = 0, .numbers = { { .nr = 0, .ns = &init_pid_ns, }, } }; int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; /* * Pseudo filesystems start inode numbering after one. We use Reserved * PIDs as a natural offset. */ static u64 pidfs_ino = RESERVED_PIDS; /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { .ns.count = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, .ns.inum = PROC_PID_INIT_INO, #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif }; EXPORT_SYMBOL_GPL(init_pid_ns); /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). * * If we don't disable interrupts there is a nasty deadlock between * detach_pid()->free_pid() and another cpu that does * spin_lock(&pidmap_lock) followed by an interrupt routine that does * read_lock(&tasklist_lock); * * After we clean up the tasklist_lock and know there are no * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); void put_pid(struct pid *pid) { struct pid_namespace *ns; if (!pid) return; ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } } EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { struct pid *pid = container_of(rhp, struct pid, rcu); put_pid(pid); } void free_pid(struct pid *pid) { /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace * is the reaper wake up the reaper. The reaper * may be sleeping in zap_pid_ns_processes(). */ wake_up_process(ns->child_reaper); break; case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); ns->pid_allocated = 0; break; } idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); } struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; int retval = -ENOMEM; /* * set_tid_size contains the size of the set_tid array. Starting at * the most nested currently active PID namespace it tells alloc_pid() * which PID to set for a process in that most nested PID namespace * up to set_tid_size PID namespaces. It does not have to set the PID * for a process in all nested PID namespaces but set_tid_size must * never be greater than the current ns->level + 1. */ if (set_tid_size > ns->level + 1) return ERR_PTR(-EINVAL); pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); tmp = ns; pid->level = ns->level; for (i = ns->level; i >= 0; i--) { int tid = 0; if (set_tid_size) { tid = set_tid[ns->level - i]; retval = -EINVAL; if (tid < 1 || tid >= pid_max) goto out_free; /* * Also fail if a PID != 1 is requested and * no PID 1 exists. */ if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, tid + 1, GFP_ATOMIC); /* * If ENOSPC is returned it means that the PID is * alreay in use. Return EEXIST in that case. */ if (nr == -ENOSPC) nr = -EEXIST; } else { int pid_min = 1; /* * init really needs pid 1, but after reaching the * maximum wrap back to RESERVED_PIDS */ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) pid_min = RESERVED_PIDS; /* * Store a null pointer so find_pid_ns does not find * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } spin_unlock_irq(&pidmap_lock); idr_preload_end(); if (nr < 0) { retval = (nr == -ENOSPC) ? -EAGAIN : nr; goto out_free; } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; } /* * ENOMEM is not the most obvious choice especially for the case * where the child subreaper has already exited and the pid * namespace denies the creation of any new processes. But ENOMEM * is what we have exposed to userspace for a long time and it is * documented behavior for pid namespaces. So we can't easily * change it even if there were an error code better suited. */ retval = -ENOMEM; get_pid_ns(ns); refcount_set(&pid->count, 1); spin_lock_init(&pid->lock); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); init_waitqueue_head(&pid->wait_pidfd); INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; pid->stashed = NULL; pid->ino = ++pidfs_ino; for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); put_pid_ns(ns); out_free: spin_lock_irq(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); } /* On failure to allocate the first pid, reset the state */ if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); } void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) { return (type == PIDTYPE_PID) ? &task->thread_pid : &task->signal->pids[type]; } /* * attach_pid() must be called with the tasklist_lock write-held. */ void attach_pid(struct task_struct *task, enum pid_type type) { struct pid *pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { struct pid **pid_ptr = task_pid_ptr(task, type); struct pid *pid; int tmp; pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; if (type == PIDTYPE_PID) { WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); wake_up_all(&pid->wait_pidfd); } for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; free_pid(pid); } void detach_pid(struct task_struct *task, enum pid_type type) { __change_pid(task, type, NULL); } void change_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { __change_pid(task, type, pid); attach_pid(task, type); } void exchange_tids(struct task_struct *left, struct task_struct *right) { struct pid *pid1 = left->thread_pid; struct pid *pid2 = right->thread_pid; struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); /* Swap the per task_struct pid */ rcu_assign_pointer(left->thread_pid, pid2); rcu_assign_pointer(right->thread_pid, pid1); /* Swap the cached value */ WRITE_ONCE(left->pid, pid_nr(pid2)); WRITE_ONCE(right->pid, pid_nr(pid1)); } /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { WARN_ON_ONCE(type == PIDTYPE_PID); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } struct task_struct *pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; if (pid) { struct hlist_node *first; first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pid_links[(type)]); } return result; } EXPORT_SYMBOL(pid_task); /* * Must be called under rcu_read_lock(). */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } struct task_struct *find_task_by_vpid(pid_t vnr) { return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct task_struct *find_get_task_by_vpid(pid_t nr) { struct task_struct *task; rcu_read_lock(); task = find_task_by_vpid(nr); if (task) get_task_struct(task); rcu_read_unlock(); return task; } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); pid = get_pid(rcu_dereference(*task_pid_ptr(task, type))); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(get_task_pid); struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; rcu_read_lock(); result = pid_task(pid, type); if (result) get_task_struct(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(get_pid_task); struct pid *find_get_pid(pid_t nr) { struct pid *pid; rcu_read_lock(); pid = get_pid(find_vpid(nr)); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(find_get_pid); pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) { struct upid *upid; pid_t nr = 0; if (pid && ns->level <= pid->level) { upid = &pid->numbers[ns->level]; if (upid->ns == ns) nr = upid->nr; } return nr; } EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns) { pid_t nr = 0; rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; } EXPORT_SYMBOL(__task_pid_nr_ns); struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); } EXPORT_SYMBOL_GPL(task_active_pid_ns); /* * Used by proc to find the first pid that is greater than or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid_ns. */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { return idr_get_next(&ns->idr, &nr); } EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { CLASS(fd, f)(fd); struct pid *pid; if (fd_empty(f)) return ERR_PTR(-EBADF); pid = pidfd_pid(fd_file(f)); if (!IS_ERR(pid)) { get_pid(pid); *flags = fd_file(f)->f_flags; } return pid; } /** * pidfd_get_task() - Get the task associated with a pidfd * * @pidfd: pidfd for which to get the task * @flags: flags associated with this pidfd * * Return the task associated with @pidfd. The function takes a reference on * the returned task. The caller is responsible for releasing that reference. * * Return: On success, the task_struct associated with the pidfd. * On error, a negative errno number will be returned. */ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) { unsigned int f_flags; struct pid *pid; struct task_struct *task; pid = pidfd_get_pid(pidfd, &f_flags); if (IS_ERR(pid)) return ERR_CAST(pid); task = get_pid_task(pid, PIDTYPE_TGID); put_pid(pid); if (!task) return ERR_PTR(-ESRCH); *flags = f_flags; return task; } /** * pidfd_create() - Create a new pid file descriptor. * * @pid: struct pid that the pidfd will reference * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * * Note, that this function can only be called after the fd table has * been unshared to avoid leaking the pidfd to the new process. * * This symbol should not be explicitly exported to loadable modules. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ static int pidfd_create(struct pid *pid, unsigned int flags) { int pidfd; struct file *pidfd_file; pidfd = pidfd_prepare(pid, flags, &pidfd_file); if (pidfd < 0) return pidfd; fd_install(pidfd, pidfd_file); return pidfd; } /** * sys_pidfd_open() - Open new pid file descriptor. * * @pid: pid for which to retrieve a pidfd * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for * the task identified by @pid. Without PIDFD_THREAD flag the target task * must be a thread-group leader. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { int fd; struct pid *p; if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) return -EINVAL; if (pid <= 0) return -EINVAL; p = find_get_pid(pid); if (!p) return -ESRCH; fd = pidfd_create(p, flags); put_pid(p); return fd; } void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * num_possible_cpus())); pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = kmem_cache_create("pid", struct_size_t(struct pid, numbers, 1), __alignof__(struct pid), SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); } static struct file *__pidfd_fget(struct task_struct *task, int fd) { struct file *file; int ret; ret = down_read_killable(&task->signal->exec_update_lock); if (ret) return ERR_PTR(ret); if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) file = fget_task(task, fd); else file = ERR_PTR(-EPERM); up_read(&task->signal->exec_update_lock); if (!file) { /* * It is possible that the target thread is exiting; it can be * either: * 1. before exit_signals(), which gives a real fd * 2. before exit_files() takes the task_lock() gives a real fd * 3. after exit_files() releases task_lock(), ->files is NULL; * this has PF_EXITING, since it was set in exit_signals(), * __pidfd_fget() returns EBADF. * In case 3 we get EBADF, but that really means ESRCH, since * the task is currently exiting and has freed its files * struct, so we fix it up. */ if (task->flags & PF_EXITING) file = ERR_PTR(-ESRCH); else file = ERR_PTR(-EBADF); } return file; } static int pidfd_getfd(struct pid *pid, int fd) { struct task_struct *task; struct file *file; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; file = __pidfd_fget(task, fd); put_task_struct(task); if (IS_ERR(file)) return PTR_ERR(file); ret = receive_fd(file, NULL, O_CLOEXEC); fput(file); return ret; } /** * sys_pidfd_getfd() - Get a file descriptor from another process * * @pidfd: the pidfd file descriptor of the process * @fd: the file descriptor number to get * @flags: flags on how to get the fd (reserved) * * This syscall gets a copy of a file descriptor from another process * based on the pidfd, and file descriptor number. It requires that * the calling process has the ability to ptrace the process represented * by the pidfd. The process which is having its file descriptor copied * is otherwise unaffected. * * Return: On success, a cloexec file descriptor is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, unsigned int, flags) { struct pid *pid; /* flags is currently unused - make sure it's unset */ if (flags) return -EINVAL; CLASS(fd, f)(pidfd); if (fd_empty(f)) return -EBADF; pid = pidfd_pid(fd_file(f)); if (IS_ERR(pid)) return PTR_ERR(pid); return pidfd_getfd(pid, fd); } |
209 20746 642 21000 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_FILE_REF_H #define _LINUX_FILE_REF_H #include <linux/atomic.h> #include <linux/preempt.h> #include <linux/types.h> /* * file_ref is a reference count implementation specifically for use by * files. It takes inspiration from rcuref but differs in key aspects * such as support for SLAB_TYPESAFE_BY_RCU type caches. * * FILE_REF_ONEREF FILE_REF_MAXREF * 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL * <-------------------valid -------------------> * * FILE_REF_SATURATED * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL * <-----------------------saturation zone----------------------> * * FILE_REF_RELEASED FILE_REF_DEAD * 0xC000000000000000UL 0xE000000000000000UL * <-------------------dead zone-------------------> * * FILE_REF_NOREF * 0xFFFFFFFFFFFFFFFFUL */ #ifdef CONFIG_64BIT #define FILE_REF_ONEREF 0x0000000000000000UL #define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL #define FILE_REF_SATURATED 0xA000000000000000UL #define FILE_REF_RELEASED 0xC000000000000000UL #define FILE_REF_DEAD 0xE000000000000000UL #define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL #else #define FILE_REF_ONEREF 0x00000000U #define FILE_REF_MAXREF 0x7FFFFFFFU #define FILE_REF_SATURATED 0xA0000000U #define FILE_REF_RELEASED 0xC0000000U #define FILE_REF_DEAD 0xE0000000U #define FILE_REF_NOREF 0xFFFFFFFFU #endif typedef struct { #ifdef CONFIG_64BIT atomic64_t refcnt; #else atomic_t refcnt; #endif } file_ref_t; /** * file_ref_init - Initialize a file reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) { atomic_long_set(&ref->refcnt, cnt - 1); } bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** * file_ref_get - Acquire one reference on a file * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF. * * Provides full memory ordering. * * Return: False if the attempt to acquire a reference failed. This happens * when the last reference has been put already. True if a reference * was successfully acquired */ static __always_inline __must_check bool file_ref_get(file_ref_t *ref) { /* * Unconditionally increase the reference count with full * ordering. The saturation and dead zones provide enough * tolerance for this. * * If this indicates negative the file in question the fail can * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU. * Hence, unconditionally altering the file reference count to * e.g., reset the file reference count back to the middle of * the deadzone risk end up marking someone else's file as dead * behind their back. * * It would be possible to do a careful: * * cnt = atomic_long_inc_return(); * if (likely(cnt >= 0)) * return true; * * and then something like: * * if (cnt >= FILE_REF_RELEASE) * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD), * * to set the value back to the middle of the deadzone. But it's * practically impossible to go from FILE_REF_DEAD to * FILE_REF_ONEREF. It would need 2305843009213693952/2^61 * file_ref_get()s to resurrect such a dead file. */ return !atomic_long_add_negative(1, &ref->refcnt); } /** * file_ref_inc - Acquire one reference on a file * @ref: Pointer to the reference count * * Acquire an additional reference on a file. Warns if the caller didn't * already hold a reference. */ static __always_inline void file_ref_inc(file_ref_t *ref) { long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt); WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference"); } /** * file_ref_put -- Release a file reference * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores * are done before, and provides an acquire ordering on success such * that free() must come after. * * Return: True if this was the last reference with no future references * possible. This signals the caller that it can safely release * the object which is protected by the reference counter. * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) { long cnt; /* * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put() * calls don't risk UAFs when a file is recyclyed, it is still * vulnerable to UAFs caused by freeing the whole slab page once * it becomes unused. Prevent file_ref_put() from being * preempted protects against this. */ guard(preempt)(); /* * Unconditionally decrease the reference count. The saturation * and dead zones provide enough tolerance for this. If this * fails then we need to handle the last reference drop and * cases inside the saturation and dead zones. */ cnt = atomic_long_dec_return(&ref->refcnt); if (cnt >= 0) return false; return __file_ref_put(ref, cnt); } /** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned long file_ref_read(file_ref_t *ref) { unsigned long c = atomic_long_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= FILE_REF_RELEASED ? 0 : c + 1; } #endif |
1 1 4 589 595 1 4 4 4 5 3 1 4 4 2 2 4 4 1 3 2 1 3 2 2 2 2 2 1 1 3 1 1 1 1 1 1 19 15 1 1 1 1 1 3 1 2 5 4 1 4 1 8 3 5 13 9 3 13 78 62 13 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/fcntl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/sched/task.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/file.h> #include <linux/capability.h> #include <linux/dnotify.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pipe_fs_i.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/memfd.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/rw_hint.h> #include <linux/poll.h> #include <asm/siginfo.h> #include <linux/uaccess.h> #include "internal.h" #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) { struct inode * inode = file_inode(filp); int error = 0; /* * O_APPEND cannot be cleared if the file is marked as append-only * and the file is open for write. */ if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) if (!inode_owner_or_capable(file_mnt_idmap(filp), inode)) return -EPERM; /* required for strict SunOS emulation */ if (O_NONBLOCK != O_NDELAY) if (arg & O_NDELAY) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT) && !(filp->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); if (error) return error; /* * ->fasync() is responsible for setting the FASYNC bit. */ if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) { error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0); if (error < 0) goto out; if (error > 0) error = 0; } spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); filp->f_iocb_flags = iocb_flags(filp); spin_unlock(&filp->f_lock); out: return error; } /* * Allocate an file->f_owner struct if it doesn't exist, handling racing * allocations correctly. */ int file_f_owner_allocate(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) return 0; f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); if (!f_owner) return -ENOMEM; rwlock_init(&f_owner->lock); f_owner->file = file; /* If someone else raced us, drop our allocation. */ if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) kfree(f_owner); return 0; } EXPORT_SYMBOL(file_f_owner_allocate); void file_f_owner_release(struct file *file) { struct fown_struct *f_owner; f_owner = file_f_owner(file); if (f_owner) { put_pid(f_owner->pid); kfree(f_owner); } } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (WARN_ON_ONCE(!f_owner)) return; write_lock_irq(&f_owner->lock); if (force || !f_owner->pid) { put_pid(f_owner->pid); f_owner->pid = get_pid(pid); f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); security_file_set_fowner(filp); f_owner->uid = cred->uid; f_owner->euid = cred->euid; } } write_unlock_irq(&f_owner->lock); } EXPORT_SYMBOL(__f_setown); int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; int ret = 0; might_sleep(); type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ if (who == INT_MIN) return -EINVAL; type = PIDTYPE_PGID; who = -who; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); if (who) { pid = find_vpid(who); if (!pid) ret = -ESRCH; } if (!ret) __f_setown(filp, pid, type, force); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { __f_setown(filp, NULL, PIDTYPE_TGID, 1); } pid_t f_getown(struct file *filp) { pid_t pid = 0; struct fown_struct *f_owner; f_owner = file_f_owner(filp); if (!f_owner) return pid; read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) { pid = pid_vnr(f_owner->pid); if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); read_unlock_irq(&f_owner->lock); return pid; } static int f_setown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner; struct pid *pid; int type; int ret; ret = copy_from_user(&owner, owner_p, sizeof(owner)); if (ret) return -EFAULT; switch (owner.type) { case F_OWNER_TID: type = PIDTYPE_PID; break; case F_OWNER_PID: type = PIDTYPE_TGID; break; case F_OWNER_PGRP: type = PIDTYPE_PGID; break; default: return -EINVAL; } ret = file_f_owner_allocate(filp); if (ret) return ret; rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) ret = -ESRCH; else __f_setown(filp, pid, type, 1); rcu_read_unlock(); return ret; } static int f_getown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; struct fown_struct *f_owner; enum pid_type pid_type = PIDTYPE_PID; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); rcu_read_lock(); if (pid_task(f_owner->pid, f_owner->pid_type)) owner.pid = pid_vnr(f_owner->pid); rcu_read_unlock(); pid_type = f_owner->pid_type; } switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; case PIDTYPE_TGID: owner.type = F_OWNER_PID; break; case PIDTYPE_PGID: owner.type = F_OWNER_PGRP; break; default: WARN_ON(1); ret = -EINVAL; break; } if (f_owner) read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); if (ret) ret = -EFAULT; } return ret; } #ifdef CONFIG_CHECKPOINT_RESTORE static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; uid_t src[2] = {0, 0}; int err; f_owner = file_f_owner(filp); if (f_owner) { read_lock_irq(&f_owner->lock); src[0] = from_kuid(user_ns, f_owner->uid); src[1] = from_kuid(user_ns, f_owner->euid); read_unlock_irq(&f_owner->lock); } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); return err; } #else static int f_getowner_uids(struct file *filp, unsigned long arg) { return -EINVAL; } #endif static bool rw_hint_valid(u64 hint) { BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET); BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE); BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT); BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM); BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG); BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME); switch (hint) { case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: case RWH_WRITE_LIFE_SHORT: case RWH_WRITE_LIFE_MEDIUM: case RWH_WRITE_LIFE_LONG: case RWH_WRITE_LIFE_EXTREME: return true; default: return false; } } static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint = READ_ONCE(inode->i_write_hint); if (copy_to_user(argp, &hint, sizeof(*argp))) return -EFAULT; return 0; } static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; u64 hint; if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; if (copy_from_user(&hint, argp, sizeof(hint))) return -EFAULT; if (!rw_hint_valid(hint)) return -EINVAL; WRITE_ONCE(inode->i_write_hint, hint); /* * file->f_mapping->host may differ from inode. As an example, * blkdev_open() modifies file->f_mapping. */ if (file->f_mapping->host != inode) WRITE_ONCE(file->f_mapping->host->i_write_hint, hint); return 0; } /* Is the file descriptor a dup of the file? */ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); if (fd_empty(f)) return -EBADF; /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. * * Technically we didn't need a ref at all, and 'fdget()' was * overkill, but given our lockless file pointer lookup, the * alternatives are complicated. */ return fd_file(f) == filp; } /* Let the caller figure out whether a given file was just created. */ static long f_created_query(const struct file *filp) { return !!(filp->f_mode & FMODE_CREATED); } static int f_owner_sig(struct file *filp, int signum, bool setsig) { int ret = 0; struct fown_struct *f_owner; might_sleep(); if (setsig) { if (!valid_signal(signum)) return -EINVAL; ret = file_f_owner_allocate(filp); if (ret) return ret; } f_owner = file_f_owner(filp); if (setsig) f_owner->signum = signum; else if (f_owner) ret = f_owner->signum; return ret; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { case F_CREATED_QUERY: err = f_created_query(filp); break; case F_DUPFD: err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: err = f_dupfd(argi, filp, O_CLOEXEC); break; case F_DUPFD_QUERY: err = f_dupfd_query(argi, filp); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_GETLK: #endif case F_GETLK: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_getlk(filp, cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) return -EFAULT; break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ case F_OFD_SETLK: case F_OFD_SETLKW: fallthrough; #endif case F_SETLK: case F_SETLKW: if (copy_from_user(&flock, argp, sizeof(flock))) return -EFAULT; err = fcntl_setlk(fd, filp, cmd, &flock); break; case F_GETOWN: /* * XXX If f_owner is a process group, the * negative return value will get converted * into an error. Oops. If we keep the * current syscall conventions, the only way * to fix this will be in libc. */ err = f_getown(filp); force_successful_syscall_return(); break; case F_SETOWN: err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); break; case F_SETOWN_EX: err = f_setown_ex(filp, arg); break; case F_GETOWNER_UIDS: err = f_getowner_uids(filp, arg); break; case F_GETSIG: err = f_owner_sig(filp, 0, false); break; case F_SETSIG: err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: err = fcntl_get_rw_hint(filp, cmd, arg); break; case F_SET_RW_HINT: err = fcntl_set_rw_hint(filp, cmd, arg); break; default: break; } return err; } static int check_fcntl_cmd(unsigned cmd) { switch (cmd) { case F_CREATED_QUERY: case F_DUPFD: case F_DUPFD_CLOEXEC: case F_DUPFD_QUERY: case F_GETFD: case F_SETFD: case F_GETFL: return 1; } return 0; } SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { CLASS(fd_raw, f)(fd); long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (!err) err = do_fcntl(fd, cmd, arg, fd_file(f)); return err; } #if BITS_PER_LONG == 32 SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { void __user *argp = (void __user *)arg; CLASS(fd_raw, f)(fd); struct flock64 flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK64: case F_OFD_GETLK: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_getlk64(fd_file(f), cmd, &flock); if (!err && copy_to_user(argp, &flock, sizeof(flock))) err = -EFAULT; break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = -EFAULT; if (copy_from_user(&flock, argp, sizeof(flock))) break; err = fcntl_setlk64(fd, fd_file(f), cmd, &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } #endif #ifdef CONFIG_COMPAT /* careful - don't use anywhere else */ #define copy_flock_fields(dst, src) \ (dst)->l_type = (src)->l_type; \ (dst)->l_whence = (src)->l_whence; \ (dst)->l_start = (src)->l_start; \ (dst)->l_len = (src)->l_len; \ (dst)->l_pid = (src)->l_pid; static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl) { struct compat_flock fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl) { struct compat_flock64 fl; if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64))) return -EFAULT; copy_flock_fields(kfl, &fl); return 0; } static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl) { struct compat_flock fl; memset(&fl, 0, sizeof(struct compat_flock)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock))) return -EFAULT; return 0; } static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl) { struct compat_flock64 fl; BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start)); BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len)); memset(&fl, 0, sizeof(struct compat_flock64)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) return -EFAULT; return 0; } #undef copy_flock_fields static unsigned int convert_fcntl_cmd(unsigned int cmd) { switch (cmd) { case F_GETLK64: return F_GETLK; case F_SETLK64: return F_SETLK; case F_SETLKW64: return F_SETLKW; } return cmd; } /* * GETLK was successful and we need to return the data, but it needs to fit in * the compat structure. * l_start shouldn't be too big, unless the original start + end is greater than * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return * -EOVERFLOW in that case. l_len could be too big, in which case we just * truncate it, and only allow the app to see that part of the conflicting lock * that might make sense to it anyway */ static int fixup_compat_flock(struct flock *flock) { if (flock->l_start > COMPAT_OFF_T_MAX) return -EOVERFLOW; if (flock->l_len > COMPAT_OFF_T_MAX) flock->l_len = COMPAT_OFF_T_MAX; return 0; } static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg) { CLASS(fd_raw, f)(fd); struct flock flock; long err; if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) { if (!check_fcntl_cmd(cmd)) return -EBADF; } err = security_file_fcntl(fd_file(f), cmd, arg); if (err) return err; switch (cmd) { case F_GETLK: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (err) break; err = fixup_compat_flock(&flock); if (!err) err = put_compat_flock(&flock, compat_ptr(arg)); break; case F_GETLK64: case F_OFD_GETLK: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock); if (!err) err = put_compat_flock64(&flock, compat_ptr(arg)); break; case F_SETLK: case F_SETLKW: err = get_compat_flock(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: err = get_compat_flock64(&flock, compat_ptr(arg)); if (err) break; err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock); break; default: err = do_fcntl(fd, cmd, arg, fd_file(f)); break; } return err; } COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { return do_compat_fcntl64(fd, cmd, arg); } COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { switch (cmd) { case F_GETLK64: case F_SETLK64: case F_SETLKW64: case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: return -EINVAL; } return do_compat_fcntl64(fd, cmd, arg); } #endif /* Table to convert sigio signal codes into poll band bitmaps */ static const __poll_t band_table[NSIGPOLL] = { EPOLLIN | EPOLLRDNORM, /* POLL_IN */ EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND, /* POLL_OUT */ EPOLLIN | EPOLLRDNORM | EPOLLMSG, /* POLL_MSG */ EPOLLERR, /* POLL_ERR */ EPOLLPRI | EPOLLRDBAND, /* POLL_PRI */ EPOLLHUP | EPOLLERR /* POLL_HUP */ }; static inline int sigio_perm(struct task_struct *p, struct fown_struct *fown, int sig) { const struct cred *cred; int ret; rcu_read_lock(); cred = __task_cred(p); ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) || uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) || uid_eq(fown->uid, cred->suid) || uid_eq(fown->uid, cred->uid)) && !security_file_send_sigiotask(p, fown, sig)); rcu_read_unlock(); return ret; } static void send_sigio_to_task(struct task_struct *p, struct fown_struct *fown, int fd, int reason, enum pid_type type) { /* * F_SETSIG can change ->signum lockless in parallel, make * sure we read it once and use the same value throughout. */ int signum = READ_ONCE(fown->signum); if (!sigio_perm(p, fown, signum)) return; switch (signum) { default: { kernel_siginfo_t si; /* Queue a rt signal with the appropriate fd as its value. We use SI_SIGIO as the source, not SI_KERNEL, since kernel signals always get delivered even if we can't queue. Failure to queue in this case _should_ be reported; we fall back to SIGIO in that case. --sct */ clear_siginfo(&si); si.si_signo = signum; si.si_errno = 0; si.si_code = reason; /* * Posix definies POLL_IN and friends to be signal * specific si_codes for SIG_POLL. Linux extended * these si_codes to other signals in a way that is * ambiguous if other signals also have signal * specific si_codes. In that case use SI_SIGIO instead * to remove the ambiguity. */ if ((signum != SIGPOLL) && sig_specific_sicodes(signum)) si.si_code = SI_SIGIO; /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL)); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else si.si_band = mangle_poll(band_table[reason - POLL_IN]); si.si_fd = fd; if (!do_send_sig_info(signum, &si, p, type)) break; } fallthrough; /* fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); } } void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; enum pid_type type; unsigned long flags; struct pid *pid; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigio_to_task(p, fown, fd, band, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigio_to_task(p, fown, fd, band, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); } static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown, enum pid_type type) { if (sigio_perm(p, fown, SIGURG)) do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } int send_sigurg(struct file *file) { struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; fown = file_f_owner(file); if (!fown) return 0; read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; ret = 1; if (type <= PIDTYPE_TGID) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) send_sigurg_to_task(p, fown, type); rcu_read_unlock(); } else { read_lock(&tasklist_lock); do_each_pid_task(pid, type, p) { send_sigurg_to_task(p, fown, type); } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); } out_unlock_fown: read_unlock_irqrestore(&fown->lock, flags); return ret; } static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __ro_after_init; /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, * do nothing and return 0. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". * */ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; int result = 0; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_file = NULL; write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; } spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return result; } struct fasync_struct *fasync_alloc(void) { return kmem_cache_alloc(fasync_cache, GFP_KERNEL); } /* * NOTE! This can be used only for unused fasync entries: * entries that actually got inserted on the fasync list * need to be released by rcu - see fasync_remove_entry. */ void fasync_free(struct fasync_struct *new) { kmem_cache_free(fasync_cache, new); } /* * Insert a new entry into the fasync list. Return the pointer to the * old one if we didn't use the new one. * * NOTE! It is very important that the FASYNC flag always * match the state "is the filp on a fasync list". */ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new) { struct fasync_struct *fa, **fp; spin_lock(&filp->f_lock); spin_lock(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file != filp) continue; write_lock_irq(&fa->fa_lock); fa->fa_fd = fd; write_unlock_irq(&fa->fa_lock); goto out; } rwlock_init(&new->fa_lock); new->magic = FASYNC_MAGIC; new->fa_file = filp; new->fa_fd = fd; new->fa_next = *fapp; rcu_assign_pointer(*fapp, new); filp->f_flags |= FASYNC; out: spin_unlock(&fasync_lock); spin_unlock(&filp->f_lock); return fa; } /* * Add a fasync entry. Return negative on error, positive if * added, and zero if did nothing but change an existing one. */ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *new; new = fasync_alloc(); if (!new) return -ENOMEM; /* * fasync_insert_entry() returns the old (update) entry if * it existed. * * So free the (unused) new entry and return 0 to let the * caller know that we didn't add any new fasync entries. */ if (fasync_insert_entry(fd, filp, fapp, new)) { fasync_free(new); return 0; } return 1; } /* * fasync_helper() is used by almost all character device drivers * to set up the fasync queue, and for regular files by the file * lease code. It returns negative on error, 0 if it did no changes * and positive if it added/deleted the entry. */ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) { if (!on) return fasync_remove_entry(filp, fapp); return fasync_add_entry(fd, filp, fapp); } EXPORT_SYMBOL(fasync_helper); /* * rcu_read_lock() is held */ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = file_f_owner(fa->fa_file); if (!fown) goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. */ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } void kill_fasync(struct fasync_struct **fp, int sig, int band) { /* First a quick test without locking: usually * the list is empty. */ if (*fp) { rcu_read_lock(); kill_fasync_rcu(rcu_dereference(*fp), sig, band); rcu_read_unlock(); } } EXPORT_SYMBOL(kill_fasync); static int __init fcntl_init(void) { /* * Please add new bits here to ensure allocation uniqueness. * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } module_init(fcntl_init) |
6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal AFS stuff * * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/pagemap.h> #include <linux/rxrpc.h> #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> #include <linux/mm_types.h> #include <linux/dns_resolver.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "afs.h" #include "afs_vl.h" #define AFS_CELL_MAX_ADDRS 15 struct pagevec; struct afs_call; struct afs_vnode; struct afs_server_probe; /* * Partial file-locking emulation mode. (The problem being that AFS3 only * allows whole-file locks and no upgrading/downgrading). */ enum afs_flock_mode { afs_flock_mode_unset, afs_flock_mode_local, /* Local locking only */ afs_flock_mode_openafs, /* Don't get server lock for a partial lock */ afs_flock_mode_strict, /* Always get a server lock for a partial lock */ afs_flock_mode_write, /* Get an exclusive server lock for a partial lock */ }; struct afs_fs_context { bool force; /* T to force cell type */ bool autocell; /* T if set auto mount operation */ bool dyn_root; /* T if dynamic root */ bool no_cell; /* T if the source is "none" (for dynroot) */ enum afs_flock_mode flock_mode; /* Partial file-locking emulation mode */ afs_voltype_t type; /* type of volume requested */ unsigned int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ struct afs_net *net; /* the AFS net namespace stuff */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ struct key *key; /* key to use for secure mounting */ }; enum afs_call_state { AFS_CALL_CL_REQUESTING, /* Client: Request is being sent */ AFS_CALL_CL_AWAIT_REPLY, /* Client: Awaiting reply */ AFS_CALL_CL_PROC_REPLY, /* Client: rxrpc call complete; processing reply */ AFS_CALL_SV_AWAIT_OP_ID, /* Server: Awaiting op ID */ AFS_CALL_SV_AWAIT_REQUEST, /* Server: Awaiting request data */ AFS_CALL_SV_REPLYING, /* Server: Replying */ AFS_CALL_SV_AWAIT_ACK, /* Server: Awaiting final ACK */ AFS_CALL_COMPLETE, /* Completed or failed */ }; /* * Address preferences. */ struct afs_addr_preference { union { struct in_addr ipv4_addr; /* AF_INET address to compare against */ struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */ }; sa_family_t family; /* Which address to use */ u16 prio; /* Priority */ u8 subnet_mask; /* How many bits to compare */ }; struct afs_addr_preference_list { struct rcu_head rcu; u16 version; /* Incremented when prefs list changes */ u8 ipv6_off; /* Offset of IPv6 addresses */ u8 nr; /* Number of addresses in total */ u8 max_prefs; /* Number of prefs allocated */ struct afs_addr_preference prefs[] __counted_by(max_prefs); }; struct afs_address { struct rxrpc_peer *peer; short last_error; /* Last error from this address */ u16 prio; /* Address priority */ }; /* * List of server addresses. */ struct afs_addr_list { struct rcu_head rcu; refcount_t usage; u32 version; /* Version */ unsigned int debug_id; unsigned int addr_pref_version; /* Version of address preference list */ unsigned char max_addrs; unsigned char nr_addrs; unsigned char preferred; /* Preferred address */ unsigned char nr_ipv4; /* Number of IPv4 addresses */ enum dns_record_source source:8; enum dns_lookup_status status:8; unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */ unsigned long responded; /* Mask of addrs that responded */ struct afs_address addrs[] __counted_by(max_addrs); #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; /* * a record of an in-progress RxRPC call */ struct afs_call { const struct afs_call_type *type; /* type of call */ wait_queue_head_t waitq; /* processes awaiting completion */ struct work_struct async_work; /* async I/O processor */ struct work_struct work; /* actual work processor */ struct work_struct free_work; /* Deferred free processor */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct rxrpc_peer *peer; /* Remote endpoint */ struct key *key; /* security for this call */ struct afs_net *net; /* The network namespace */ struct afs_server *server; /* The fileserver record if fs op (pins ref) */ struct afs_vlserver *vlserver; /* The vlserver record if vl op */ void *request; /* request data (first part) */ size_t iov_len; /* Size of *iter to be used */ struct iov_iter def_iter; /* Default buffer/data iterator */ struct iov_iter *write_iter; /* Iterator defining write to be made */ struct iov_iter *iter; /* Iterator currently in use */ union { /* Convenience for ->def_iter */ struct kvec kvec[1]; struct bio_vec bvec[1]; }; void *buffer; /* reply receive buffer */ union { struct afs_endpoint_state *probe; struct afs_addr_list *vl_probe; struct afs_addr_list *ret_alist; struct afs_vldb_entry *ret_vldb; char *ret_str; }; struct afs_fid fid; /* Primary vnode ID (or all zeroes) */ unsigned char probe_index; /* Address in ->probe_alist */ struct afs_operation *op; unsigned int server_index; refcount_t ref; enum afs_call_state state; spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned count2; /* count used in unmarshalling */ unsigned char unmarshall; /* unmarshalling phase */ bool drop_ref; /* T if need to drop ref for incoming call */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ bool responded; /* Got a response from the call (may be abort) */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ struct { __be32 tmp_u; __be32 tmp; } __attribute__((packed)); __be64 tmp64; }; ktime_t issue_time; /* Time of issue of operation */ }; struct afs_call_type { const char *name; unsigned int op; /* Really enum afs_fs_operation */ /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ int (*deliver)(struct afs_call *call); /* clean up a call */ void (*destructor)(struct afs_call *call); /* Work function */ void (*work)(struct work_struct *work); /* Call done function (gets called immediately on success or failure) */ void (*done)(struct afs_call *call); }; /* * Key available for writeback on a file. */ struct afs_wb_key { refcount_t usage; struct key *key; struct list_head vnode_link; /* Link in vnode->wb_keys */ }; /* * AFS open file information record. Pointed to by file->private_data. */ struct afs_file { struct key *key; /* The key this file was opened with */ struct afs_wb_key *wb; /* Writeback key record for this file */ }; static inline struct key *afs_file_key(struct file *file) { struct afs_file *af = file->private_data; return af->key; } /* * Record of an outstanding read operation on a vnode. */ struct afs_read { loff_t pos; /* Where to start reading */ loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ loff_t file_size; /* File size returned by server */ struct key *key; /* The key to use to reissue the read */ struct afs_vnode *vnode; /* The file being read into. */ struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */ afs_dataversion_t data_version; /* Version number returned by server */ refcount_t usage; unsigned int call_debug_id; unsigned int nr_pages; int error; void (*done)(struct afs_read *); void (*cleanup)(struct afs_read *); struct iov_iter *iter; /* Iterator representing the buffer */ struct iov_iter def_iter; /* Default iterator */ }; /* * AFS superblock private data * - there's one superblock per volume */ struct afs_super_info { struct net *net_ns; /* Network namespace */ struct afs_cell *cell; /* The cell in which the volume resides */ struct afs_volume *volume; /* volume record */ enum afs_flock_mode flock_mode:8; /* File locking emulation mode */ bool dyn_root; /* True if dynamic root */ }; static inline struct afs_super_info *AFS_FS_S(struct super_block *sb) { return sb->s_fs_info; } extern struct file_system_type afs_fs_type; /* * Set of substitutes for @sys. */ struct afs_sysnames { #define AFS_NR_SYSNAME 16 char *subs[AFS_NR_SYSNAME]; refcount_t usage; unsigned short nr; char blank[1]; }; /* * AFS network namespace record. */ struct afs_net { struct net *net; /* Backpointer to the owning net namespace */ struct afs_uuid uuid; bool live; /* F if this namespace is being removed */ /* AF_RXRPC I/O stuff */ struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; /* Cell database */ struct rb_root cells; struct afs_cell *ws_cell; struct work_struct cells_manager; struct timer_list cells_timer; atomic_t cells_outstanding; struct rw_semaphore cells_lock; struct mutex cells_alias_lock; struct mutex proc_cells_lock; struct hlist_head proc_cells; /* Known servers. Theoretically each fileserver can only be in one * cell, but in practice, people create aliases and subsets and there's * no easy way to distinguish them. */ seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */ struct rb_root fs_servers; /* afs_server (by server UUID or address) */ struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */ struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */ seqlock_t fs_addr_lock; /* For fs_addresses[46] */ struct work_struct fs_manager; struct timer_list fs_timer; struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; /* File locking renewal management */ struct mutex lock_manager_mutex; /* Misc */ struct super_block *dynroot_sb; /* Dynamic root mount superblock */ struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */ struct afs_sysnames *sysnames; rwlock_t sysnames_lock; struct afs_addr_preference_list __rcu *address_prefs; u16 address_pref_version; /* Statistics counters */ atomic_t n_lookup; /* Number of lookups done */ atomic_t n_reval; /* Number of dentries needing revalidation */ atomic_t n_inval; /* Number of invalidations by the server */ atomic_t n_relpg; /* Number of invalidations by release_folio */ atomic_t n_read_dir; /* Number of directory pages read */ atomic_t n_dir_cr; /* Number of directory entry creation edits */ atomic_t n_dir_rm; /* Number of directory entry removal edits */ atomic_t n_stores; /* Number of store ops */ atomic_long_t n_store_bytes; /* Number of bytes stored */ atomic_long_t n_fetch_bytes; /* Number of bytes fetched */ atomic_t n_fetches; /* Number of data fetch ops */ }; extern const char afs_init_sysname[]; enum afs_cell_state { AFS_CELL_UNSET, AFS_CELL_ACTIVATING, AFS_CELL_ACTIVE, AFS_CELL_DEACTIVATING, AFS_CELL_INACTIVE, AFS_CELL_FAILED, AFS_CELL_REMOVED, }; /* * AFS cell record. * * This is a tricky concept to get right as it is possible to create aliases * simply by pointing AFSDB/SRV records for two names at the same set of VL * servers; it is also possible to do things like setting up two sets of VL * servers, one of which provides a superset of the volumes provided by the * other (for internal/external division, for example). * * Cells only exist in the sense that (a) a cell's name maps to a set of VL * servers and (b) a cell's name is used by the client to select the key to use * for authentication and encryption. The cell name is not typically used in * the protocol. * * Two cells are determined to be aliases if they have an explicit alias (YFS * only), share any VL servers in common or have at least one volume in common. * "In common" means that the address list of the VL servers or the fileservers * share at least one endpoint. */ struct afs_cell { union { struct rcu_head rcu; struct rb_node net_node; /* Node in net->cells */ }; struct afs_net *net; struct afs_cell *alias_of; /* The cell this is an alias of */ struct afs_volume *root_volume; /* The root.cell volume if there is one */ struct key *anonymous_key; /* anonymous user key for this cell */ struct work_struct manager; /* Manager for init/deinit/dns */ struct hlist_node proc_link; /* /proc cell list link */ time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ refcount_t ref; /* Struct refcount */ atomic_t active; /* Active usage counter */ unsigned long flags; #define AFS_CELL_FL_NO_GC 0 /* The cell was added manually, don't auto-gc */ #define AFS_CELL_FL_DO_LOOKUP 1 /* DNS lookup requested */ #define AFS_CELL_FL_CHECK_ALIAS 2 /* Need to check for aliases */ enum afs_cell_state state; short error; enum dns_record_source dns_source:8; /* Latest source of data from lookup */ enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ unsigned int debug_id; /* The volumes belonging to this cell */ struct rw_semaphore vs_lock; /* Lock for server->volumes */ struct rb_root volumes; /* Tree of volumes on this server */ struct hlist_head proc_volumes; /* procfs volume list */ seqlock_t volume_lock; /* For volumes */ /* Active fileserver interaction state. */ struct rb_root fs_servers; /* afs_server (by server UUID) */ seqlock_t fs_lock; /* For fs_servers */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ struct afs_vlserver_list __rcu *vl_servers; u8 name_len; /* Length of name */ char *name; /* Cell name, case-flattened and NUL-padded */ }; /* * Volume Location server record. */ struct afs_vlserver { struct rcu_head rcu; struct afs_addr_list __rcu *addresses; /* List of addresses for this VL server */ unsigned long flags; #define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */ #define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */ #define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */ #define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */ rwlock_t lock; /* Lock on addresses */ refcount_t ref; unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Probe state */ wait_queue_head_t probe_wq; atomic_t probe_outstanding; spinlock_t probe_lock; struct { unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ u32 abort_code; short error; unsigned short flags; #define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */ #define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */ #define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */ #define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */ } probe; u16 service_id; /* Service ID we're using */ u16 port; u16 name_len; /* Length of name */ char name[]; /* Server name, case-flattened */ }; /* * Weighted list of Volume Location servers. */ struct afs_vlserver_entry { u16 priority; /* Preference (as SRV) */ u16 weight; /* Weight (as SRV) */ enum dns_record_source source:8; enum dns_lookup_status status:8; struct afs_vlserver *server; }; struct afs_vlserver_list { struct rcu_head rcu; refcount_t ref; u8 nr_servers; u8 index; /* Server currently in use */ u8 preferred; /* Preferred server */ enum dns_record_source source:8; enum dns_lookup_status status:8; rwlock_t lock; struct afs_vlserver_entry servers[]; }; /* * Cached VLDB entry. * * This is pointed to by cell->vldb_entries, indexed by name. */ struct afs_vldb_entry { afs_volid_t vid[3]; /* Volume IDs for R/W, R/O and Bak volumes */ unsigned long flags; #define AFS_VLDB_HAS_RW 0 /* - R/W volume exists */ #define AFS_VLDB_HAS_RO 1 /* - R/O volume exists */ #define AFS_VLDB_HAS_BAK 2 /* - Backup volume exists */ #define AFS_VLDB_QUERY_VALID 3 /* - Record is valid */ #define AFS_VLDB_QUERY_ERROR 4 /* - VL server returned error */ uuid_t fs_server[AFS_NMAXNSERVERS]; u32 addr_version[AFS_NMAXNSERVERS]; /* Registration change counters */ u8 fs_mask[AFS_NMAXNSERVERS]; #define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */ #define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */ #define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */ u8 vlsf_flags[AFS_NMAXNSERVERS]; short error; u8 nr_servers; /* Number of server records */ u8 name_len; u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; /* * Fileserver endpoint state. The records the addresses of a fileserver's * endpoints and the state and result of a round of probing on them. This * allows the rotation algorithm to access those results without them being * erased by a subsequent round of probing. */ struct afs_endpoint_state { struct rcu_head rcu; struct afs_addr_list *addresses; /* The addresses being probed */ unsigned long responsive_set; /* Bitset of responsive endpoints */ unsigned long failed_set; /* Bitset of endpoints we failed to probe */ refcount_t ref; unsigned int server_id; /* Debug ID of server */ unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */ atomic_t nr_probing; /* Number of outstanding probes */ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ s32 abort_code; short error; unsigned long flags; #define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */ #define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */ #define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */ #define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */ #define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */ }; /* * Record of fileserver with which we're actively communicating. */ struct afs_server { struct rcu_head rcu; union { uuid_t uuid; /* Server ID */ struct afs_uuid _uuid; }; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node uuid_rb; /* Link in net->fs_servers */ struct afs_server __rcu *uuid_next; /* Next server with same UUID */ struct afs_server *uuid_prev; /* Previous server with same UUID */ struct list_head probe_link; /* Link in net->fs_probe_list */ struct hlist_node addr_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct list_head volumes; /* RCU list of afs_server_entry objects */ struct afs_server *gc_next; /* Next server in manager's list */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ #define AFS_SERVER_FL_UPDATING 1 #define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */ #define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */ #define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */ #define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ #define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ #define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ refcount_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ u16 service_id; /* Service ID we're using. */ unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ /* file service access */ rwlock_t fs_lock; /* access lock */ /* Probe state */ struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */ unsigned long probed_at; /* Time last probe was dispatched (jiffies) */ wait_queue_head_t probe_wq; unsigned int probe_counter; /* Number of probes issued */ spinlock_t probe_lock; }; enum afs_ro_replicating { AFS_RO_NOT_REPLICATING, /* Not doing replication */ AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */ AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */ } __mode(byte); /* * Replaceable volume server list. */ struct afs_server_entry { struct afs_server *server; struct afs_volume *volume; struct list_head slink; /* Link in server->volumes */ time64_t cb_expires_at; /* Time at which volume-level callback expires */ unsigned long flags; #define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */ #define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */ #define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */ }; struct afs_server_list { struct rcu_head rcu; refcount_t usage; bool attached; /* T if attached to servers */ enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */ unsigned char nr_servers; unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */ unsigned int seq; /* Set to ->servers_seq when installed */ rwlock_t lock; struct afs_server_entry servers[]; }; /* * Live AFS volume management. */ struct afs_volume { struct rcu_head rcu; afs_volid_t vid; /* The volume ID of this volume */ afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ refcount_t ref; time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node cell_node; /* Link in cell->volumes */ struct hlist_node proc_link; /* Link in cell->proc_volumes */ struct super_block __rcu *sb; /* Superblock on which inodes reside */ struct work_struct destructor; /* Deferred destructor */ unsigned long flags; #define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */ #define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */ #define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */ #define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */ #define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */ #define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */ #ifdef CONFIG_AFS_FSCACHE struct fscache_volume *cache; /* Caching cookie */ #endif struct afs_server_list __rcu *servers; /* List of servers on which volume resides */ rwlock_t servers_lock; /* Lock for ->servers */ unsigned int servers_seq; /* Incremented each time ->servers changes */ /* RO release tracking */ struct mutex volsync_lock; /* Time/state evaluation lock */ time64_t creation_time; /* Volume creation time (or TIME64_MIN) */ time64_t update_time; /* Volume update time (or TIME64_MIN) */ /* Callback management */ struct mutex cb_check_lock; /* Lock to control race to check after v_break */ time64_t cb_expires_at; /* Earliest volume callback expiry time */ atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */ atomic_t cb_v_break; /* Volume-break event counter. */ atomic_t cb_v_check; /* Volume-break has-been-checked counter. */ atomic_t cb_scrub; /* Scrub-all-data event counter. */ rwlock_t cb_v_break_lock; struct rw_semaphore open_mmaps_lock; struct list_head open_mmaps; /* List of vnodes that are mmapped */ afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ u8 name_len; u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; enum afs_lock_state { AFS_VNODE_LOCK_NONE, /* The vnode has no lock on the server */ AFS_VNODE_LOCK_WAITING_FOR_CB, /* We're waiting for the server to break the callback */ AFS_VNODE_LOCK_SETTING, /* We're asking the server for a lock */ AFS_VNODE_LOCK_GRANTED, /* We have a lock on the server */ AFS_VNODE_LOCK_EXTENDING, /* We're extending a lock on the server */ AFS_VNODE_LOCK_NEED_UNLOCK, /* We need to unlock on the server */ AFS_VNODE_LOCK_UNLOCKING, /* We're telling the server to unlock */ AFS_VNODE_LOCK_DELETED, /* The vnode has been deleted whilst we have a lock */ }; /* * AFS inode private data. * * Note that afs_alloc_inode() *must* reset anything that could incorrectly * leak from one inode to another. */ struct afs_vnode { struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct mutex io_lock; /* Lock for serialising I/O on this mutex */ struct rw_semaphore validate_lock; /* lock for validating this vnode */ struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */ struct key *silly_key; /* Silly rename key */ spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t lock; /* waitqueue/flags lock */ unsigned long flags; #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ #define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ #define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ #define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ struct delayed_work lock_work; /* work to be done in locking */ struct key *lock_key; /* Key to be used in lock ops */ ktime_t locked_at; /* Time at which lock obtained */ enum afs_lock_state lock_state : 8; afs_lock_type_t lock_type : 8; /* outstanding callback notification on this file */ struct work_struct cb_work; /* Work for mmap'd files */ struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */ void *cb_server; /* Server with callback/filelock */ atomic_t cb_nr_mmap; /* Number of mmaps */ unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */ unsigned int cb_scrub; /* Scrub counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ unsigned int cb_v_check; /* Break check counter on ->volume */ seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */ atomic64_t cb_expires_at; /* time at which callback expires */ #define AFS_NO_CB_PROMISE TIME64_MIN }; static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE return netfs_i_cookie(&vnode->netfs); #else return NULL; #endif } static inline void afs_vnode_set_cache(struct afs_vnode *vnode, struct fscache_cookie *cookie) { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; if (cookie) mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } /* * cached security record for one user's attempt to access a vnode */ struct afs_permit { struct key *key; /* RxRPC ticket holding a security context */ afs_access_t access; /* CallerAccess value for this key */ }; /* * Immutable cache of CallerAccess records from attempts to access vnodes. * These may be shared between multiple vnodes. */ struct afs_permits { struct rcu_head rcu; struct hlist_node hash_node; /* Link in hash */ unsigned long h; /* Hash value for this permit list */ refcount_t usage; unsigned short nr_permits; /* Number of records */ bool invalidated; /* Invalidated due to key change */ struct afs_permit permits[] __counted_by(nr_permits); /* List of permits sorted by key pointer */ }; /* * Error prioritisation and accumulation. */ struct afs_error { s32 abort_code; /* Cumulative abort code */ short error; /* Cumulative error */ bool responded; /* T if server responded */ bool aborted; /* T if ->error is from an abort */ }; /* * Cursor for iterating over a set of volume location servers. */ struct afs_vl_cursor { struct afs_cell *cell; /* The cell we're querying */ struct afs_vlserver_list *server_list; /* Current server list (pins ref) */ struct afs_vlserver *server; /* Server on which this resides */ struct afs_addr_list *alist; /* Current address list (pins ref) */ struct key *key; /* Key for the server */ unsigned long untried_servers; /* Bitmask of untried servers */ unsigned long addr_tried; /* Tried addresses */ struct afs_error cumul_error; /* Cumulative error */ unsigned int debug_id; s32 call_abort_code; short call_error; /* Error from single call */ short server_index; /* Current server */ signed char addr_index; /* Current address */ unsigned short flags; #define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ #define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ #define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ short nr_iterations; /* Number of server iterations */ bool call_responded; /* T if the current address responded */ }; /* * Fileserver state tracking for an operation. An array of these is kept, * indexed by server index. */ struct afs_server_state { /* Tracking of fileserver probe state. Other operations may interfere * by probing a fileserver when accessing other volumes. */ unsigned int probe_seq; unsigned long untried_addrs; /* Addresses we haven't tried yet */ struct wait_queue_entry probe_waiter; struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */ }; /* * Fileserver operation methods. */ struct afs_operation_ops { void (*issue_afs_rpc)(struct afs_operation *op); void (*issue_yfs_rpc)(struct afs_operation *op); void (*success)(struct afs_operation *op); void (*aborted)(struct afs_operation *op); void (*failed)(struct afs_operation *op); void (*edit_dir)(struct afs_operation *op); void (*put)(struct afs_operation *op); }; struct afs_vnode_param { struct afs_vnode *vnode; struct afs_fid fid; /* Fid to access */ struct afs_status_cb scb; /* Returned status and callback promise */ afs_dataversion_t dv_before; /* Data version before the call */ unsigned int cb_break_before; /* cb_break before the call */ u8 dv_delta; /* Expected change in data version */ bool put_vnode:1; /* T if we have a ref on the vnode */ bool need_io_lock:1; /* T if we need the I/O lock on this */ bool update_ctime:1; /* Need to update the ctime */ bool set_size:1; /* Must update i_size */ bool op_unlinked:1; /* True if file was unlinked by op */ bool speculative:1; /* T if speculative status fetch (no vnode lock) */ bool modification:1; /* Set if the content gets modified */ }; /* * Fileserver operation wrapper, handling server and address rotation * asynchronously. May make simultaneous calls to multiple servers. */ struct afs_operation { struct afs_net *net; /* Network namespace */ struct key *key; /* Key for the cell */ const struct afs_call_type *type; /* Type of call done */ const struct afs_operation_ops *ops; /* Parameters/results for the operation */ struct afs_volume *volume; /* Volume being accessed */ struct afs_vnode_param file[2]; struct afs_vnode_param *more_files; struct afs_volsync pre_volsync; /* Volsync before op */ struct afs_volsync volsync; /* Volsync returned by op */ struct dentry *dentry; /* Dentry to be altered */ struct dentry *dentry_2; /* Second dentry to be altered */ struct timespec64 mtime; /* Modification time to record */ struct timespec64 ctime; /* Change time to set */ struct afs_error cumul_error; /* Cumulative error */ short nr_files; /* Number of entries in file[], more_files */ unsigned int debug_id; unsigned int cb_v_break; /* Volume break counter before op */ union { struct { int which; /* Which ->file[] to fetch for */ } fetch_status; struct { int reason; /* enum afs_edit_dir_reason */ mode_t mode; const char *symlink; } create; struct { bool need_rehash; } unlink; struct { struct dentry *rehash; struct dentry *tmp; bool new_negative; } rename; struct { struct afs_read *req; } fetch; struct { afs_lock_type_t type; } lock; struct { struct iov_iter *write_iter; loff_t pos; loff_t size; loff_t i_size; } store; struct { struct iattr *attr; loff_t old_i_size; } setattr; struct afs_acl *acl; struct yfs_acl *yacl; struct { struct afs_volume_status vs; struct kstatfs *buf; } volstatus; }; /* Fileserver iteration state */ struct afs_server_list *server_list; /* Current server list (pins ref) */ struct afs_server *server; /* Server we're using (ref pinned by server_list) */ struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */ struct afs_server_state *server_states; /* States of the servers involved */ struct afs_call *call; unsigned long untried_servers; /* Bitmask of untried servers */ unsigned long addr_tried; /* Tried addresses */ s32 call_abort_code; /* Abort code from single call */ short call_error; /* Error from single call */ short server_index; /* Current server */ short nr_iterations; /* Number of server iterations */ signed char addr_index; /* Current address */ bool call_responded; /* T if the current address responded */ unsigned int flags; #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ #define AFS_OPERATION_VBUSY 0x0002 /* Set if seen VBUSY */ #define AFS_OPERATION_VMOVED 0x0004 /* Set if seen VMOVED */ #define AFS_OPERATION_VNOVOL 0x0008 /* Set if seen VNOVOL */ #define AFS_OPERATION_CUR_ONLY 0x0010 /* Set if current server only (file lock held) */ #define AFS_OPERATION_NO_VSLEEP 0x0020 /* Set to prevent sleep on VBUSY, VOFFLINE, ... */ #define AFS_OPERATION_UNINTR 0x0040 /* Set if op is uninterruptible */ #define AFS_OPERATION_DOWNGRADE 0x0080 /* Set to retry with downgraded opcode */ #define AFS_OPERATION_LOCK_0 0x0100 /* Set if have io_lock on file[0] */ #define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */ #define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ #define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ #define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ }; /* * Cache auxiliary data. */ struct afs_vnode_cache_aux { __be64 data_version; } __packed; static inline void afs_set_cache_aux(struct afs_vnode *vnode, struct afs_vnode_cache_aux *aux) { aux->data_version = cpu_to_be64(vnode->status.data_version); } static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags) { struct afs_vnode_cache_aux aux; afs_set_cache_aux(vnode, &aux); fscache_invalidate(afs_vnode_cache(vnode), &aux, i_size_read(&vnode->netfs.inode), flags); } #include <trace/events/afs.h> /*****************************************************************************/ /* * addr_list.c */ struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr); extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason); extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, const char *, size_t, char, unsigned short, unsigned short); bool afs_addr_list_same(const struct afs_addr_list *a, const struct afs_addr_list *b); extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, __be32 xdr, u16 port); extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, __be32 *xdr, u16 port); /* * addr_prefs.c */ int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size); void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist); void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist); /* * callback.c */ extern void afs_invalidate_mmap_work(struct work_struct *); extern void afs_init_callback_state(struct afs_server *); extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback_break *); static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode) { return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub; } static inline bool afs_cb_is_broken(unsigned int cb_break, const struct afs_vnode *vnode) { return cb_break != (vnode->cb_break + atomic_read(&vnode->volume->cb_ro_snapshot) + atomic_read(&vnode->volume->cb_scrub)); } /* * cell.c */ extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, const char *, bool); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace); extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_manage_cells(struct work_struct *); extern void afs_cells_timer(struct timer_list *); extern void __net_exit afs_cell_purge(struct afs_net *); /* * cmservice.c */ extern bool afs_cm_incoming_call(struct afs_call *); /* * dir.c */ extern const struct file_operations afs_dir_file_operations; extern const struct inode_operations afs_dir_inode_operations; extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; extern void afs_d_release(struct dentry *); extern void afs_check_for_remote_deletion(struct afs_operation *); /* * dir_edit.c */ extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); /* * dir_silly.c */ extern int afs_sillyrename(struct afs_vnode *, struct afs_vnode *, struct dentry *, struct key *); extern int afs_silly_iput(struct dentry *, struct inode *); /* * dynroot.c */ extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *); extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *); extern int afs_dynroot_populate(struct super_block *); extern void afs_dynroot_depopulate(struct super_block *); /* * file.c */ extern const struct address_space_operations afs_file_aops; extern const struct address_space_operations afs_symlink_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; extern const struct netfs_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); static inline struct afs_read *afs_get_read(struct afs_read *req) { refcount_inc(&req->usage); return req; } /* * flock.c */ extern struct workqueue_struct *afs_lock_manager; extern void afs_lock_op_done(struct afs_call *); extern void afs_lock_work(struct work_struct *); extern void afs_lock_may_be_available(struct afs_vnode *); extern int afs_lock(struct file *, int, struct file_lock *); extern int afs_flock(struct file *, int, struct file_lock *); /* * fsclient.c */ extern void afs_fs_fetch_status(struct afs_operation *); extern void afs_fs_fetch_data(struct afs_operation *); extern void afs_fs_create_file(struct afs_operation *); extern void afs_fs_make_dir(struct afs_operation *); extern void afs_fs_remove_file(struct afs_operation *); extern void afs_fs_remove_dir(struct afs_operation *); extern void afs_fs_link(struct afs_operation *); extern void afs_fs_symlink(struct afs_operation *); extern void afs_fs_rename(struct afs_operation *); extern void afs_fs_store_data(struct afs_operation *); extern void afs_fs_setattr(struct afs_operation *); extern void afs_fs_get_volume_status(struct afs_operation *); extern void afs_fs_set_lock(struct afs_operation *); extern void afs_fs_extend_lock(struct afs_operation *); extern void afs_fs_release_lock(struct afs_operation *); int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, struct afs_address *addr, struct key *key); bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, struct afs_endpoint_state *estate, unsigned int addr_index, struct key *key); extern void afs_fs_inline_bulk_status(struct afs_operation *); struct afs_acl { u32 size; u8 data[] __counted_by(size); }; extern void afs_fs_fetch_acl(struct afs_operation *); extern void afs_fs_store_acl(struct afs_operation *); /* * fs_operation.c */ extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *); extern int afs_put_operation(struct afs_operation *); extern bool afs_begin_vnode_operation(struct afs_operation *); extern void afs_wait_for_operation(struct afs_operation *); extern int afs_do_sync_operation(struct afs_operation *); static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n, struct afs_vnode *vnode) { op->file[n].vnode = vnode; op->file[n].need_io_lock = true; } static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n, const struct afs_fid *fid) { op->file[n].fid = *fid; } /* * fs_probe.c */ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); extern void afs_fileserver_probe_result(struct afs_call *); void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, struct afs_addr_list *new_addrs, struct key *key); int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate, unsigned long exclude, bool is_intr); extern void afs_fs_probe_cleanup(struct afs_net *); /* * inode.c */ extern const struct afs_operation_ops afs_fetch_status_operation; extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); /* * main.c */ extern struct workqueue_struct *afs_wq; extern int afs_net_id; static inline struct afs_net *afs_net(struct net *net) { return net_generic(net, afs_net_id); } static inline struct afs_net *afs_sb2net(struct super_block *sb) { return afs_net(AFS_FS_S(sb)->net_ns); } static inline struct afs_net *afs_d2net(struct dentry *dentry) { return afs_sb2net(dentry->d_sb); } static inline struct afs_net *afs_i2net(struct inode *inode) { return afs_sb2net(inode->i_sb); } static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) { return afs_i2net(&vnode->netfs.inode); } static inline struct afs_net *afs_sock2net(struct sock *sk) { return net_generic(sock_net(sk), afs_net_id); } static inline void __afs_stat(atomic_t *s) { atomic_inc(s); } #define afs_stat_v(vnode, n) __afs_stat(&afs_v2net(vnode)->n) /* * misc.c */ extern int afs_abort_to_error(u32); extern void afs_prioritise_error(struct afs_error *, int, u32); static inline void afs_op_nomem(struct afs_operation *op) { op->cumul_error.error = -ENOMEM; } static inline int afs_op_error(const struct afs_operation *op) { return op->cumul_error.error; } static inline s32 afs_op_abort_code(const struct afs_operation *op) { return op->cumul_error.abort_code; } static inline int afs_op_set_error(struct afs_operation *op, int error) { return op->cumul_error.error = error; } static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code) { afs_prioritise_error(&op->cumul_error, error, abort_code); } /* * mntpt.c */ extern const struct inode_operations afs_mntpt_inode_operations; extern const struct inode_operations afs_autocell_inode_operations; extern const struct file_operations afs_mntpt_file_operations; extern struct vfsmount *afs_d_automount(struct path *); extern void afs_mntpt_kill_timer(void); /* * proc.c */ #ifdef CONFIG_PROC_FS extern int __net_init afs_proc_init(struct afs_net *); extern void __net_exit afs_proc_cleanup(struct afs_net *); extern int afs_proc_cell_setup(struct afs_cell *); extern void afs_proc_cell_remove(struct afs_cell *); extern void afs_put_sysnames(struct afs_sysnames *); #else static inline int afs_proc_init(struct afs_net *net) { return 0; } static inline void afs_proc_cleanup(struct afs_net *net) {} static inline int afs_proc_cell_setup(struct afs_cell *cell) { return 0; } static inline void afs_proc_cell_remove(struct afs_cell *cell) {} static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {} #endif /* * rotate.c */ void afs_clear_server_states(struct afs_operation *op); extern bool afs_select_fileserver(struct afs_operation *); extern void afs_dump_edestaddrreq(const struct afs_operation *); /* * rxrpc.c */ extern struct workqueue_struct *afs_async_calls; extern int __net_init afs_open_socket(struct afs_net *); extern void __net_exit afs_close_socket(struct afs_net *); extern void afs_charge_preallocation(struct work_struct *); extern void afs_put_call(struct afs_call *); void afs_deferred_put_call(struct afs_call *call); void afs_make_call(struct afs_call *call, gfp_t gfp); void afs_wait_for_call_to_complete(struct afs_call *call); extern struct afs_call *afs_alloc_flat_call(struct afs_net *, const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); extern int afs_extract_data(struct afs_call *, bool); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, gfp_t gfp) { struct afs_addr_list *alist = op->estate->addresses; op->call = call; op->type = call->type; call->op = op; call->key = op->key; call->intr = !(op->flags & AFS_OPERATION_UNINTR); call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer); call->service_id = op->server->service_id; afs_make_call(call, gfp); } static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) { call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size); } static inline void afs_extract_to_tmp(struct afs_call *call) { call->iov_len = sizeof(call->tmp); afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); } static inline void afs_extract_to_tmp64(struct afs_call *call) { call->iov_len = sizeof(call->tmp64); afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); } static inline void afs_extract_discard(struct afs_call *call, size_t size) { call->iov_len = size; iov_iter_discard(&call->def_iter, ITER_DEST, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) { call->iov_len = size; afs_extract_begin(call, call->buffer, size); } static inline int afs_transfer_reply(struct afs_call *call) { return afs_extract_data(call, false); } static inline bool afs_check_call_state(struct afs_call *call, enum afs_call_state state) { return READ_ONCE(call->state) == state; } static inline bool afs_set_call_state(struct afs_call *call, enum afs_call_state from, enum afs_call_state to) { bool ok = false; spin_lock_bh(&call->state_lock); if (call->state == from) { call->state = to; trace_afs_call_state(call, from, to, 0, 0); ok = true; } spin_unlock_bh(&call->state_lock); return ok; } static inline void afs_set_call_complete(struct afs_call *call, int error, u32 remote_abort) { enum afs_call_state state; bool ok = false; spin_lock_bh(&call->state_lock); state = call->state; if (state != AFS_CALL_COMPLETE) { call->abort_code = remote_abort; call->error = error; call->state = AFS_CALL_COMPLETE; trace_afs_call_state(call, state, AFS_CALL_COMPLETE, error, remote_abort); ok = true; } spin_unlock_bh(&call->state_lock); if (ok) { trace_afs_call_done(call); /* Asynchronous calls have two refs to release - one from the alloc and * one queued with the work item - and we can't just deallocate the * call because the work item may be queued again. */ if (call->drop_ref) afs_put_call(call); } } /* * security.c */ extern void afs_put_permits(struct afs_permits *); extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int, struct afs_status_cb *); extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); extern int afs_permission(struct mnt_idmap *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* * server.c */ extern spinlock_t afs_server_peer_lock; extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *); extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace); extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); extern void afs_manage_servers(struct work_struct *); extern void afs_servers_timer(struct timer_list *); extern void afs_fs_probe_timer(struct timer_list *); extern void __net_exit afs_purge_servers(struct afs_net *); bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); static inline void afs_inc_servers_outstanding(struct afs_net *net) { atomic_inc(&net->servers_outstanding); } static inline void afs_dec_servers_outstanding(struct afs_net *net) { if (atomic_dec_and_test(&net->servers_outstanding)) wake_up_var(&net->servers_outstanding); } static inline bool afs_is_probing_server(struct afs_server *server) { return list_empty(&server->probe_link); } /* * server_list.c */ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list *slist) { refcount_inc(&slist->usage); return slist; } extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *); struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, struct key *key, struct afs_vldb_entry *vldb); extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *); void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist); void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist, struct afs_server_list *old); void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist); /* * super.c */ extern int __init afs_fs_init(void); extern void afs_fs_exit(void); /* * validation.c */ bool afs_check_validity(const struct afs_vnode *vnode); int afs_update_volume_state(struct afs_operation *op); int afs_validate(struct afs_vnode *vnode, struct key *key); /* * vlclient.c */ extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *, const char *, int); extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *); struct afs_call *afs_vl_get_capabilities(struct afs_net *net, struct afs_addr_list *alist, unsigned int addr_index, struct key *key, struct afs_vlserver *server, unsigned int server_index); extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *); extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *); /* * vl_alias.c */ extern int afs_cell_detect_alias(struct afs_cell *, struct key *); /* * vl_probe.c */ extern void afs_vlserver_probe_result(struct afs_call *); extern int afs_send_vl_probes(struct afs_net *, struct key *, struct afs_vlserver_list *); extern int afs_wait_for_vl_probes(struct afs_vlserver_list *, unsigned long); /* * vl_rotate.c */ extern bool afs_begin_vlserver_operation(struct afs_vl_cursor *, struct afs_cell *, struct key *); extern bool afs_select_vlserver(struct afs_vl_cursor *); extern bool afs_select_current_vlserver(struct afs_vl_cursor *); extern int afs_end_vlserver_operation(struct afs_vl_cursor *); /* * vlserver_list.c */ static inline struct afs_vlserver *afs_get_vlserver(struct afs_vlserver *vlserver) { refcount_inc(&vlserver->ref); return vlserver; } static inline struct afs_vlserver_list *afs_get_vlserverlist(struct afs_vlserver_list *vllist) { if (vllist) refcount_inc(&vllist->ref); return vllist; } extern struct afs_vlserver *afs_alloc_vlserver(const char *, size_t, unsigned short); extern void afs_put_vlserver(struct afs_net *, struct afs_vlserver *); extern struct afs_vlserver_list *afs_alloc_vlserver_list(unsigned int); extern void afs_put_vlserverlist(struct afs_net *, struct afs_vlserver_list *); extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, const void *, size_t); /* * volume.c */ extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ void afs_prepare_write(struct netfs_io_subrequest *subreq); void afs_issue_write(struct netfs_io_subrequest *subreq); void afs_begin_writeback(struct netfs_io_request *wreq); void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream); extern int afs_writepages(struct address_space *, struct writeback_control *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); /* * xattr.c */ extern const struct xattr_handler * const afs_xattr_handlers[]; /* * yfsclient.c */ extern void yfs_fs_fetch_data(struct afs_operation *); extern void yfs_fs_create_file(struct afs_operation *); extern void yfs_fs_make_dir(struct afs_operation *); extern void yfs_fs_remove_file2(struct afs_operation *); extern void yfs_fs_remove_file(struct afs_operation *); extern void yfs_fs_remove_dir(struct afs_operation *); extern void yfs_fs_link(struct afs_operation *); extern void yfs_fs_symlink(struct afs_operation *); extern void yfs_fs_rename(struct afs_operation *); extern void yfs_fs_store_data(struct afs_operation *); extern void yfs_fs_setattr(struct afs_operation *); extern void yfs_fs_get_volume_status(struct afs_operation *); extern void yfs_fs_set_lock(struct afs_operation *); extern void yfs_fs_extend_lock(struct afs_operation *); extern void yfs_fs_release_lock(struct afs_operation *); extern void yfs_fs_fetch_status(struct afs_operation *); extern void yfs_fs_inline_bulk_status(struct afs_operation *); struct yfs_acl { struct afs_acl *acl; /* Dir/file/symlink ACL */ struct afs_acl *vol_acl; /* Whole volume ACL */ u32 inherit_flag; /* True if ACL is inherited from parent dir */ u32 num_cleaned; /* Number of ACEs removed due to subject removal */ unsigned int flags; #define YFS_ACL_WANT_ACL 0x01 /* Set if caller wants ->acl */ #define YFS_ACL_WANT_VOL_ACL 0x02 /* Set if caller wants ->vol_acl */ }; extern void yfs_free_opaque_acl(struct yfs_acl *); extern void yfs_fs_fetch_opaque_acl(struct afs_operation *); extern void yfs_fs_store_opaque_acl2(struct afs_operation *); /* * Miscellaneous inline functions. */ static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { return container_of(inode, struct afs_vnode, netfs.inode); } static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) { return &vnode->netfs.inode; } /* * Note that a dentry got changed. We need to set d_fsdata to the data version * number derived from the result of the operation. It doesn't matter if * d_fsdata goes backwards as we'll just revalidate. */ static inline void afs_update_dentry_version(struct afs_operation *op, struct afs_vnode_param *dir_vp, struct dentry *dentry) { if (!op->cumul_error.error) dentry->d_fsdata = (void *)(unsigned long)dir_vp->scb.status.data_version; } /* * Set the file size and block count. Estimate the number of 512 bytes blocks * used, rounded up to nearest 1K for consistency with other AFS clients. */ static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) { i_size_write(&vnode->netfs.inode, size); vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; } /* * Check for a conflicting operation on a directory that we just unlinked from. * If someone managed to sneak a link or an unlink in on the file we just * unlinked, we won't be able to trust nlink on an AFS file (but not YFS). */ static inline void afs_check_dir_conflict(struct afs_operation *op, struct afs_vnode_param *dvp) { if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version) op->flags |= AFS_OPERATION_DIR_CONFLICT; } static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) { trace_afs_io_error(call->debug_id, -EIO, where); return -EIO; } static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where) { trace_afs_file_error(vnode, -EIO, where); return -EIO; } /*****************************************************************************/ /* * debug tracing */ extern unsigned afs_debug; #define dbgprintk(FMT,...) \ printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__) #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) #if defined(__KDEBUG) #define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) #define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) #define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) #elif defined(CONFIG_AFS_DEBUG) #define AFS_DEBUG_KENTER 0x01 #define AFS_DEBUG_KLEAVE 0x02 #define AFS_DEBUG_KDEBUG 0x04 #define _enter(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KENTER)) \ kenter(FMT,##__VA_ARGS__); \ } while (0) #define _leave(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KLEAVE)) \ kleave(FMT,##__VA_ARGS__); \ } while (0) #define _debug(FMT,...) \ do { \ if (unlikely(afs_debug & AFS_DEBUG_KDEBUG)) \ kdebug(FMT,##__VA_ARGS__); \ } while (0) #else #define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) #endif /* * debug assertion checking */ #if 1 // defined(__KDEBUGALL) #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ BUG(); \ } \ } while(0) #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu " #OP " %lu is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ } while(0) #define ASSERTRANGE(L, OP1, N, OP2, H) \ do { \ if (unlikely(!((L) OP1 (N)) || !((N) OP2 (H)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu "#OP1" %lu "#OP2" %lu is false\n", \ (unsigned long)(L), (unsigned long)(N), \ (unsigned long)(H)); \ printk(KERN_ERR "0x%lx "#OP1" 0x%lx "#OP2" 0x%lx is false\n", \ (unsigned long)(L), (unsigned long)(N), \ (unsigned long)(H)); \ BUG(); \ } \ } while(0) #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ BUG(); \ } \ } while(0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ printk(KERN_ERR "\n"); \ printk(KERN_ERR "AFS: Assertion failed\n"); \ printk(KERN_ERR "%lu " #OP " %lu is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ } while(0) #else #define ASSERT(X) \ do { \ } while(0) #define ASSERTCMP(X, OP, Y) \ do { \ } while(0) #define ASSERTRANGE(L, OP1, N, OP2, H) \ do { \ } while(0) #define ASSERTIF(C, X) \ do { \ } while(0) #define ASSERTIFCMP(C, X, OP, Y) \ do { \ } while(0) #endif /* __KDEBUGALL */ |
35 35 4 3 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 | // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/ethtool.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/bonding.h> #include "bonding_priv.h" static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; loff_t off = 0; rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; bond_for_each_slave_rcu(bond, slave, iter) if (++off == *pos) return slave; return NULL; } static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; bool found = false; ++*pos; if (v == SEQ_START_TOKEN) return bond_first_slave_rcu(bond); bond_for_each_slave_rcu(bond, slave, iter) { if (found) return slave; if (slave == v) found = true; } return NULL; } static void bond_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void bond_info_show_master(struct seq_file *seq) { struct bonding *bond = pde_data(file_inode(seq->file)); const struct bond_opt_value *optval; struct slave *curr, *primary; int i; curr = rcu_dereference(bond->curr_active_slave); seq_printf(seq, "Bonding Mode: %s", bond_mode_name(BOND_MODE(bond))); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac) { optval = bond_opt_get_val(BOND_OPT_FAIL_OVER_MAC, bond->params.fail_over_mac); seq_printf(seq, " (fail_over_mac %s)", optval->string); } seq_printf(seq, "\n"); if (bond_mode_uses_xmit_hash(bond)) { optval = bond_opt_get_val(BOND_OPT_XMIT_HASH, bond->params.xmit_policy); seq_printf(seq, "Transmit Hash Policy: %s (%d)\n", optval->string, bond->params.xmit_policy); } if (bond_uses_primary(bond)) { primary = rcu_dereference(bond->primary_slave); seq_printf(seq, "Primary Slave: %s", primary ? primary->dev->name : "None"); if (primary) { optval = bond_opt_get_val(BOND_OPT_PRIMARY_RESELECT, bond->params.primary_reselect); seq_printf(seq, " (primary_reselect %s)", optval->string); } seq_printf(seq, "\nCurrently Active Slave: %s\n", (curr) ? curr->dev->name : "None"); } seq_printf(seq, "MII Status: %s\n", netif_carrier_ok(bond->dev) ? "up" : "down"); seq_printf(seq, "MII Polling Interval (ms): %d\n", bond->params.miimon); seq_printf(seq, "Up Delay (ms): %d\n", bond->params.updelay * bond->params.miimon); seq_printf(seq, "Down Delay (ms): %d\n", bond->params.downdelay * bond->params.miimon); seq_printf(seq, "Peer Notification Delay (ms): %d\n", bond->params.peer_notif_delay * bond->params.miimon); /* ARP information */ if (bond->params.arp_interval > 0) { int printed = 0; seq_printf(seq, "ARP Polling Interval (ms): %d\n", bond->params.arp_interval); seq_printf(seq, "ARP Missed Max: %u\n", bond->params.missed_max); seq_printf(seq, "ARP IP target/s (n.n.n.n form):"); for (i = 0; (i < BOND_MAX_ARP_TARGETS); i++) { if (!bond->params.arp_targets[i]) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI4", &bond->params.arp_targets[i]); printed = 1; } seq_printf(seq, "\n"); #if IS_ENABLED(CONFIG_IPV6) printed = 0; seq_printf(seq, "NS IPv6 target/s (xx::xx form):"); for (i = 0; (i < BOND_MAX_NS_TARGETS); i++) { if (ipv6_addr_any(&bond->params.ns_targets[i])) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI6c", &bond->params.ns_targets[i]); printed = 1; } seq_printf(seq, "\n"); #endif } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; seq_puts(seq, "\n802.3ad info\n"); seq_printf(seq, "LACP active: %s\n", (bond->params.lacp_active) ? "on" : "off"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? "fast" : "slow"); seq_printf(seq, "Min links: %d\n", bond->params.min_links); optval = bond_opt_get_val(BOND_OPT_AD_SELECT, bond->params.ad_select); seq_printf(seq, "Aggregator selection policy (ad_select): %s\n", optval->string); if (capable(CAP_NET_ADMIN)) { seq_printf(seq, "System priority: %d\n", BOND_AD_INFO(bond).system.sys_priority); seq_printf(seq, "System MAC address: %pM\n", &BOND_AD_INFO(bond).system.sys_mac_addr); if (__bond_3ad_get_active_agg_info(bond, &ad_info)) { seq_printf(seq, "bond %s has no active aggregator\n", bond->dev->name); } else { seq_printf(seq, "Active Aggregator Info:\n"); seq_printf(seq, "\tAggregator ID: %d\n", ad_info.aggregator_id); seq_printf(seq, "\tNumber of ports: %d\n", ad_info.ports); seq_printf(seq, "\tActor Key: %d\n", ad_info.actor_key); seq_printf(seq, "\tPartner Key: %d\n", ad_info.partner_key); seq_printf(seq, "\tPartner Mac Address: %pM\n", ad_info.partner_system); } } } } static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { struct bonding *bond = pde_data(file_inode(seq->file)); seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); if (slave->speed == SPEED_UNKNOWN) seq_printf(seq, "Speed: %s\n", "Unknown"); else seq_printf(seq, "Speed: %d Mbps\n", slave->speed); if (slave->duplex == DUPLEX_UNKNOWN) seq_printf(seq, "Duplex: %s\n", "Unknown"); else seq_printf(seq, "Duplex: %s\n", slave->duplex ? "full" : "half"); seq_printf(seq, "Link Failure Count: %u\n", slave->link_failure_count); seq_printf(seq, "Permanent HW addr: %*phC\n", slave->dev->addr_len, slave->perm_hwaddr); seq_printf(seq, "Slave queue ID: %d\n", READ_ONCE(slave->queue_id)); if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; const struct aggregator *agg = port->aggregator; if (agg) { seq_printf(seq, "Aggregator ID: %d\n", agg->aggregator_identifier); seq_printf(seq, "Actor Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_actor_state)); seq_printf(seq, "Partner Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_partner_state)); seq_printf(seq, "Actor Churned Count: %d\n", port->churn_actor_count); seq_printf(seq, "Partner Churned Count: %d\n", port->churn_partner_count); if (capable(CAP_NET_ADMIN)) { seq_puts(seq, "details actor lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->actor_system_priority); seq_printf(seq, " system mac address: %pM\n", &port->actor_system); seq_printf(seq, " port key: %d\n", port->actor_oper_port_key); seq_printf(seq, " port priority: %d\n", port->actor_port_priority); seq_printf(seq, " port number: %d\n", port->actor_port_number); seq_printf(seq, " port state: %d\n", port->actor_oper_port_state); seq_puts(seq, "details partner lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->partner_oper.system_priority); seq_printf(seq, " system mac address: %pM\n", &port->partner_oper.system); seq_printf(seq, " oper key: %d\n", port->partner_oper.key); seq_printf(seq, " port priority: %d\n", port->partner_oper.port_priority); seq_printf(seq, " port number: %d\n", port->partner_oper.port_number); seq_printf(seq, " port state: %d\n", port->partner_oper.port_state); } } else { seq_puts(seq, "Aggregator ID: N/A\n"); } } } static int bond_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "%s\n", bond_version); bond_info_show_master(seq); } else bond_info_show_slave(seq, v); return 0; } static const struct seq_operations bond_info_seq_ops = { .start = bond_info_seq_start, .next = bond_info_seq_next, .stop = bond_info_seq_stop, .show = bond_info_seq_show, }; void bond_create_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir) { bond->proc_entry = proc_create_seq_data(bond_dev->name, 0444, bn->proc_dir, &bond_info_seq_ops, bond); if (bond->proc_entry == NULL) netdev_warn(bond_dev, "Cannot create /proc/net/%s/%s\n", DRV_NAME, bond_dev->name); else memcpy(bond->proc_file_name, bond_dev->name, IFNAMSIZ); } } void bond_remove_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir && bond->proc_entry) { remove_proc_entry(bond->proc_file_name, bn->proc_dir); memset(bond->proc_file_name, 0, IFNAMSIZ); bond->proc_entry = NULL; } } /* Create the bonding directory under /proc/net, if doesn't exist yet. * Caller must hold rtnl_lock. */ void __net_init bond_create_proc_dir(struct bond_net *bn) { if (!bn->proc_dir) { bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net); if (!bn->proc_dir) pr_warn("Warning: Cannot create /proc/net/%s\n", DRV_NAME); } } /* Destroy the bonding directory under /proc/net, if empty. */ void __net_exit bond_destroy_proc_dir(struct bond_net *bn) { if (bn->proc_dir) { remove_proc_entry(DRV_NAME, bn->net->proc_net); bn->proc_dir = NULL; } } |
3 3 3 3 11 12 12 5 5 5 5 5 16 16 16 16 14 14 12 14 34 33 34 34 34 34 34 18 18 18 18 18 18 9 9 9 9 9 9 10 10 10 10 10 10 10 32 32 33 33 33 33 33 2 2 2 2 2 2 2 49 49 49 48 48 49 49 48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2015 Intel Deutschland GmbH * Copyright (C) 2022-2024 Intel Corporation */ #include <net/mac80211.h> #include "ieee80211_i.h" #include "trace.h" #include "driver-ops.h" #include "debugfs_sta.h" #include "debugfs_netdev.h" int drv_start(struct ieee80211_local *local) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(local->started)) return -EALREADY; trace_drv_start(local); local->started = true; /* allow rx frames */ smp_mb(); ret = local->ops->start(&local->hw); trace_drv_return_int(local, ret); if (ret) local->started = false; return ret; } void drv_stop(struct ieee80211_local *local, bool suspend) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!local->started)) return; trace_drv_stop(local, suspend); local->ops->stop(&local->hw, suspend); trace_drv_return_void(local); /* sync away all work on the tasklet before clearing started */ tasklet_disable(&local->tasklet); tasklet_enable(&local->tasklet); barrier(); local->started = false; } int drv_add_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; trace_drv_add_interface(local, sdata); ret = local->ops->add_interface(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); if (ret) return ret; if (!(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) { sdata->flags |= IEEE80211_SDATA_IN_DRIVER; drv_vif_add_debugfs(local, sdata); /* initially vif is not MLD */ ieee80211_link_debugfs_drv_add(&sdata->deflink); } return 0; } int drv_change_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_change_interface(local, sdata, type, p2p); ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); trace_drv_return_int(local, ret); return ret; } void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; /* Remove driver debugfs entries */ ieee80211_debugfs_recreate_netdev(sdata, sdata->vif.valid_links); trace_drv_remove_interface(local, sdata); local->ops->remove_interface(&local->hw, &sdata->vif); trace_drv_return_void(local); } __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state); if (local->ops->sta_state) { ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta, old_state, new_state); } else if (old_state == IEEE80211_STA_AUTH && new_state == IEEE80211_STA_ASSOC) { ret = drv_sta_add(local, sdata, &sta->sta); if (ret == 0) { sta->uploaded = true; if (rcu_access_pointer(sta->sta.rates)) drv_sta_rate_tbl_update(local, sdata, &sta->sta); } } else if (old_state == IEEE80211_STA_ASSOC && new_state == IEEE80211_STA_AUTH) { drv_sta_remove(local, sdata, &sta->sta); } trace_drv_return_int(local, ret); return ret; } __must_check int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_set_txpwr(local, sdata, &sta->sta); if (local->ops->sta_set_txpwr) ret = local->ops->sta_set_txpwr(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_int(local, ret); return ret; } void drv_link_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_sta *link_sta, u32 changed) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); trace_drv_link_sta_rc_update(local, sdata, link_sta, changed); if (local->ops->link_sta_rc_update) local->ops->link_sta_rc_update(&local->hw, &sdata->vif, link_sta, changed); trace_drv_return_void(local); } int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_link_data *link, u16 ac, const struct ieee80211_tx_queue_params *params) { struct ieee80211_sub_if_data *sdata = link->sdata; int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return 0; if (params->cw_min == 0 || params->cw_min > params->cw_max) { /* * If we can't configure hardware anyway, don't warn. We may * never have initialized the CW parameters. */ WARN_ONCE(local->ops->conf_tx, "%s: invalid CW_min/CW_max: %d/%d\n", sdata->name, params->cw_min, params->cw_max); return -EINVAL; } trace_drv_conf_tx(local, sdata, link->link_id, ac, params); if (local->ops->conf_tx) ret = local->ops->conf_tx(&local->hw, &sdata->vif, link->link_id, ac, params); trace_drv_return_int(local, ret); return ret; } u64 drv_get_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { u64 ret = -1ULL; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return ret; trace_drv_get_tsf(local, sdata); if (local->ops->get_tsf) ret = local->ops->get_tsf(&local->hw, &sdata->vif); trace_drv_return_u64(local, ret); return ret; } void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_set_tsf(local, sdata, tsf); if (local->ops->set_tsf) local->ops->set_tsf(&local->hw, &sdata->vif, tsf); trace_drv_return_void(local); } void drv_offset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, s64 offset) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_offset_tsf(local, sdata, offset); if (local->ops->offset_tsf) local->ops->offset_tsf(&local->hw, &sdata->vif, offset); trace_drv_return_void(local); } void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_reset_tsf(local, sdata); if (local->ops->reset_tsf) local->ops->reset_tsf(&local->hw, &sdata->vif); trace_drv_return_void(local); } int drv_assign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); /* * We should perhaps push emulate chanctx down and only * make it call ->config() when the chanctx is actually * assigned here (and unassigned below), but that's yet * another change to all drivers to add assign/unassign * emulation callbacks. Maybe later. */ if (sdata->vif.type == NL80211_IFTYPE_MONITOR && local->emulate_chanctx && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return 0; if (!check_sdata_in_driver(sdata)) return -EIO; if (!ieee80211_vif_link_active(&sdata->vif, link_conf->link_id)) return 0; trace_drv_assign_vif_chanctx(local, sdata, link_conf, ctx); if (local->ops->assign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); ret = local->ops->assign_vif_chanctx(&local->hw, &sdata->vif, link_conf, &ctx->conf); } trace_drv_return_int(local, ret); return ret; } void drv_unassign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (sdata->vif.type == NL80211_IFTYPE_MONITOR && local->emulate_chanctx && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return; if (!check_sdata_in_driver(sdata)) return; if (!ieee80211_vif_link_active(&sdata->vif, link_conf->link_id)) return; trace_drv_unassign_vif_chanctx(local, sdata, link_conf, ctx); if (local->ops->unassign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); local->ops->unassign_vif_chanctx(&local->hw, &sdata->vif, link_conf, &ctx->conf); } trace_drv_return_void(local); } int drv_switch_vif_chanctx(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode) { int ret = 0; int i; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->switch_vif_chanctx) return -EOPNOTSUPP; for (i = 0; i < n_vifs; i++) { struct ieee80211_chanctx *new_ctx = container_of(vifs[i].new_ctx, struct ieee80211_chanctx, conf); struct ieee80211_chanctx *old_ctx = container_of(vifs[i].old_ctx, struct ieee80211_chanctx, conf); WARN_ON_ONCE(!old_ctx->driver_present); WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS && new_ctx->driver_present) || (mode == CHANCTX_SWMODE_REASSIGN_VIF && !new_ctx->driver_present)); } trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode); ret = local->ops->switch_vif_chanctx(&local->hw, vifs, n_vifs, mode); trace_drv_return_int(local, ret); if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) { for (i = 0; i < n_vifs; i++) { struct ieee80211_chanctx *new_ctx = container_of(vifs[i].new_ctx, struct ieee80211_chanctx, conf); struct ieee80211_chanctx *old_ctx = container_of(vifs[i].old_ctx, struct ieee80211_chanctx, conf); new_ctx->driver_present = true; old_ctx->driver_present = false; } } return ret; } int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_ampdu_params *params) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_ampdu_action(local, sdata, params); if (local->ops->ampdu_action) ret = local->ops->ampdu_action(&local->hw, &sdata->vif, params); trace_drv_return_int(local, ret); return ret; } void drv_link_info_changed(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *info, int link_id, u64 changed) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON_ONCE(changed & (BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED) && sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT && sdata->vif.type != NL80211_IFTYPE_OCB)) return; if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !sdata->vif.bss_conf.mu_mimo_owner && !(changed & BSS_CHANGED_TXPOWER)))) return; if (!check_sdata_in_driver(sdata)) return; if (!ieee80211_vif_link_active(&sdata->vif, link_id)) return; trace_drv_link_info_changed(local, sdata, info, changed); if (local->ops->link_info_changed) local->ops->link_info_changed(&local->hw, &sdata->vif, info, changed); else if (local->ops->bss_info_changed) local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed); trace_drv_return_void(local); } int drv_set_key(struct ieee80211_local *local, enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_key_conf *key) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; if (WARN_ON(key->link_id >= 0 && sdata->vif.active_links && !(sdata->vif.active_links & BIT(key->link_id)))) return -ENOLINK; trace_drv_set_key(local, cmd, sdata, sta, key); ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key); trace_drv_return_int(local, ret); return ret; } int drv_change_vif_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]) { struct ieee80211_link_data *link; unsigned long links_to_add; unsigned long links_to_rem; unsigned int link_id; int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (old_links == new_links) return 0; links_to_add = ~old_links & new_links; links_to_rem = old_links & ~new_links; for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) { link = rcu_access_pointer(sdata->link[link_id]); ieee80211_link_debugfs_drv_remove(link); } trace_drv_change_vif_links(local, sdata, old_links, new_links); if (local->ops->change_vif_links) ret = local->ops->change_vif_links(&local->hw, &sdata->vif, old_links, new_links, old); trace_drv_return_int(local, ret); if (ret) return ret; if (!local->in_reconfig && !local->resuming) { for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) { link = rcu_access_pointer(sdata->link[link_id]); ieee80211_link_debugfs_drv_add(link); } } return 0; } int drv_change_sta_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u16 old_links, u16 new_links) { struct sta_info *info = container_of(sta, struct sta_info, sta); struct link_sta_info *link_sta; unsigned long links_to_add; unsigned long links_to_rem; unsigned int link_id; int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; old_links &= sdata->vif.active_links; new_links &= sdata->vif.active_links; if (old_links == new_links) return 0; links_to_add = ~old_links & new_links; links_to_rem = old_links & ~new_links; for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) { link_sta = rcu_dereference_protected(info->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); ieee80211_link_sta_debugfs_drv_remove(link_sta); } trace_drv_change_sta_links(local, sdata, sta, old_links, new_links); if (local->ops->change_sta_links) ret = local->ops->change_sta_links(&local->hw, &sdata->vif, sta, old_links, new_links); trace_drv_return_int(local, ret); if (ret) return ret; /* during reconfig don't add it to debugfs again */ if (local->in_reconfig || local->resuming) return 0; for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) { link_sta = rcu_dereference_protected(info->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); ieee80211_link_sta_debugfs_drv_add(link_sta); } return 0; } |
12 7 25 5 19 10 10 5 5 25 36 17 19 2 2 2 2 2 2 2 12 11 2 2 2 1 1 2 11 8 3 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 | // SPDX-License-Identifier: GPL-2.0-or-later /* * CIPSO - Commercial IP Security Option * * This is an implementation of the CIPSO 2.2 protocol as specified in * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors * have chosen to adopt the protocol and over the years it has become a * de-facto standard for labeled networking. * * The CIPSO draft specification can be found in the kernel's Documentation * directory as well as the following URL: * https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt * The FIPS-188 specification can be found at the following URL: * https://www.itl.nist.gov/fipspubs/fip188.htm * * Author: Paul Moore <paul.moore@hp.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 */ #include <linux/init.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/unaligned.h> /* List of available DOI definitions */ /* XXX - This currently assumes a minimal number of different DOIs in use, * if in practice there are a lot of different DOIs this list should * probably be turned into a hash table or something similar so we * can do quick lookups. */ static DEFINE_SPINLOCK(cipso_v4_doi_list_lock); static LIST_HEAD(cipso_v4_doi_list); /* Label mapping cache */ int cipso_v4_cache_enabled = 1; int cipso_v4_cache_bucketsize = 10; #define CIPSO_V4_CACHE_BUCKETBITS 7 #define CIPSO_V4_CACHE_BUCKETS (1 << CIPSO_V4_CACHE_BUCKETBITS) #define CIPSO_V4_CACHE_REORDERLIMIT 10 struct cipso_v4_map_cache_bkt { spinlock_t lock; u32 size; struct list_head list; }; struct cipso_v4_map_cache_entry { u32 hash; unsigned char *key; size_t key_len; struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; }; static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ int cipso_v4_rbm_optfmt; int cipso_v4_rbm_strictvalid = 1; /* * Protocol Constants */ /* Maximum size of the CIPSO IP option, derived from the fact that the maximum * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ #define CIPSO_V4_OPT_LEN_MAX 40 /* Length of the base CIPSO option, this includes the option type (1 byte), the * option length (1 byte), and the DOI (4 bytes). */ #define CIPSO_V4_HDR_LEN 6 /* Base length of the restrictive category bitmap tag (tag #1). */ #define CIPSO_V4_TAG_RBM_BLEN 4 /* Base length of the enumerated category tag (tag #2). */ #define CIPSO_V4_TAG_ENUM_BLEN 4 /* Base length of the ranged categories bitmap tag (tag #5). */ #define CIPSO_V4_TAG_RNG_BLEN 4 /* The maximum number of category ranges permitted in the ranged category tag * (tag #5). You may note that the IETF draft states that the maximum number * of category ranges is 7, but if the low end of the last category range is * zero then it is possible to fit 8 category ranges because the zero should * be omitted. */ #define CIPSO_V4_TAG_RNG_CAT_MAX 8 /* Base length of the local tag (non-standard tag). * Tag definition (may change between kernel versions) * * 0 8 16 24 32 * +----------+----------+----------+----------+ * | 10000000 | 00000110 | 32-bit secid value | * +----------+----------+----------+----------+ * | in (host byte order)| * +----------+----------+ * */ #define CIPSO_V4_TAG_LOC_BLEN 6 /* * Helper Functions */ /** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free * * Description: * This function frees the memory associated with a cache entry including the * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry) { if (entry->lsm_data) netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } /** * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache * @key: the hash key * @key_len: the length of the key in bytes * * Description: * The CIPSO tag hashing function. Returns a 32-bit hash value. * */ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len) { return jhash(key, key_len, 0); } /* * Label Mapping Cache Functions */ /** * cipso_v4_cache_init - Initialize the CIPSO cache * * Description: * Initializes the CIPSO label mapping cache, this function should be called * before any of the other functions defined in this file. Returns zero on * success, negative values on error. * */ static int __init cipso_v4_cache_init(void) { u32 iter; cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS, sizeof(struct cipso_v4_map_cache_bkt), GFP_KERNEL); if (!cipso_v4_cache) return -ENOMEM; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_init(&cipso_v4_cache[iter].lock); cipso_v4_cache[iter].size = 0; INIT_LIST_HEAD(&cipso_v4_cache[iter].list); } return 0; } /** * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache * * Description: * Invalidates and frees any entries in the CIPSO cache. * */ void cipso_v4_cache_invalidate(void) { struct cipso_v4_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_bh(&cipso_v4_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &cipso_v4_cache[iter].list, list) { list_del(&entry->list); cipso_v4_cache_entry_free(entry); } cipso_v4_cache[iter].size = 0; spin_unlock_bh(&cipso_v4_cache[iter].lock); } } /** * cipso_v4_cache_check - Check the CIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is geater than * CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error. * */ static int cipso_v4_cache_check(const unsigned char *key, u32 key_len, struct netlbl_lsm_secattr *secattr) { u32 bkt; struct cipso_v4_map_cache_entry *entry; struct cipso_v4_map_cache_entry *prev_entry = NULL; u32 hash; if (!READ_ONCE(cipso_v4_cache_enabled)) return -ENOENT; hash = cipso_v4_map_cache_hash(key, key_len); bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) { if (entry->hash == hash && entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; refcount_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CIPSOV4; if (!prev_entry) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } if (prev_entry->activity > 0) prev_entry->activity -= 1; if (entry->activity > prev_entry->activity && entry->activity - prev_entry->activity > CIPSO_V4_CACHE_REORDERLIMIT) { __list_del(entry->list.prev, entry->list.next); __list_add(&entry->list, prev_entry->list.prev, &prev_entry->list); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } prev_entry = entry; } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return -ENOENT; } /** * cipso_v4_cache_add - Add an entry to the CIPSO cache * @cipso_ptr: pointer to CIPSO IP option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CIPSO label mapping cache. Add the new entry to * head of the cache bucket's list, if the cache bucket is out of room remove * the last entry in the list first. It is important to note that there is * currently no checking for duplicate keys. Returns zero on success, * negative values on failure. * */ int cipso_v4_cache_add(const unsigned char *cipso_ptr, const struct netlbl_lsm_secattr *secattr) { int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize); int ret_val = -EPERM; u32 bkt; struct cipso_v4_map_cache_entry *entry = NULL; struct cipso_v4_map_cache_entry *old_entry = NULL; u32 cipso_ptr_len; if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0) return 0; cipso_ptr_len = cipso_ptr[1]; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC); if (!entry->key) { ret_val = -ENOMEM; goto cache_add_failure; } entry->key_len = cipso_ptr_len; entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len); refcount_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); if (cipso_v4_cache[bkt].size < bkt_size) { list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache[bkt].size += 1; } else { old_entry = list_entry(cipso_v4_cache[bkt].list.prev, struct cipso_v4_map_cache_entry, list); list_del(&old_entry->list); list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache_entry_free(old_entry); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; cache_add_failure: if (entry) cipso_v4_cache_entry_free(entry); return ret_val; } /* * DOI List Functions */ /** * cipso_v4_doi_search - Searches for a DOI definition * @doi: the DOI to search for * * Description: * Search the DOI definition list for a DOI definition with a DOI value that * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. */ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) { struct cipso_v4_doi *iter; list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see cipso_ipv4.h for details). Returns * zero on success and non-zero on failure. * */ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 iter; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN) goto doi_add_return; for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: break; case CIPSO_V4_TAG_RANGE: case CIPSO_V4_TAG_ENUM: if (doi_def->type != CIPSO_V4_MAP_PASS) goto doi_add_return; break; case CIPSO_V4_TAG_LOCAL: if (doi_def->type != CIPSO_V4_MAP_LOCAL) goto doi_add_return; break; case CIPSO_V4_TAG_INVALID: if (iter == 0) goto doi_add_return; break; default: goto doi_add_return; } } refcount_set(&doi_def->refcount, 1); spin_lock(&cipso_v4_doi_list_lock); if (cipso_v4_doi_search(doi_def->doi)) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list); spin_unlock(&cipso_v4_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CIPSO_V4_MAP_TRANS: type_str = "trans"; break; case CIPSO_V4_MAP_PASS: type_str = "pass"; break; case CIPSO_V4_MAP_LOCAL: type_str = "local"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " cipso_doi=%u cipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) { if (!doi_def) return; switch (doi_def->type) { case CIPSO_V4_MAP_TRANS: kfree(doi_def->map.std->lvl.cipso); kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); kfree(doi_def->map.std); break; } kfree(doi_def); } /** * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void cipso_v4_doi_free_rcu(struct rcu_head *entry) { struct cipso_v4_doi *doi_def; doi_def = container_of(entry, struct cipso_v4_doi, rcu); cipso_v4_doi_free(doi_def); } /** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct cipso_v4_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&cipso_v4_doi_list_lock); doi_def = cipso_v4_doi_search(doi); if (!doi_def) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&cipso_v4_doi_list_lock); cipso_v4_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " cipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * rcu_read_lock() is held while accessing the returned definition and the DOI * definition reference count is decremented when the caller is done. * */ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi) { struct cipso_v4_doi *doi_def; rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * cipso_v4_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from cipso_v4_doi_getdef(). * */ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; cipso_v4_cache_invalidate(); call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); } /** * cipso_v4_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ int cipso_v4_doi_walk(u32 *skip_cnt, int (*callback) (struct cipso_v4_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct cipso_v4_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /* * Label Mapping Functions */ /** * cipso_v4_map_lvl_valid - Checks to see if the given level is understood * @doi_def: the DOI definition * @level: the level to check * * Description: * Checks the given level against the given DOI definition and returns a * negative value if the level does not have a valid mapping and a zero value * if the level is defined by the DOI. * */ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: if ((level < doi_def->map.std->lvl.cipso_size) && (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)) return 0; break; } return -EFAULT; } /** * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network * @doi_def: the DOI definition * @host_lvl: the host MLS level * @net_lvl: the network/CIPSO MLS level * * Description: * Perform a label mapping to translate a local MLS level to the correct * CIPSO level using the given DOI definition. Returns zero on success, * negative values otherwise. * */ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def, u32 host_lvl, u32 *net_lvl) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *net_lvl = host_lvl; return 0; case CIPSO_V4_MAP_TRANS: if (host_lvl < doi_def->map.std->lvl.local_size && doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) { *net_lvl = doi_def->map.std->lvl.local[host_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host * @doi_def: the DOI definition * @net_lvl: the network/CIPSO MLS level * @host_lvl: the host MLS level * * Description: * Perform a label mapping to translate a CIPSO level to the correct local MLS * level using the given DOI definition. Returns zero on success, negative * values otherwise. * */ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def, u32 net_lvl, u32 *host_lvl) { struct cipso_v4_std_map_tbl *map_tbl; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *host_lvl = net_lvl; return 0; case CIPSO_V4_MAP_TRANS: map_tbl = doi_def->map.std; if (net_lvl < map_tbl->lvl.cipso_size && map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) { *host_lvl = doi_def->map.std->lvl.cipso[net_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid * @doi_def: the DOI definition * @bitmap: category bitmap * @bitmap_len: bitmap length in bytes * * Description: * Checks the given category bitmap against the given DOI definition and * returns a negative value if any of the categories in the bitmap do not have * a valid mapping and a zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, const unsigned char *bitmap, u32 bitmap_len) { int cat = -1; u32 bitmap_len_bits = bitmap_len * 8; u32 cipso_cat_size; u32 *cipso_array; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { cat = netlbl_bitmap_walk(bitmap, bitmap_len_bits, cat + 1, 1); if (cat < 0) break; if (cat >= cipso_cat_size || cipso_array[cat] >= CIPSO_V4_INV_CAT) return -EFAULT; } if (cat == -1) return 0; break; } return -EFAULT; } /** * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO bitmap using the given DOI definition. Returns the minimum * size in bytes of the network bitmap on success, negative values otherwise. * */ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int host_spot = -1; u32 net_spot = CIPSO_V4_INV_CAT; u32 net_spot_max = 0; u32 net_clen_bits = net_cat_len * 8; u32 host_cat_size = 0; u32 *host_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { host_cat_size = doi_def->map.std->cat.local_size; host_cat_array = doi_def->map.std->cat.local; } for (;;) { host_spot = netlbl_catmap_walk(secattr->attr.mls.cat, host_spot + 1); if (host_spot < 0) break; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: net_spot = host_spot; break; case CIPSO_V4_MAP_TRANS: if (host_spot >= host_cat_size) return -EPERM; net_spot = host_cat_array[host_spot]; if (net_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } if (net_spot >= net_clen_bits) return -ENOSPC; netlbl_bitmap_setbit(net_cat, net_spot, 1); if (net_spot > net_spot_max) net_spot_max = net_spot; } if (++net_spot_max % 8) return net_spot_max / 8 + 1; return net_spot_max / 8; } /** * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO bitmap to the correct local * MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; int net_spot = -1; u32 host_spot = CIPSO_V4_INV_CAT; u32 net_clen_bits = net_cat_len * 8; u32 net_cat_size = 0; u32 *net_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { net_cat_size = doi_def->map.std->cat.cipso_size; net_cat_array = doi_def->map.std->cat.cipso; } for (;;) { net_spot = netlbl_bitmap_walk(net_cat, net_clen_bits, net_spot + 1, 1); if (net_spot < 0) return 0; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: host_spot = net_spot; break; case CIPSO_V4_MAP_TRANS: if (net_spot >= net_cat_size) return -EPERM; host_spot = net_cat_array[net_spot]; if (host_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, host_spot, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return -EINVAL; } /** * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @enumcat: category list * @enumcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def, const unsigned char *enumcat, u32 enumcat_len) { u16 cat; int cat_prev = -1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01) return -EFAULT; for (iter = 0; iter < enumcat_len; iter += 2) { cat = get_unaligned_be16(&enumcat[iter]); if (cat <= cat_prev) return -EFAULT; cat_prev = cat; } return 0; } /** * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int cat = -1; u32 cat_iter = 0; for (;;) { cat = netlbl_catmap_walk(secattr->attr.mls.cat, cat + 1); if (cat < 0) break; if ((cat_iter + 2) > net_cat_len) return -ENOSPC; *((__be16 *)&net_cat[cat_iter]) = htons(cat); cat_iter += 2; } return cat_iter; } /** * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; for (iter = 0; iter < net_cat_len; iter += 2) { ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, get_unaligned_be16(&net_cat[iter]), GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /** * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @rngcat: category list * @rngcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def, const unsigned char *rngcat, u32 rngcat_len) { u16 cat_high; u16 cat_low; u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01) return -EFAULT; for (iter = 0; iter < rngcat_len; iter += 4) { cat_high = get_unaligned_be16(&rngcat[iter]); if ((iter + 4) <= rngcat_len) cat_low = get_unaligned_be16(&rngcat[iter + 2]); else cat_low = 0; if (cat_high > cat_prev) return -EFAULT; cat_prev = cat_low; } return 0; } /** * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int iter = -1; u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; u32 array_cnt = 0; u32 cat_size = 0; /* make sure we don't overflow the 'array[]' variable */ if (net_cat_len > (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) return -ENOSPC; for (;;) { iter = netlbl_catmap_walk(secattr->attr.mls.cat, iter + 1); if (iter < 0) break; cat_size += (iter == 0 ? 0 : sizeof(u16)); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; iter = netlbl_catmap_walkrng(secattr->attr.mls.cat, iter); if (iter < 0) return -EFAULT; cat_size += sizeof(u16); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; } for (iter = 0; array_cnt > 0;) { *((__be16 *)&net_cat[iter]) = htons(array[--array_cnt]); iter += 2; array_cnt--; if (array[array_cnt] != 0) { *((__be16 *)&net_cat[iter]) = htons(array[array_cnt]); iter += 2; } } return cat_size; } /** * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 net_iter; u16 cat_low; u16 cat_high; for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) { cat_high = get_unaligned_be16(&net_cat[net_iter]); if ((net_iter + 4) <= net_cat_len) cat_low = get_unaligned_be16(&net_cat[net_iter + 2]); else cat_low = 0; ret_val = netlbl_catmap_setrng(&secattr->attr.mls.cat, cat_low, cat_high, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /* * Protocol Handling Functions */ /** * cipso_v4_gentag_hdr - Generate a CIPSO option header * @doi_def: the DOI definition * @len: the total tag length in bytes, not including this header * @buf: the CIPSO option buffer * * Description: * Write a CIPSO header into the beginning of @buffer. * */ static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def, unsigned char *buf, u32 len) { buf[0] = IPOPT_CIPSO; buf[1] = CIPSO_V4_HDR_LEN + len; put_unaligned_be32(doi_def->doi, &buf[2]); } /** * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the restricted bitmap tag, tag type #1. The * actual buffer length may be larger than the indicated size due to * translation between host and network category bitmaps. Returns the size of * the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rbm_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; /* This will send packets using the "optimized" format when * possible as specified in section 3.4.2.6 of the * CIPSO draft. */ if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 && ret_val <= 10) tag_len = 14; else tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RBITMAP; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the * size of the tag on success, negative values on failure. * */ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_enum_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_ENUM; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO enumerated tag (tag type #2) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the ranged tag, tag type #5. Returns the * size of the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rng_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RANGE; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO ranged tag (tag type #5) and return the security attributes * in @secattr. Return zero on success, negatives values on failure. * */ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the local tag. Returns the size of the tag * on success, negative values on failure. * */ static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { if (!(secattr->flags & NETLBL_SECATTR_SECID)) return -EPERM; buffer[0] = CIPSO_V4_TAG_LOCAL; buffer[1] = CIPSO_V4_TAG_LOC_BLEN; *(u32 *)&buffer[2] = secattr->attr.secid; return CIPSO_V4_TAG_LOC_BLEN; } /** * cipso_v4_parsetag_loc - Parse a CIPSO local tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO local tag and return the security attributes in @secattr. * Return zero on success, negatives values on failure. * */ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { secattr->attr.secid = *(u32 *)&tag[2]; secattr->flags |= NETLBL_SECATTR_SECID; return 0; } /** * cipso_v4_optptr - Find the CIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CIPSO option. Returns a pointer * to the start of the CIPSO option on success, NULL if one is not found. * */ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]); int optlen; int taglen; for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) { switch (optptr[0]) { case IPOPT_END: return NULL; case IPOPT_NOOP: taglen = 1; break; default: taglen = optptr[1]; } if (!taglen || taglen > optlen) return NULL; if (optptr[0] == IPOPT_CIPSO) return optptr; optlen -= taglen; optptr += taglen; } return NULL; } /** * cipso_v4_validate - Validate a CIPSO option * @skb: the packet * @option: the start of the option, on error it is set to point to the error * * Description: * This routine is called to validate a CIPSO option, it checks all of the * fields to ensure that they are at least valid, see the draft snippet below * for details. If the option is valid then a zero value is returned and * the value of @option is unchanged. If the option is invalid then a * non-zero value is returned and @option is adjusted to point to the * offending portion of the option. From the IETF draft ... * * "If any field within the CIPSO options, such as the DOI identifier, is not * recognized the IP datagram is discarded and an ICMP 'parameter problem' * (type 12) is generated and returned. The ICMP code field is set to 'bad * parameter' (code 0) and the pointer is set to the start of the CIPSO field * that is unrecognized." * */ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) { unsigned char *opt = *option; unsigned char *tag; unsigned char opt_iter; unsigned char err_offset = 0; u8 opt_len; u8 tag_len; struct cipso_v4_doi *doi_def = NULL; u32 tag_iter; /* caller already checks for length values that are too large */ opt_len = opt[1]; if (opt_len < 8) { err_offset = 1; goto validate_return; } rcu_read_lock(); doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2])); if (!doi_def) { err_offset = 2; goto validate_return_locked; } opt_iter = CIPSO_V4_HDR_LEN; tag = opt + opt_iter; while (opt_iter < opt_len) { for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];) if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID || ++tag_iter == CIPSO_V4_TAG_MAXCNT) { err_offset = opt_iter; goto validate_return_locked; } if (opt_iter + 1 == opt_len) { err_offset = opt_iter; goto validate_return_locked; } tag_len = tag[1]; if (tag_len > (opt_len - opt_iter)) { err_offset = opt_iter + 1; goto validate_return_locked; } switch (tag[0]) { case CIPSO_V4_TAG_RBITMAP: if (tag_len < CIPSO_V4_TAG_RBM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } /* We are already going to do all the verification * necessary at the socket layer so from our point of * view it is safe to turn these checks off (and less * work), however, the CIPSO draft says we should do * all the CIPSO validations here but it doesn't * really specify _exactly_ what we need to validate * ... so, just make it a sysctl tunable. */ if (READ_ONCE(cipso_v4_rbm_strictvalid)) { if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RBM_BLEN && cipso_v4_map_cat_rbm_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } } break; case CIPSO_V4_TAG_ENUM: if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_ENUM_BLEN && cipso_v4_map_cat_enum_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_RANGE: if (tag_len < CIPSO_V4_TAG_RNG_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RNG_BLEN && cipso_v4_map_cat_rng_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_LOCAL: /* This is a non-standard tag that we only allow for * local connections, so if the incoming interface is * not the loopback device drop the packet. Further, * there is no legitimate reason for setting this from * userspace so reject it if skb is NULL. */ if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) { err_offset = opt_iter; goto validate_return_locked; } if (tag_len != CIPSO_V4_TAG_LOC_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } break; default: err_offset = opt_iter; goto validate_return_locked; } tag += tag_len; opt_iter += tag_len; } validate_return_locked: rcu_read_unlock(); validate_return: *option = opt + err_offset; return err_offset; } /** * cipso_v4_error - Send the correct response for a bad packet * @skb: the packet * @error: the error code * @gateway: CIPSO gateway flag * * Description: * Based on the error code given in @error, send an ICMP error message back to * the originating host. From the IETF draft ... * * "If the contents of the CIPSO [option] are valid but the security label is * outside of the configured host or port label range, the datagram is * discarded and an ICMP 'destination unreachable' (type 3) is generated and * returned. The code field of the ICMP is set to 'communication with * destination network administratively prohibited' (code 9) or to * 'communication with destination host administratively prohibited' * (code 10). The value of the code is dependent on whether the originator * of the ICMP message is acting as a CIPSO host or a CIPSO gateway. The * recipient of the ICMP message MUST be able to handle either value. The * same procedure is performed if a CIPSO [option] can not be added to an * IP packet because it is too large to fit in the IP options area." * * "If the error is triggered by receipt of an ICMP message, the message is * discarded and no response is permitted (consistent with general ICMP * processing rules)." * */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; /* * We might be called above the IP layer, * so we can not use icmp_send and IPCB here. */ memset(opt, 0, sizeof(struct ip_options)); opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); rcu_read_lock(); res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); else __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); } /** * cipso_v4_genopt - Generate a CIPSO option * @buf: the option buffer * @buf_len: the size of opt_buf * @doi_def: the CIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CIPSO option using the DOI definition and security attributes * passed to the function. Returns the length of the option on success and * negative values on failure. * */ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; if (buf_len <= CIPSO_V4_HDR_LEN) return -ENOSPC; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. */ iter = 0; do { memset(buf, 0, buf_len); switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_gentag_rbm(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_gentag_enum(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_gentag_rng(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_gentag_loc(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; default: return -EPERM; } iter++; } while (ret_val < 0 && iter < CIPSO_V4_TAG_MAXCNT && doi_def->tags[iter] != CIPSO_V4_TAG_INVALID); if (ret_val < 0) return ret_val; cipso_v4_gentag_hdr(doi_def, buf, ret_val); return CIPSO_V4_HDR_LEN + ret_val; } static int cipso_v4_get_actual_opt_len(const unsigned char *data, int len) { int iter = 0, optlen = 0; /* determining the new total option length is tricky because of * the padding necessary, the only thing i can think to do at * this point is walk the options one-by-one, skipping the * padding at the end to determine the actual option size and * from there we can determine the new total option length */ while (iter < len) { if (data[iter] == IPOPT_END) { break; } else if (data[iter] == IPOPT_NOP) { iter++; } else { iter += data[iter + 1]; optlen = iter; } } return optlen; } /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * @sk_locked: true if caller holds the socket lock * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, bool sk_locked) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *old, *opt = NULL; struct inet_sock *sk_inet; struct inet_connection_sock *sk_conn; /* In the case of sock_create_lite(), the sock->sk field is not * defined yet but it is not a problem as the only users of these * "lite" PF_INET sockets are functions which do an accept() call * afterwards so we will label the socket as part of the accept(). */ if (!sk) return 0; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto socket_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto socket_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. */ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto socket_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; sk_inet = inet_sk(sk); old = rcu_dereference_protected(sk_inet->inet_opt, sk_locked); if (inet_test_bit(IS_ICSK, sk)) { sk_conn = inet_csk(sk); if (old) sk_conn->icsk_ext_hdr_len -= old->opt.optlen; sk_conn->icsk_ext_hdr_len += opt->opt.optlen; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } rcu_assign_pointer(sk_inet->inet_opt, opt); if (old) kfree_rcu(old, rcu); return 0; socket_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ int cipso_v4_req_setattr(struct request_sock *req, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *opt = NULL; struct inet_request_sock *req_inet; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto req_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto req_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. */ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto req_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; req_inet = inet_rsk(req); opt = unrcu_pointer(xchg(&req_inet->ireq_opt, RCU_INITIALIZER(opt))); if (opt) kfree_rcu(opt, rcu); return 0; req_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_delopt - Delete the CIPSO option from a set of IP options * @opt_ptr: IP option pointer * * Description: * Deletes the CIPSO IP option from a set of IP options and makes the necessary * adjustments to the IP option structure. Returns zero on success, negative * values on failure. * */ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) { struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1); int hdr_delta = 0; if (!opt || opt->opt.cipso == 0) return 0; if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; int optlen_new; cipso_off = opt->opt.cipso - sizeof(struct iphdr); cipso_ptr = &opt->opt.__data[cipso_off]; cipso_len = cipso_ptr[1]; if (opt->opt.srr > opt->opt.cipso) opt->opt.srr -= cipso_len; if (opt->opt.rr > opt->opt.cipso) opt->opt.rr -= cipso_len; if (opt->opt.ts > opt->opt.cipso) opt->opt.ts -= cipso_len; if (opt->opt.router_alert > opt->opt.cipso) opt->opt.router_alert -= cipso_len; opt->opt.cipso = 0; memmove(cipso_ptr, cipso_ptr + cipso_len, opt->opt.optlen - cipso_off - cipso_len); optlen_new = cipso_v4_get_actual_opt_len(opt->opt.__data, opt->opt.optlen); hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; } else { /* only the cipso option was present on the socket so we can * remove the entire option struct */ *opt_ptr = NULL; hdr_delta = opt->opt.optlen; kfree_rcu(opt, rcu); } return hdr_delta; } /** * cipso_v4_sock_delattr - Delete the CIPSO option from a socket * @sk: the socket * * Description: * Removes the CIPSO option from a socket, if present. * */ void cipso_v4_sock_delattr(struct sock *sk) { struct inet_sock *sk_inet; int hdr_delta; sk_inet = inet_sk(sk); hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) { struct inet_connection_sock *sk_conn = inet_csk(sk); sk_conn->icsk_ext_hdr_len -= hdr_delta; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } } /** * cipso_v4_req_delattr - Delete the CIPSO option from a request socket * @req: the request socket * * Description: * Removes the CIPSO option from a request socket, if present. * */ void cipso_v4_req_delattr(struct request_sock *req) { cipso_v4_delopt(&inet_rsk(req)->ireq_opt); } /** * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions * @cipso: the CIPSO v4 option * @secattr: the security attributes * * Description: * Inspect @cipso and return the security attributes in @secattr. Returns zero * on success and negative values on failure. * */ int cipso_v4_getattr(const unsigned char *cipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi; struct cipso_v4_doi *doi_def; if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(&cipso[2]); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto getattr_return; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. */ switch (cipso[6]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr); break; } if (ret_val == 0) secattr->type = NETLBL_NLTYPE_CIPSOV4; getattr_return: rcu_read_unlock(); return ret_val; } /** * cipso_v4_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CIPSO option attached to the sock and if * there is return the CIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { struct ip_options_rcu *opt; int res = -ENOMSG; rcu_read_lock(); opt = rcu_dereference(inet_sk(sk)->inet_opt); if (opt && opt->opt.cipso) res = cipso_v4_getattr(opt->opt.__data + opt->opt.cipso - sizeof(struct iphdr), secattr); rcu_read_unlock(); return res; } /** * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet * @skb: the packet * @doi_def: the DOI structure * @secattr: the security attributes * * Description: * Set the CIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ int cipso_v4_skbuff_setattr(struct sk_buff *skb, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char buf[CIPSO_V4_OPT_LEN_MAX]; u32 buf_len = CIPSO_V4_OPT_LEN_MAX; u32 opt_len; int len_delta; ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) return ret_val; buf_len = ret_val; opt_len = (buf_len + 3) & ~3; /* we overwrite any existing options to ensure that we have enough * room for the CIPSO option, the reason is that we _need_ to guarantee * that the security label is applied to the packet - we do the same * thing when using the socket options and it hasn't caused a problem, * if we need to we can always revisit this choice later */ len_delta = opt_len - opt->optlen; /* if we don't ensure enough headroom we could panic on the skb_push() * call below so make sure we have enough, we are also "mangling" the * packet so we should probably do a copy-on-write call anyway */ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); if (ret_val < 0) return ret_val; if (len_delta > 0) { /* we assume that the header + opt->optlen have already been * "pushed" in ip_options_build() or similar */ iph = ip_hdr(skb); skb_push(skb, len_delta); memmove((char *)iph - len_delta, iph, iph->ihl << 2); skb_reset_network_header(skb); iph = ip_hdr(skb); } else if (len_delta < 0) { iph = ip_hdr(skb); memset(iph + 1, IPOPT_NOP, opt->optlen); } else iph = ip_hdr(skb); if (opt->optlen > 0) memset(opt, 0, sizeof(*opt)); opt->optlen = opt_len; opt->cipso = sizeof(struct iphdr); opt->is_changed = 1; /* we have to do the following because we are being called from a * netfilter hook which means the packet already has had the header * fields populated and the checksum calculated - yes this means we * are doing more work than needed but we do it to keep the core * stack clean and tidy */ memcpy(iph + 1, buf, buf_len); if (opt_len > buf_len) memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); if (len_delta != 0) { iph->ihl = 5 + (opt_len >> 2); iph_set_totlen(iph, skb->len); } ip_send_check(iph); return 0; } /** * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ int cipso_v4_skbuff_delattr(struct sk_buff *skb) { int ret_val, cipso_len, hdr_len_actual, new_hdr_len_actual, new_hdr_len, hdr_len_delta; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char *cipso_ptr; if (opt->cipso == 0) return 0; /* since we are changing the packet we should make a copy */ ret_val = skb_cow(skb, skb_headroom(skb)); if (ret_val < 0) return ret_val; iph = ip_hdr(skb); cipso_ptr = (unsigned char *)iph + opt->cipso; cipso_len = cipso_ptr[1]; hdr_len_actual = sizeof(struct iphdr) + cipso_v4_get_actual_opt_len((unsigned char *)(iph + 1), opt->optlen); new_hdr_len_actual = hdr_len_actual - cipso_len; new_hdr_len = (new_hdr_len_actual + 3) & ~3; hdr_len_delta = (iph->ihl << 2) - new_hdr_len; /* 1. shift any options after CIPSO to the left */ memmove(cipso_ptr, cipso_ptr + cipso_len, new_hdr_len_actual - opt->cipso); /* 2. move the whole IP header to its new place */ memmove((unsigned char *)iph + hdr_len_delta, iph, new_hdr_len_actual); /* 3. adjust the skb layout */ skb_pull(skb, hdr_len_delta); skb_reset_network_header(skb); iph = ip_hdr(skb); /* 4. re-fill new padding with IPOPT_END (may now be longer) */ memset((unsigned char *)iph + new_hdr_len_actual, IPOPT_END, new_hdr_len - new_hdr_len_actual); opt->optlen -= hdr_len_delta; opt->cipso = 0; opt->is_changed = 1; if (hdr_len_delta != 0) { iph->ihl = new_hdr_len >> 2; iph_set_totlen(iph, skb->len); } ip_send_check(iph); return 0; } /* * Setup Functions */ /** * cipso_v4_init - Initialize the CIPSO module * * Description: * Initialize the CIPSO module and prepare it for use. Returns zero on success * and negative values on failure. * */ static int __init cipso_v4_init(void) { int ret_val; ret_val = cipso_v4_cache_init(); if (ret_val != 0) panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n", ret_val); return 0; } subsys_initcall(cipso_v4_init); |
8 8 3 3 1 3 3 3 1 3 3 4 5 5 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/module.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfctrl.h> #include <net/caif/cfmuxl.h> #include <net/caif/cffrml.h> #include <net/caif/cfserl.h> #include <net/caif/cfsrvl.h> #include <net/caif/caif_dev.h> #define container_obj(layr) container_of(layr, struct cfcnfg, layer) /* Information about CAIF physical interfaces held by Config Module in order * to manage physical interfaces */ struct cfcnfg_phyinfo { struct list_head node; bool up; /* Pointer to the layer below the MUX (framing layer) */ struct cflayer *frm_layer; /* Pointer to the lowest actual physical layer */ struct cflayer *phy_layer; /* Unique identifier of the physical interface */ unsigned int id; /* Preference of the physical in interface */ enum cfcnfg_phy_preference pref; /* Information about the physical device */ struct dev_info dev_info; /* Interface index */ int ifindex; /* Protocol head room added for CAIF link layer */ int head_room; /* Use Start of frame checksum */ bool use_fcs; }; struct cfcnfg { struct cflayer layer; struct cflayer *ctrl; struct cflayer *mux; struct list_head phys; struct mutex lock; }; static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer); static void cfctrl_resp_func(void); static void cfctrl_enum_resp(void); struct cfcnfg *cfcnfg_create(void) { struct cfcnfg *this; struct cfctrl_rsp *resp; might_sleep(); /* Initiate this layer */ this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); if (!this) return NULL; this->mux = cfmuxl_create(); if (!this->mux) goto out_of_mem; this->ctrl = cfctrl_create(); if (!this->ctrl) goto out_of_mem; /* Initiate response functions */ resp = cfctrl_get_respfuncs(this->ctrl); resp->enum_rsp = cfctrl_enum_resp; resp->linkerror_ind = cfctrl_resp_func; resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp; resp->sleep_rsp = cfctrl_resp_func; resp->wake_rsp = cfctrl_resp_func; resp->restart_rsp = cfctrl_resp_func; resp->radioset_rsp = cfctrl_resp_func; resp->linksetup_rsp = cfcnfg_linkup_rsp; resp->reject_rsp = cfcnfg_reject_rsp; INIT_LIST_HEAD(&this->phys); cfmuxl_set_uplayer(this->mux, this->ctrl, 0); layer_set_dn(this->ctrl, this->mux); layer_set_up(this->ctrl, this); mutex_init(&this->lock); return this; out_of_mem: synchronize_rcu(); kfree(this->mux); kfree(this->ctrl); kfree(this); return NULL; } void cfcnfg_remove(struct cfcnfg *cfg) { might_sleep(); if (cfg) { synchronize_rcu(); kfree(cfg->mux); cfctrl_remove(cfg->ctrl); kfree(cfg); } } static void cfctrl_resp_func(void) { } static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg, u8 phyid) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->id == phyid) return phy; return NULL; } static void cfctrl_enum_resp(void) { } static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg, enum cfcnfg_phy_preference phy_pref) { /* Try to match with specified preference */ struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) { if (phy->up && phy->pref == phy_pref && phy->frm_layer != NULL) return &phy->dev_info; } /* Otherwise just return something */ list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->up) return &phy->dev_info; return NULL; } static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->ifindex == ifi && phy->up) return phy->id; return -ENODEV; } int caif_disconnect_client(struct net *net, struct cflayer *adap_layer) { u8 channel_id; struct cfcnfg *cfg = get_cfcnfg(net); caif_assert(adap_layer != NULL); cfctrl_cancel_req(cfg->ctrl, adap_layer); channel_id = adap_layer->id; if (channel_id != 0) { struct cflayer *servl; servl = cfmuxl_remove_uplayer(cfg->mux, channel_id); cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer); if (servl != NULL) layer_set_up(servl, NULL); } else pr_debug("nothing to disconnect\n"); /* Do RCU sync before initiating cleanup */ synchronize_rcu(); if (adap_layer->ctrlcmd != NULL) adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0); return 0; } EXPORT_SYMBOL(caif_disconnect_client); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id) { } static const int protohead[CFCTRL_SRV_MASK] = { [CFCTRL_SRV_VEI] = 4, [CFCTRL_SRV_DATAGRAM] = 7, [CFCTRL_SRV_UTIL] = 4, [CFCTRL_SRV_RFM] = 3, [CFCTRL_SRV_DBG] = 3, }; static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, struct caif_connect_request *s, struct cfctrl_link_param *l) { struct dev_info *dev_info; enum cfcnfg_phy_preference pref; int res; memset(l, 0, sizeof(*l)); /* In caif protocol low value is high priority */ l->priority = CAIF_PRIO_MAX - s->priority + 1; if (s->ifindex != 0) { res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex); if (res < 0) return res; l->phyid = res; } else { switch (s->link_selector) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: return -EINVAL; } dev_info = cfcnfg_get_phyid(cnfg, pref); if (dev_info == NULL) return -ENODEV; l->phyid = dev_info->id; } switch (s->protocol) { case CAIFPROTO_AT: l->linktype = CFCTRL_SRV_VEI; l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3; l->chtype = s->sockaddr.u.at.type & 0x3; break; case CAIFPROTO_DATAGRAM: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_DATAGRAM_LOOP: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x03; l->endpoint = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; strscpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; strscpy(l->u.utility.name, s->sockaddr.u.util.service, sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) l->u.utility.paramlen = sizeof(l->u.utility.params); memcpy(l->u.utility.params, s->param.data, l->u.utility.paramlen); break; case CAIFPROTO_DEBUG: l->linktype = CFCTRL_SRV_DBG; l->endpoint = s->sockaddr.u.dbg.service; l->chtype = s->sockaddr.u.dbg.type; break; default: return -EINVAL; } return 0; } int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, struct cflayer *adap_layer, int *ifindex, int *proto_head, int *proto_tail) { struct cflayer *frml; struct cfcnfg_phyinfo *phy; int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, ¶m); if (err) goto unlock; phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid); if (!phy) { err = -ENODEV; goto unlock; } err = -EINVAL; if (adap_layer == NULL) { pr_err("adap_layer is zero\n"); goto unlock; } if (adap_layer->receive == NULL) { pr_err("adap_layer->receive is NULL\n"); goto unlock; } if (adap_layer->ctrlcmd == NULL) { pr_err("adap_layer->ctrlcmd == NULL\n"); goto unlock; } err = -ENODEV; frml = phy->frm_layer; if (frml == NULL) { pr_err("Specified PHY type does not exist!\n"); goto unlock; } caif_assert(param.phyid == phy->id); caif_assert(phy->frm_layer->id == param.phyid); caif_assert(phy->phy_layer->id == param.phyid); *ifindex = phy->ifindex; *proto_tail = 2; *proto_head = protohead[param.linktype] + phy->head_room; rcu_read_unlock(); /* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */ cfctrl_enum_req(cfg->ctrl, param.phyid); return cfctrl_linkup_request(cfg->ctrl, ¶m, adap_layer); unlock: rcu_read_unlock(); return err; } EXPORT_SYMBOL(caif_connect_client); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer) { if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); } static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer) { struct cfcnfg *cnfg = container_obj(layer); struct cflayer *servicel = NULL; struct cfcnfg_phyinfo *phyinfo; struct net_device *netdev; if (channel_id == 0) { pr_warn("received channel_id zero\n"); if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); return; } rcu_read_lock(); if (adapt_layer == NULL) { pr_debug("link setup response but no client exist, send linkdown back\n"); cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); goto unlock; } caif_assert(cnfg != NULL); caif_assert(phyid != 0); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { pr_err("ERROR: Link Layer Device disappeared while connecting\n"); goto unlock; } caif_assert(phyinfo != NULL); caif_assert(phyinfo->id == phyid); caif_assert(phyinfo->phy_layer != NULL); caif_assert(phyinfo->phy_layer->id == phyid); adapt_layer->id = channel_id; switch (serv) { case CFCTRL_SRV_VEI: servicel = cfvei_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DATAGRAM: servicel = cfdgml_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_RFM: netdev = phyinfo->dev_info.dev; servicel = cfrfml_create(channel_id, &phyinfo->dev_info, netdev->mtu); break; case CFCTRL_SRV_UTIL: servicel = cfutill_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_VIDEO: servicel = cfvidl_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DBG: servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); break; default: pr_err("Protocol error. Link setup response - unknown channel type\n"); goto unlock; } if (!servicel) goto unlock; layer_set_dn(servicel, cnfg->mux); cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id); layer_set_up(servicel, adapt_layer); layer_set_dn(adapt_layer, servicel); rcu_read_unlock(); servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0); return; unlock: rcu_read_unlock(); } int cfcnfg_add_phy_layer(struct cfcnfg *cnfg, struct net_device *dev, struct cflayer *phy_layer, enum cfcnfg_phy_preference pref, struct cflayer *link_support, bool fcs, int head_room) { struct cflayer *frml; struct cfcnfg_phyinfo *phyinfo = NULL; int i, res = 0; u8 phyid; mutex_lock(&cnfg->lock); /* CAIF protocol allow maximum 6 link-layers */ for (i = 0; i < 7; i++) { phyid = (dev->ifindex + i) & 0x7; if (phyid == 0) continue; if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL) goto got_phyid; } pr_warn("Too many CAIF Link Layers (max 6)\n"); res = -EEXIST; goto out; got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; goto out; } phy_layer->id = phyid; phyinfo->pref = pref; phyinfo->id = phyid; phyinfo->dev_info.id = phyid; phyinfo->dev_info.dev = dev; phyinfo->phy_layer = phy_layer; phyinfo->ifindex = dev->ifindex; phyinfo->head_room = head_room; phyinfo->use_fcs = fcs; frml = cffrml_create(phyid, fcs); if (!frml) { res = -ENOMEM; goto out_err; } phyinfo->frm_layer = frml; layer_set_up(frml, cnfg->mux); if (link_support != NULL) { link_support->id = phyid; layer_set_dn(frml, link_support); layer_set_up(link_support, frml); layer_set_dn(link_support, phy_layer); layer_set_up(phy_layer, link_support); } else { layer_set_dn(frml, phy_layer); layer_set_up(phy_layer, frml); } list_add_rcu(&phyinfo->node, &cnfg->phys); out: mutex_unlock(&cnfg->lock); return res; out_err: kfree(phyinfo); mutex_unlock(&cnfg->lock); return res; } EXPORT_SYMBOL(cfcnfg_add_phy_layer); int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer, bool up) { struct cfcnfg_phyinfo *phyinfo; rcu_read_lock(); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id); if (phyinfo == NULL) { rcu_read_unlock(); return -ENODEV; } if (phyinfo->up == up) { rcu_read_unlock(); return 0; } phyinfo->up = up; if (up) { cffrml_hold(phyinfo->frm_layer); cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer, phy_layer->id); } else { cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id); cffrml_put(phyinfo->frm_layer); } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(cfcnfg_set_phy_state); int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer) { struct cflayer *frml, *frml_dn; u16 phyid; struct cfcnfg_phyinfo *phyinfo; might_sleep(); mutex_lock(&cnfg->lock); phyid = phy_layer->id; phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { mutex_unlock(&cnfg->lock); return 0; } caif_assert(phyid == phyinfo->id); caif_assert(phy_layer == phyinfo->phy_layer); caif_assert(phy_layer->id == phyid); caif_assert(phyinfo->frm_layer->id == phyid); list_del_rcu(&phyinfo->node); synchronize_rcu(); /* Fail if reference count is not zero */ if (cffrml_refcnt_read(phyinfo->frm_layer) != 0) { pr_info("Wait for device inuse\n"); list_add_rcu(&phyinfo->node, &cnfg->phys); mutex_unlock(&cnfg->lock); return -EAGAIN; } frml = phyinfo->frm_layer; frml_dn = frml->dn; cffrml_set_uplayer(frml, NULL); cffrml_set_dnlayer(frml, NULL); if (phy_layer != frml_dn) { layer_set_up(frml_dn, NULL); layer_set_dn(frml_dn, NULL); } layer_set_up(phy_layer, NULL); if (phyinfo->phy_layer != frml_dn) kfree(frml_dn); cffrml_free(frml); kfree(phyinfo); mutex_unlock(&cnfg->lock); return 0; } EXPORT_SYMBOL(cfcnfg_del_phy_layer); |
2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | // SPDX-License-Identifier: GPL-2.0-only /* * ebt_nflog * * Author: * Peter Warasin <peter@endian.com> * * February, 2008 * * Based on: * xt_NFLOG.c, (C) 2006 by Patrick McHardy <kaber@trash.net> * ebt_ulog.c, (C) 2004 by Bart De Schuymer <bdschuym@pandora.be> * */ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_nflog.h> #include <net/netfilter/nf_log.h> static unsigned int ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nflog_info *info = par->targinfo; struct net *net = xt_net(par); struct nf_loginfo li; li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; li.u.ulog.flags = 0; nf_log_packet(net, PF_BRIDGE, xt_hooknum(par), skb, xt_in(par), xt_out(par), &li, "%s", info->prefix); return EBT_CONTINUE; } static int ebt_nflog_tg_check(const struct xt_tgchk_param *par) { struct ebt_nflog_info *info = par->targinfo; if (info->flags & ~EBT_NFLOG_MASK) return -EINVAL; info->prefix[EBT_NFLOG_PREFIX_SIZE - 1] = '\0'; return 0; } static struct xt_target ebt_nflog_tg_reg __read_mostly = { .name = "nflog", .revision = 0, .family = NFPROTO_BRIDGE, .target = ebt_nflog_tg, .checkentry = ebt_nflog_tg_check, .targetsize = sizeof(struct ebt_nflog_info), .me = THIS_MODULE, }; static int __init ebt_nflog_init(void) { return xt_register_target(&ebt_nflog_tg_reg); } static void __exit ebt_nflog_fini(void) { xt_unregister_target(&ebt_nflog_tg_reg); } module_init(ebt_nflog_init); module_exit(ebt_nflog_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Peter Warasin <peter@endian.com>"); MODULE_DESCRIPTION("ebtables NFLOG netfilter logging module"); |
2 203 204 2 2 2 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | // SPDX-License-Identifier: GPL-2.0 #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { ASSERT_RTNL(); info->family = AF_INET; /* Paired with READ_ONCE() in fib4_seq_read() */ WRITE_ONCE(net->ipv4.fib_seq, net->ipv4.fib_seq + 1); return call_fib_notifiers(net, event_type, info); } static unsigned int fib4_seq_read(const struct net *net) { /* Paired with WRITE_ONCE() in call_fib4_notifiers() */ return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net); } static int fib4_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; err = fib4_rules_dump(net, nb, extack); if (err) return err; return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { .family = AF_INET, .fib_seq_read = fib4_seq_read, .fib_dump = fib4_dump, .owner = THIS_MODULE, }; int __net_init fib4_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv4.fib_seq = 0; ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv4.notifier_ops = ops; return 0; } void __net_exit fib4_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv4.notifier_ops); } |
3 3 2 5 3 1 1 5 1 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 | // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. */ #include <linux/if_arp.h> #include <linux/init.h> #include <linux/slab.h> #include <net/x25.h> LIST_HEAD(x25_route_list); DEFINE_RWLOCK(x25_route_list_lock); /* * Add a new route. */ static int x25_add_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits) goto out; } rt = kmalloc(sizeof(*rt), GFP_ATOMIC); rc = -ENOMEM; if (!rt) goto out; strcpy(rt->address.x25_addr, "000000000000000"); memcpy(rt->address.x25_addr, address->x25_addr, sigdigits); rt->sigdigits = sigdigits; rt->dev = dev; refcount_set(&rt->refcnt, 1); list_add(&rt->node, &x25_route_list); rc = 0; out: write_unlock_bh(&x25_route_list_lock); return rc; } /** * __x25_remove_route - remove route from x25_route_list * @rt: route to remove * * Remove route from x25_route_list. If it was there. * Caller must hold x25_route_list_lock. */ static void __x25_remove_route(struct x25_route *rt) { if (rt->node.next) { list_del(&rt->node); x25_route_put(rt); } } static int x25_del_route(struct x25_address *address, unsigned int sigdigits, struct net_device *dev) { struct x25_route *rt; int rc = -EINVAL; write_lock_bh(&x25_route_list_lock); list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, address, sigdigits) && rt->sigdigits == sigdigits && rt->dev == dev) { __x25_remove_route(rt); rc = 0; break; } } write_unlock_bh(&x25_route_list_lock); return rc; } /* * A device has been removed, remove its routes. */ void x25_route_device_down(struct net_device *dev) { struct x25_route *rt; struct list_head *entry, *tmp; write_lock_bh(&x25_route_list_lock); list_for_each_safe(entry, tmp, &x25_route_list) { rt = list_entry(entry, struct x25_route, node); if (rt->dev == dev) __x25_remove_route(rt); } write_unlock_bh(&x25_route_list_lock); } /* * Check that the device given is a valid X.25 interface that is "up". */ struct net_device *x25_dev_get(char *devname) { struct net_device *dev = dev_get_by_name(&init_net, devname); if (dev && (!(dev->flags & IFF_UP) || dev->type != ARPHRD_X25)) { dev_put(dev); dev = NULL; } return dev; } /** * x25_get_route - Find a route given an X.25 address. * @addr: - address to find a route for * * Find a route given an X.25 address. */ struct x25_route *x25_get_route(struct x25_address *addr) { struct x25_route *rt, *use = NULL; read_lock_bh(&x25_route_list_lock); list_for_each_entry(rt, &x25_route_list, node) { if (!memcmp(&rt->address, addr, rt->sigdigits)) { if (!use) use = rt; else if (rt->sigdigits > use->sigdigits) use = rt; } } if (use) x25_route_hold(use); read_unlock_bh(&x25_route_list_lock); return use; } /* * Handle the ioctls that control the routing functions. */ int x25_route_ioctl(unsigned int cmd, void __user *arg) { struct x25_route_struct rt; struct net_device *dev; int rc = -EINVAL; if (cmd != SIOCADDRT && cmd != SIOCDELRT) goto out; rc = -EFAULT; if (copy_from_user(&rt, arg, sizeof(rt))) goto out; rc = -EINVAL; if (rt.sigdigits > 15) goto out; dev = x25_dev_get(rt.device); if (!dev) goto out; if (cmd == SIOCADDRT) rc = x25_add_route(&rt.address, rt.sigdigits, dev); else rc = x25_del_route(&rt.address, rt.sigdigits, dev); dev_put(dev); out: return rc; } /* * Release all memory associated with X.25 routing structures. */ void __exit x25_route_free(void) { struct x25_route *rt; struct list_head *entry, *tmp; write_lock_bh(&x25_route_list_lock); list_for_each_safe(entry, tmp, &x25_route_list) { rt = list_entry(entry, struct x25_route, node); __x25_remove_route(rt); } write_unlock_bh(&x25_route_list_lock); } |
4 4 12 1 2 4 5 1 5 1 3 3 3 1 3 1 3 2 8 8 8 8 7 8 2 3 4 7 4 3 5 2 8 2 1 1 3 3 1 2 3 1 2 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include <linux/jhash.h> #include <linux/filter.h> #include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> #include <linux/btf_ids.h> #include <linux/buildid.h> #include "percpu_freelist.h" #include "mmap_unlock_work.h" #define STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; u64 data[]; }; struct bpf_stack_map { struct bpf_map map; void *elems; struct pcpu_freelist freelist; u32 n_buckets; struct stack_map_bucket *buckets[] __counted_by(n_buckets); }; static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); } static inline int stack_map_data_size(struct bpf_map *map) { return stack_map_use_build_id(map) ? sizeof(struct bpf_stack_build_id) : sizeof(u64); } static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u64 elem_size = sizeof(struct stack_map_bucket) + (u64)smap->map.value_size; int err; smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, smap->map.numa_node); if (!smap->elems) return -ENOMEM; err = pcpu_freelist_init(&smap->freelist); if (err) goto free_elems; pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, smap->map.max_entries); return 0; free_elems: bpf_map_area_free(smap->elems); return err; } /* Called from syscall */ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; u64 cost, n_buckets; int err; if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8) return ERR_PTR(-EINVAL); BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); if (attr->map_flags & BPF_F_STACK_BUILD_ID) { if (value_size % sizeof(struct bpf_stack_build_id) || value_size / sizeof(struct bpf_stack_build_id) > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2; roundup_pow_of_two() can overflow * into UB on 32-bit arches, so check that first */ if (attr->max_entries > 1UL << 31) return ERR_PTR(-E2BIG); n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) goto free_smap; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; return &smap->map; put_buffers: put_callchain_buffers(); free_smap: bpf_map_area_free(smap); return ERR_PTR(err); } static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault) { return may_fault ? build_id_parse(vma, build_id, NULL) : build_id_parse_nofault(vma, build_id, NULL); } /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: * - either adjusted in place to a file offset, if build ID fetching * succeeds; in this case id_offs[i].build_id is set to correct build ID, * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID; * - or IP will be kept intact, if build ID fetching failed; in this case * id_offs[i].build_id is zeroed out and id_offs[i].status is set to * BPF_STACK_BUILD_ID_IP. */ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u32 trace_nr, bool user, bool may_fault) { int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); struct vm_area_struct *vma, *prev_vma = NULL; const char *prev_build_id; /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. */ if (!user || !current || !current->mm || irq_work_busy || !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { u64 ip = READ_ONCE(id_offs[i].ip); if (range_in_vma(prev_vma, ip, ip)) { vma = prev_vma; memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX); goto build_id_valid; } vma = find_vma(current->mm, ip); if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } build_id_valid: id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; prev_vma = vma; prev_build_id = id_offs[i].build_id; } bpf_mmap_unlock_mm(work, current->mm); } static struct perf_callchain_entry * get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; int rctx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. For 32-bit systems, it is * necessary to fix this mismatch. */ if (__BITS_PER_LONG != 64) { unsigned long *from = (unsigned long *) entry->ip; u64 *to = entry->ip; int i; /* copy data from the end to avoid using extra buffer */ for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } put_callchain_entry(rctx); return entry; #else /* CONFIG_STACKTRACE */ return NULL; #endif } static long __bpf_get_stackid(struct bpf_map *map, struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len, i; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); hash_matches = bucket && bucket->hash == hash; /* fast cmp */ if (hash_matches && flags & BPF_F_FAST_STACK_CMP) return id; if (stack_map_use_build_id(map)) { struct bpf_stack_build_id *id_offs; /* for build_id+offset, pop a bucket before slow cmp */ new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; new_bucket->nr = trace_nr; id_offs = (struct bpf_stack_build_id *)new_bucket->data; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; } if (bucket && !(flags & BPF_F_REUSE_STACKID)) { pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return -EEXIST; } } else { if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, ips, trace_len) == 0) return id; if (bucket && !(flags & BPF_F_REUSE_STACKID)) return -EEXIST; new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; memcpy(new_bucket->data, ips, trace_len); } new_bucket->hash = hash; new_bucket->nr = trace_nr; old_bucket = xchg(&smap->buckets[id], new_bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return id; } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; max_depth += skip; if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ return -EFAULT; return __bpf_get_stackid(map, trace, flags); } const struct bpf_func_proto bpf_get_stackid_proto = { .func = bpf_get_stackid, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static __u64 count_kernel_ip(struct perf_callchain_entry *trace) { __u64 nr_kernel = 0; while (nr_kernel < trace->nr) { if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) break; nr_kernel++; } return nr_kernel; } BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, struct bpf_map *, map, u64, flags) { struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; __u64 nr_kernel; int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; user = flags & BPF_F_USER_STACK; kernel = !user; trace = ctx->data->callchain; if (unlikely(!trace)) return -EFAULT; nr_kernel = count_kernel_ip(trace); if (kernel) { __u64 nr = trace->nr; trace->nr = nr_kernel; ret = __bpf_get_stackid(map, trace, flags); /* restore nr */ trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; skip += nr_kernel; if (skip > BPF_F_SKIP_FIELD_MASK) return -EFAULT; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; ret = __bpf_get_stackid(map, trace, flags); } return ret; } const struct bpf_func_proto bpf_get_stackid_proto_pe = { .func = bpf_get_stackid_pe, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags, bool may_fault) { u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; int err = -EINVAL; u64 *ips; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) goto clear; if (kernel && user_build_id) goto clear; elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64); if (unlikely(size % elem_size)) goto clear; /* cannot get valid user stack for task without user_mode regs */ if (task && user && !user_mode(regs)) goto err_fault; /* get_perf_callchain does not support crosstask user stack walking * but returns an empty stack instead of NULL. */ if (crosstask && user) { err = -EOPNOTSUPP; goto clear; } num_elem = size / elem_size; max_depth = num_elem + skip; if (sysctl_perf_event_max_stack < max_depth) max_depth = sysctl_perf_event_max_stack; if (may_fault) rcu_read_lock(); /* need RCU for perf's callchain below */ if (trace_in) trace = trace_in; else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, 0, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { if (may_fault) rcu_read_unlock(); goto err_fault; } trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; if (user_build_id) { struct bpf_stack_build_id *id_offs = buf; u32 i; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; } else { memcpy(buf, ips, copy_len); } /* trace/ips should not be dereferenced after this point */ if (may_fault) rcu_read_unlock(); if (user_build_id) stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); if (size > copy_len) memset(buf + copy_len, 0, size - copy_len); return copy_len; err_fault: err = -EFAULT; clear: memset(buf, 0, size); return err; } BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); } const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */); } const struct bpf_func_proto bpf_get_stack_sleepable_proto = { .func = bpf_get_stack_sleepable, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags, bool may_fault) { struct pt_regs *regs; long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); if (regs) res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault); put_task_stack(task); return res; } BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */); } const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf, u32, size, u64, flags) { return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */); } const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = { .func = bpf_get_task_stack_sleepable, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = (struct pt_regs *)(ctx->regs); struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; int err = -EINVAL; __u64 nr_kernel; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) goto clear; user = flags & BPF_F_USER_STACK; kernel = !user; err = -EFAULT; trace = ctx->data->callchain; if (unlikely(!trace)) goto clear; nr_kernel = count_kernel_ip(trace); if (kernel) { __u64 nr = trace->nr; trace->nr = nr_kernel; err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); /* restore nr */ trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; skip += nr_kernel; if (skip > BPF_F_SKIP_FIELD_MASK) goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); } return err; clear: memset(buf, 0, size); return err; } const struct bpf_func_proto bpf_get_stack_proto_pe = { .func = bpf_get_stack_pe, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } /* Called from syscall */ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *old_bucket; u32 id = *(u32 *)key, trace_len; if (unlikely(id >= smap->n_buckets)) return -ENOENT; bucket = xchg(&smap->buckets[id], NULL); if (!bucket) return -ENOENT; trace_len = bucket->nr * stack_map_data_size(map); memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); u32 id; WARN_ON_ONCE(!rcu_read_lock_held()); if (!key) { id = 0; } else { id = *(u32 *)key; if (id >= smap->n_buckets || !smap->buckets[id]) id = 0; else id++; } while (id < smap->n_buckets && !smap->buckets[id]) id++; if (id >= smap->n_buckets) return -ENOENT; *(u32 *)next_key = id; return 0; } static long stack_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ static long stack_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *old_bucket; u32 id = *(u32 *)key; if (unlikely(id >= smap->n_buckets)) return -E2BIG; old_bucket = xchg(&smap->buckets[id], NULL); if (old_bucket) { pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } else { return -ENOENT; } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); bpf_map_area_free(smap); put_callchain_buffers(); } static u64 stack_map_mem_usage(const struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); u64 value_size = map->value_size; u64 n_buckets = smap->n_buckets; u64 enties = map->max_entries; u64 usage = sizeof(*smap); usage += n_buckets * sizeof(struct stack_map_bucket *); usage += enties * (sizeof(struct stack_map_bucket) + value_size); return usage; } BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map) const struct bpf_map_ops stack_trace_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = stack_map_mem_usage, .map_btf_id = &stack_trace_map_btf_ids[0], }; |
4 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> #include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; struct net *net = xt_net(par); struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP)) return NF_DROP; th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); if (th == NULL) return NF_DROP; if (!synproxy_parse_options(skb, par->thoff, th, &opts)) return NF_DROP; if (th->syn && !(th->ack || th->fin || th->rst)) { /* Initial SYN from client */ this_cpu_inc(snet->stats->syn_received); if (th->ece && th->cwr) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; opts.mss_encode = opts.mss_option; opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } return XT_CONTINUE; } static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); const struct ip6t_entry *e = par->entryinfo; int err; if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || e->ipv6.invflags & XT_INV_PROTO) return -EINVAL; err = nf_ct_netns_get(par->net, par->family); if (err) return err; err = nf_synproxy_ipv6_init(snet, par->net); if (err) { nf_ct_netns_put(par->net, par->family); return err; } return err; } static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg6_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg6, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg6_check, .destroy = synproxy_tg6_destroy, .me = THIS_MODULE, }; static int __init synproxy_tg6_init(void) { return xt_register_target(&synproxy_tg6_reg); } static void __exit synproxy_tg6_exit(void) { xt_unregister_target(&synproxy_tg6_reg); } module_init(synproxy_tg6_init); module_exit(synproxy_tg6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Intercept IPv6 TCP connections and establish them using syncookies"); |
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 264 262 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/blkdev.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/kthread.h> #include <linux/backing-dev.h> #include <linux/blk-cgroup.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> #include <trace/events/writeback.h> #include "internal.h" struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU * reader side locking. */ DEFINE_SPINLOCK(bdi_lock); static u64 bdi_id_cursor; static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> struct wb_stats { unsigned long nr_dirty; unsigned long nr_io; unsigned long nr_more_io; unsigned long nr_dirty_time; unsigned long nr_writeback; unsigned long nr_reclaimable; unsigned long nr_dirtied; unsigned long nr_written; unsigned long dirty_thresh; unsigned long wb_thresh; }; static struct dentry *bdi_debug_root; static void bdi_debug_init(void) { bdi_debug_root = debugfs_create_dir("bdi", NULL); } static void collect_wb_stats(struct wb_stats *stats, struct bdi_writeback *wb) { struct inode *inode; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) stats->nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) stats->nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) stats->nr_dirty_time++; spin_unlock(&wb->list_lock); stats->nr_writeback += wb_stat(wb, WB_WRITEBACK); stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE); stats->nr_dirtied += wb_stat(wb, WB_DIRTIED); stats->nr_written += wb_stat(wb, WB_WRITTEN); stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); } #ifdef CONFIG_CGROUP_WRITEBACK static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { struct bdi_writeback *wb; rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { if (!wb_tryget(wb)) continue; collect_wb_stats(stats, wb); wb_put(wb); } rcu_read_unlock(); } #else static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { collect_wb_stats(stats, &bdi->wb); } #endif static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct wb_stats stats; unsigned long tot_bw; global_dirty_limits(&background_thresh, &dirty_thresh); memset(&stats, 0, sizeof(stats)); stats.dirty_thresh = dirty_thresh; bdi_collect_stats(bdi, &stats); tot_bw = atomic_long_read(&bdi->tot_write_bandwidth); seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" "BdiDirtyThresh: %10lu kB\n" "DirtyThresh: %10lu kB\n" "BackgroundThresh: %10lu kB\n" "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", K(stats.nr_writeback), K(stats.nr_reclaimable), K(stats.wb_thresh), K(dirty_thresh), K(background_thresh), K(stats.nr_dirtied), K(stats.nr_written), K(tot_bw), stats.nr_dirty, stats.nr_io, stats.nr_more_io, stats.nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb, struct wb_stats *stats) { seq_printf(m, "WbCgIno: %10lu\n" "WbWriteback: %10lu kB\n" "WbReclaimable: %10lu kB\n" "WbDirtyThresh: %10lu kB\n" "WbDirtied: %10lu kB\n" "WbWritten: %10lu kB\n" "WbWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "state: %10lx\n\n", #ifdef CONFIG_CGROUP_WRITEBACK cgroup_ino(wb->memcg_css->cgroup), #else 1ul, #endif K(stats->nr_writeback), K(stats->nr_reclaimable), K(stats->wb_thresh), K(stats->nr_dirtied), K(stats->nr_written), K(wb->avg_write_bandwidth), stats->nr_dirty, stats->nr_io, stats->nr_more_io, stats->nr_dirty_time, wb->state); } static int cgwb_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct bdi_writeback *wb; global_dirty_limits(&background_thresh, &dirty_thresh); rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { struct wb_stats stats = { .dirty_thresh = dirty_thresh }; if (!wb_tryget(wb)) continue; collect_wb_stats(&stats, wb); /* * Calculate thresh of wb in writeback cgroup which is min of * thresh in global domain and thresh in cgroup domain. Drop * rcu lock because cgwb_calc_thresh may sleep in * cgroup_rstat_flush. We can do so here because we have a ref. */ if (mem_cgroup_wb_domain(wb)) { rcu_read_unlock(); stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); rcu_read_lock(); } wb_stats_show(m, wb, &stats); wb_put(wb); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats); static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi, &cgwb_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } #else /* CONFIG_DEBUG_FS */ static inline void bdi_debug_init(void) { } static inline void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { } static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } #endif /* CONFIG_DEBUG_FS */ static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned long read_ahead_kb; ssize_t ret; ret = kstrtoul(buf, 10, &read_ahead_kb); if (ret < 0) return ret; bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); return count; } #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) static ssize_t min_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) static ssize_t max_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio_fine, bdi->max_ratio) static ssize_t min_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); } static ssize_t min_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_min_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(min_bytes); static ssize_t max_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); } static ssize_t max_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_max_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); return sysfs_emit(buf, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); static ssize_t strict_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int strict_limit; ssize_t ret; ret = kstrtouint(buf, 10, &strict_limit); if (ret < 0) return ret; ret = bdi_set_strict_limit(bdi, strict_limit); if (!ret) ret = count; return ret; } static ssize_t strict_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%d\n", !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); } static DEVICE_ATTR_RW(strict_limit); static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); static const struct class bdi_class = { .name = "bdi", .dev_groups = bdi_dev_groups, }; static __init int bdi_class_init(void) { int ret; ret = class_register(&bdi_class); if (ret) return ret; bdi_debug_init(); return 0; } postcore_initcall(bdi_class_init); static int __init default_bdi_init(void) { bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; return 0; } subsys_initcall(default_bdi_init); static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, bw_dwork); wb_update_bandwidth(wb); } /* * Initial write bandwidth: 100 MB/s */ #define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { int err; memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); err = fprop_local_init_percpu(&wb->completions, gfp); if (err) return err; err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS); if (err) fprop_local_destroy_percpu(&wb->completions); return err; } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); /* * Remove bdi from the global list and shutdown any threads we have running */ static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ spin_lock_irq(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_irq(&wb->work_lock); return; } spin_unlock_irq(&wb->work_lock); cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to * be drained no matter what. */ mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) { WARN_ON(delayed_work_pending(&wb->dwork)); percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS); fprop_local_destroy_percpu(&wb->completions); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/memcontrol.h> /* * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. */ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); static void cgwb_free_rcu(struct rcu_head *rcu_head) { struct bdi_writeback *wb = container_of(rcu_head, struct bdi_writeback, rcu); percpu_ref_exit(&wb->refcnt); kfree(wb); } static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(wb->blkcg_css); fprop_local_destroy_percpu(&wb->memcg_completions); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) { lockdep_assert_held(&cgwb_lock); WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); list_add(&wb->offline_node, &offline_cgwbs); percpu_ref_kill(&wb->refcnt); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { spin_lock_irq(&cgwb_lock); list_del_rcu(&wb->bdi_node); spin_unlock_irq(&cgwb_lock); } static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; int ret = 0; memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb && wb->blkcg_css != blkcg_css) { cgwb_kill(wb); wb = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); if (wb) goto out_put; /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); if (!wb) { ret = -ENOMEM; goto out_put; } ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); if (ret) goto err_wb_exit; ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); if (ret) goto err_ref_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate * whether they're still online. Don't link @wb if any is dead. * See wb_memcg_offline() and wb_blkcg_offline(). */ ret = -ENODEV; spin_lock_irqsave(&cgwb_lock, flags); if (test_bit(WB_registered, &bdi->wb.state) && blkcg_cgwb_list->next && memcg_cgwb_list->next) { /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); blkcg_pin_online(blkcg_css); css_get(memcg_css); css_get(blkcg_css); } } spin_unlock_irqrestore(&cgwb_lock, flags); if (ret) { if (ret == -EEXIST) ret = 0; goto err_fprop_exit; } goto out_put; err_fprop_exit: bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: wb_exit(wb); err_free: kfree(wb); out_put: css_put(blkcg_css); return ret; } /** * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * * Try to get the wb for @memcg_css on @bdi. The returned wb has its * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on * @memcg_css isn't enough. try_get it before calling this function. * * A wb is keyed by its associated memcg. As blkcg implicitly enables * memcg on the default hierarchy, memcg association is guaranteed to be * more specific (equal or descendant to the associated blkcg) and thus can * identify both the memcg and blkcg associations. * * Because the blkcg associated with a memcg may change as blkcg is enabled * and disabled closer to root in the hierarchy, each wb keeps track of * both the memcg and blkcg associated with it and verifies the blkcg on * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css) { struct bdi_writeback *wb; if (!memcg_css->parent) return &bdi->wb; rcu_read_lock(); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb) { struct cgroup_subsys_state *blkcg_css; /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; css_put(blkcg_css); } rcu_read_unlock(); return wb; } /** * wb_get_create - get wb for a given memcg, create if necessary * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * @gfp: allocation mask to use * * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to * create one. See wb_get_lookup() for more details. */ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct bdi_writeback *wb; might_alloc(gfp); do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; } static int cgwb_bdi_init(struct backing_dev_info *bdi) { int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); spin_unlock_irq(&cgwb_lock); mutex_lock(&bdi->cgwb_release_mutex); spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); mutex_unlock(&bdi->cgwb_release_mutex); } /* * cleanup_offline_cgwbs_workfn - try to release dying cgwbs * * Try to release dying cgwbs by switching attached inodes to the nearest * living ancestor's writeback. Processed wbs are placed at the end * of the list to guarantee the forward progress. */ static void cleanup_offline_cgwbs_workfn(struct work_struct *work) { struct bdi_writeback *wb; LIST_HEAD(processed); spin_lock_irq(&cgwb_lock); while (!list_empty(&offline_cgwbs)) { wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, offline_node); list_move(&wb->offline_node, &processed); /* * If wb is dirty, cleaning up the writeback by switching * attached inodes will result in an effective removal of any * bandwidth restrictions, which isn't the goal. Instead, * it can be postponed until the next time, when all io * will be likely completed. If in the meantime some inodes * will get re-dirtied, they should be eventually switched to * a new cgwb. */ if (wb_has_dirty_io(wb)) continue; if (!wb_tryget(wb)) continue; spin_unlock_irq(&cgwb_lock); while (cleanup_offline_cgwb(wb)) cond_resched(); spin_lock_irq(&cgwb_lock); wb_put(wb); } if (!list_empty(&processed)) list_splice_tail(&processed, &offline_cgwbs); spin_unlock_irq(&cgwb_lock); } /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined * * Also prevents creation of any new wb's associated with @memcg. */ void wb_memcg_offline(struct mem_cgroup *memcg) { struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); } /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined * @css: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. */ void wb_blkcg_offline(struct cgroup_subsys_state *css) { struct bdi_writeback *wb, *next; struct list_head *list = blkcg_get_cgwb_list(css); spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, list, blkcg_node) cgwb_kill(wb); list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); spin_unlock_irq(&cgwb_lock); } static int __init cgwb_init(void) { /* * There can be many concurrent release work items overwhelming * system_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); if (!cgwb_release_wq) return -ENOMEM; return 0; } subsys_initcall(cgwb_init); #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) { return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { list_del_rcu(&wb->bdi_node); } #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { kfree(bdi); return NULL; } bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { struct rb_node **p = &bdi_tree.rb_node; struct rb_node *parent = NULL; struct backing_dev_info *bdi; lockdep_assert_held(&bdi_lock); while (*p) { parent = *p; bdi = rb_entry(parent, struct backing_dev_info, rb_node); if (bdi->id > id) p = &(*p)->rb_left; else if (bdi->id < id) p = &(*p)->rb_right; else break; } if (parentp) *parentp = parent; return p; } /** * bdi_get_by_id - lookup and get bdi from its id * @id: bdi id to lookup * * Find bdi matching @id and get it. Returns NULL if the matching bdi * doesn't exist or is already unregistered. */ struct backing_dev_info *bdi_get_by_id(u64 id) { struct backing_dev_info *bdi = NULL; struct rb_node **p; spin_lock_bh(&bdi_lock); p = bdi_lookup_rb_node(id, NULL); if (*p) { bdi = rb_entry(*p, struct backing_dev_info, rb_node); bdi_get(bdi); } spin_unlock_bh(&bdi_lock); return bdi; } int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); bdi->id = ++bdi_id_cursor; p = bdi_lookup_rb_node(bdi->id, &parent); rb_link_node(&bdi->rb_node, parent, p); rb_insert_color(&bdi->rb_node, &bdi_tree); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); return 0; } int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); ret = bdi_register_va(bdi, fmt, args); va_end(args); return ret; } EXPORT_SYMBOL(bdi_register); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); } /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); } void bdi_unregister(struct backing_dev_info *bdi) { del_timer_sync(&bdi->laptop_mode_wb_timer); /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); cgwb_bdi_unregister(bdi); /* * If this BDI's min ratio has been set, use bdi_set_min_ratio() to * update the global bdi_min_ratio. */ if (bdi->min_ratio) bdi_set_min_ratio(bdi, 0); if (bdi->dev) { bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; } if (bdi->owner) { put_device(bdi->owner); bdi->owner = NULL; } } EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); } void bdi_put(struct backing_dev_info *bdi) { kref_put(&bdi->refcnt, release_bdi); } EXPORT_SYMBOL(bdi_put); struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb; if (!inode) return &noop_backing_dev_info; sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } EXPORT_SYMBOL(inode_to_bdi); const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); |
9 13 13 2 2 1 1 3 3 1 1 1 13 13 1 12 4 12 5 1 1 1 1 1 4 4 1 2 1 4 1 1 1 1 3 3 1 1 1 1 1 3 8 2 1 1 7 1 1 1 1 1 1 1 3 1 1 1 1 1 14 12 2 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 | // SPDX-License-Identifier: GPL-2.0-only /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/mISDNif.h> #include <linux/slab.h> #include <linux/export.h> #include "core.h" static u_int *debug; static struct proto mISDN_proto = { .name = "misdn", .owner = THIS_MODULE, .obj_size = sizeof(struct mISDN_sock) }; #define _pms(sk) ((struct mISDN_sock *)sk) static struct mISDN_sock_list data_sockets = { .lock = __RW_LOCK_UNLOCKED(data_sockets.lock) }; static struct mISDN_sock_list base_sockets = { .lock = __RW_LOCK_UNLOCKED(base_sockets.lock) }; #define L2_HEADER_LEN 4 static inline struct sk_buff * _l2_alloc_skb(unsigned int len, gfp_t gfp_mask) { struct sk_buff *skb; skb = alloc_skb(len + L2_HEADER_LEN, gfp_mask); if (likely(skb)) skb_reserve(skb, L2_HEADER_LEN); return skb; } static void mISDN_sock_link(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_add_node(sk, &l->head); write_unlock_bh(&l->lock); } static void mISDN_sock_unlink(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_del_node_init(sk); write_unlock_bh(&l->lock); } static int mISDN_send(struct mISDNchannel *ch, struct sk_buff *skb) { struct mISDN_sock *msk; int err; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s len %d %p\n", __func__, skb->len, skb); if (msk->sk.sk_state == MISDN_CLOSED) return -EUNATCH; __net_timestamp(skb); err = sock_queue_rcv_skb(&msk->sk, skb); if (err) printk(KERN_WARNING "%s: error %d\n", __func__, err); return err; } static int mISDN_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { struct mISDN_sock *msk; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %x, %p)\n", __func__, ch, cmd, arg); switch (cmd) { case CLOSE_CHANNEL: msk->sk.sk_state = MISDN_CLOSED; break; } return 0; } static inline void mISDN_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct __kernel_old_timeval tv; if (_pms(sk)->cmask & MISDN_TIME_STAMP) { skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_MISDN, MISDN_TIME_STAMP, sizeof(tv), &tv); } } static int mISDN_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sk_buff *skb; struct sock *sk = sock->sk; int copied, err; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d, flags %x ch.nr %d, proto %x\n", __func__, (int)len, flags, _pms(sk)->ch.nr, sk->sk_protocol); if (flags & (MSG_OOB)) return -EOPNOTSUPP; if (sk->sk_state == MISDN_CLOSED) return 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) { maddr->channel = (mISDN_HEAD_ID(skb) >> 16) & 0xff; maddr->tei = (mISDN_HEAD_ID(skb) >> 8) & 0xff; maddr->sapi = mISDN_HEAD_ID(skb) & 0xff; } else { maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xFF; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xFF; } msg->msg_namelen = sizeof(*maddr); } copied = skb->len + MISDN_HEADER_LEN; if (len < copied) { if (flags & MSG_PEEK) refcount_dec(&skb->users); else skb_queue_head(&sk->sk_receive_queue, skb); return -ENOSPC; } memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); err = skb_copy_datagram_msg(skb, 0, msg, copied); mISDN_sock_cmsg(sk, msg, skb); skb_free_datagram(sk, skb); return err ? : copied; } static int mISDN_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb; int err = -ENOMEM; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d flags %x ch %d proto %x\n", __func__, (int)len, msg->msg_flags, _pms(sk)->ch.nr, sk->sk_protocol); if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE)) return -EINVAL; if (len < MISDN_HEADER_LEN) return -EINVAL; if (sk->sk_state != MISDN_BOUND) return -EBADFD; lock_sock(sk); skb = _l2_alloc_skb(len, GFP_KERNEL); if (!skb) goto done; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto done; } memcpy(mISDN_HEAD_P(skb), skb->data, MISDN_HEADER_LEN); skb_pull(skb, MISDN_HEADER_LEN); if (msg->msg_namelen >= sizeof(struct sockaddr_mISDN)) { /* if we have a address, we use it */ DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); mISDN_HEAD_ID(skb) = maddr->channel; } else { /* use default for L2 messages */ if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) mISDN_HEAD_ID(skb) = _pms(sk)->ch.nr; } if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: ID:%x\n", __func__, mISDN_HEAD_ID(skb)); err = -ENODEV; if (!_pms(sk)->ch.peer) goto done; err = _pms(sk)->ch.recv(_pms(sk)->ch.peer, skb); if (err) goto done; else { skb = NULL; err = len; } done: kfree_skb(skb); release_sock(sk); return err; } static int data_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: if (sk->sk_state == MISDN_BOUND) delete_channel(&_pms(sk)->ch); else mISDN_sock_unlink(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: delete_channel(&_pms(sk)->ch); mISDN_sock_unlink(&data_sockets, sk); break; } lock_sock(sk); sock_orphan(sk); skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int data_sock_ioctl_bound(struct sock *sk, unsigned int cmd, void __user *p) { struct mISDN_ctrl_req cq; int err = -EINVAL, val[2]; struct mISDNchannel *bchan, *next; lock_sock(sk); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } switch (cmd) { case IMCTRLREQ: if (copy_from_user(&cq, p, sizeof(cq))) { err = -EFAULT; break; } if ((sk->sk_protocol & ~ISDN_P_B_MASK) == ISDN_P_B_START) { list_for_each_entry_safe(bchan, next, &_pms(sk)->dev->bchannels, list) { if (bchan->nr == cq.channel) { err = bchan->ctrl(bchan, CONTROL_CHANNEL, &cq); break; } } } else err = _pms(sk)->dev->D.ctrl(&_pms(sk)->dev->D, CONTROL_CHANNEL, &cq); if (err) break; if (copy_to_user(p, &cq, sizeof(cq))) err = -EFAULT; break; case IMCLEAR_L2: if (sk->sk_protocol != ISDN_P_LAPD_NT) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; case IMHOLD_L1: if (sk->sk_protocol != ISDN_P_LAPD_NT && sk->sk_protocol != ISDN_P_LAPD_TE) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; default: err = -EINVAL; break; } done: release_sock(sk); return err; } static int data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct sock *sk = sock->sk; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; default: if (sk->sk_state == MISDN_BOUND) err = data_sock_ioctl_bound(sk, cmd, (void __user *)arg); else err = -ENOTCONN; } return err; } static int data_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0, opt = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %d, %x, optval, %d)\n", __func__, sock, level, optname, optlen); lock_sock(sk); switch (optname) { case MISDN_TIME_STAMP: err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) _pms(sk)->cmask |= MISDN_TIME_STAMP; else _pms(sk)->cmask &= ~MISDN_TIME_STAMP; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int data_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, opt; if (get_user(len, optlen)) return -EFAULT; if (len != sizeof(char)) return -EINVAL; switch (optname) { case MISDN_TIME_STAMP: if (_pms(sk)->cmask & MISDN_TIME_STAMP) opt = 1; else opt = 0; if (put_user(opt, optval)) return -EFAULT; break; default: return -ENOPROTOOPT; } return 0; } static int data_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; struct sock *csk; int err = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (addr_len != sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } if (sk->sk_protocol < ISDN_P_B_START) { read_lock_bh(&data_sockets.lock); sk_for_each(csk, &data_sockets.head) { if (sk == csk) continue; if (_pms(csk)->dev != _pms(sk)->dev) continue; if (csk->sk_protocol >= ISDN_P_B_START) continue; if (IS_ISDN_P_TE(csk->sk_protocol) == IS_ISDN_P_TE(sk->sk_protocol)) continue; read_unlock_bh(&data_sockets.lock); err = -EBUSY; goto done; } read_unlock_bh(&data_sockets.lock); } _pms(sk)->ch.send = mISDN_send; _pms(sk)->ch.ctrl = mISDN_ctrl; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: mISDN_sock_unlink(&data_sockets, sk); err = connect_layer1(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); if (err) mISDN_sock_link(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: err = create_l2entity(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = connect_Bstack(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; default: err = -EPROTONOSUPPORT; } if (err) goto done; sk->sk_state = MISDN_BOUND; _pms(sk)->ch.protocol = sk->sk_protocol; done: release_sock(sk); return err; } static int data_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; if (!_pms(sk)->dev) return -EBADFD; lock_sock(sk); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xff; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xff; release_sock(sk); return sizeof(*maddr); } static const struct proto_ops data_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = data_sock_release, .ioctl = data_sock_ioctl, .bind = data_sock_bind, .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, .getsockopt = data_sock_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int data_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &data_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&data_sockets, sk); return 0; } static int base_sock_release(struct socket *sock) { struct sock *sk = sock->sk; printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; mISDN_sock_unlink(&base_sockets, sk); sock_orphan(sk); sock_put(sk); return 0; } static int base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; case IMSETDEVNAME: { struct mISDN_devrename dn; if (copy_from_user(&dn, (void __user *)arg, sizeof(dn))) { err = -EFAULT; break; } dn.name[sizeof(dn.name) - 1] = '\0'; dev = get_mdevice(dn.id); if (dev) err = device_rename(&dev->dev, dn.name); else err = -ENODEV; } break; default: err = -EINVAL; } return err; } static int base_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; int err = 0; if (addr_len < sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } sk->sk_state = MISDN_BOUND; done: release_sock(sk); return err; } static const struct proto_ops base_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = base_sock_release, .ioctl = base_sock_ioctl, .bind = base_sock_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (!capable(CAP_NET_RAW)) return -EPERM; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &base_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&base_sockets, sk); return 0; } static int mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int err = -EPROTONOSUPPORT; switch (proto) { case ISDN_P_BASE: err = base_sock_create(net, sock, proto, kern); break; case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = data_sock_create(net, sock, proto, kern); break; default: return err; } return err; } static const struct net_proto_family mISDN_sock_family_ops = { .owner = THIS_MODULE, .family = PF_ISDN, .create = mISDN_sock_create, }; int misdn_sock_init(u_int *deb) { int err; debug = deb; err = sock_register(&mISDN_sock_family_ops); if (err) printk(KERN_ERR "%s: error(%d)\n", __func__, err); return err; } void misdn_sock_cleanup(void) { sock_unregister(PF_ISDN); } |
128 129 123 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 | /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_MAPLE_TREE_H #define _LINUX_MAPLE_TREE_H /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com> * Matthew Wilcox <willy@infradead.org> */ #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ /* * Allocated nodes are mutable until they have been inserted into the tree, * at which time they cannot change their type until they have been removed * from the tree and an RCU grace period has passed. * * Removed nodes have their ->parent set to point to themselves. RCU readers * check ->parent before relying on the value that they loaded from the * slots array. This lets us reuse the slots array for the RCU head. * * Nodes in the tree point to their parent unless bit 0 is set. */ #if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) /* 64bit sizes */ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ #define MAPLE_NODE_MASK 255UL /* * The node->parent of the root node has bit 0 set and the rest of the pointer * is a pointer to the tree itself. No more bits are available in this pointer * (on m68k, the data structure may only be 2-byte aligned). * * Internal non-root nodes can only have maple_range_* nodes as parents. The * parent pointer is 256B aligned like all other tree nodes. When storing a 32 * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an * extra bit to store the offset. This extra bit comes from a reuse of the last * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * * Once the type is decided, the decision of an allocation range type or a * range type is done by examining the immutable tree flag for the * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location * 0x??1 : Root * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 * 0x010 : 32 bit values, type in 0-2, slot in 3-6 * 0x110 : 64 bit values, type in 0-2, slot in 3-6 */ /* * This metadata is used to optimize the gap updating code and in reverse * searching for gaps or any other code that needs to find the end of the data. */ struct maple_metadata { unsigned char end; unsigned char gap; }; /* * Leaf nodes do not store pointers to nodes, they store user data. Users may * store almost any bit pattern. As noted above, the optimisation of storing an * entry at 0 in the root pointer cannot be done for data which have the bottom * two bits set to '10'. We also reserve values with the bottom two bits set to * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs * return errnos as a negative errno shifted right by two bits and the bottom * two bits set to '10', and while choosing to store these values in the array * is not an error, it may lead to confusion if you're testing for an error with * mas_is_err(). * * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges, Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. */ struct maple_range_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; union { void __rcu *slot[MAPLE_RANGE64_SLOTS]; struct { void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; struct maple_metadata meta; }; }; }; /* * At tree creation time, the user can specify that they're willing to trade off * storing fewer entries in a tree in return for storing more information in * each node. * * The maple tree supports recording the largest range of NULL entries available * in this node, also called gaps. This optimises the tree for allocating a * range. */ struct maple_arange_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; void __rcu *slot[MAPLE_ARANGE64_SLOTS]; unsigned long gap[MAPLE_ARANGE64_SLOTS]; struct maple_metadata meta; }; struct maple_alloc { unsigned long total; unsigned char node_count; unsigned int request_count; struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; }; struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ }; enum maple_type { maple_dense, maple_leaf_64, maple_range_64, maple_arange_64, }; enum store_type { wr_invalid, wr_new_root, wr_store_root, wr_exact_fit, wr_spanning_store, wr_split_store, wr_rebalance, wr_append, wr_node_store, wr_slot_store, }; /** * DOC: Maple tree flags * * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree * * MT_FLAGS_USE_RCU - Operate in RCU mode * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value * * MT_FLAGS_LOCK_MASK - How the mt_lock is used * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe * * MT_FLAGS_LOCK_BH - Acquired bh-safe * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used * * MAPLE_HEIGHT_MAX The largest height that can be stored */ #define MT_FLAGS_ALLOC_RANGE 0x01 #define MT_FLAGS_USE_RCU 0x02 #define MT_FLAGS_HEIGHT_OFFSET 0x02 #define MT_FLAGS_HEIGHT_MASK 0x7C #define MT_FLAGS_LOCK_MASK 0x300 #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 #define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 #define MAPLE_NODE_TYPE_MASK 0x0F #define MAPLE_NODE_TYPE_SHIFT 0x03 #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) #define mt_write_lock_is_held(mt) \ (!(mt)->ma_external_lock || \ lock_is_held_type((mt)->ma_external_lock, 0)) #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif /* * If the tree contains a single entry at index 0, it is usually stored in * tree->ma_root. To optimise for the page cache, an entry which ends in '00', * '01' or '11' is stored in the root, but an entry which ends in '10' will be * stored in a node. Bits 3-6 are used to store enum maple_type. * * The flags are used both to store some immutable information about this tree * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ struct maple_tree { union { spinlock_t ma_lock; lockdep_map_p ma_external_lock; }; unsigned int ma_flags; void __rcu *ma_root; }; /** * MTREE_INIT() - Initialize a maple tree * @name: The maple tree name * @__flags: The maple tree flags * */ #define MTREE_INIT(name, __flags) { \ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ .ma_flags = __flags, \ .ma_root = NULL, \ } /** * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. * @name: The tree name * @__flags: The maple tree flags * @__lock: The external lock */ #ifdef CONFIG_LOCKDEP #define MTREE_INIT_EXT(name, __flags, __lock) { \ .ma_external_lock = &(__lock).dep_map, \ .ma_flags = (__flags), \ .ma_root = NULL, \ } #else #define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) #endif #define DEFINE_MTREE(name) \ struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) #define mtree_lock_nested(mas, subclass) \ spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* * The Maple Tree squeezes various bits in at various points which aren't * necessarily obvious. Usually, this is done by observing that pointers are * N-byte aligned and thus the bottom log_2(N) bits are available for use. We * don't use the high bits of pointers to store additional information because * we don't know what bits are unused on any given architecture. * * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 * low bits for our own purposes. Nodes are currently of 4 types: * 1. Single pointer (Range is 0-0) * 2. Non-leaf Allocation Range nodes * 3. Non-leaf Range nodes * 4. Leaf Range nodes All nodes consist of a number of node slots, * pivots, and a parent pointer. */ struct maple_node { union { struct { struct maple_pnode *parent; void __rcu *slot[MAPLE_NODE_SLOTS]; }; struct { void *pad; struct rcu_head rcu; struct maple_enode *piv_parent; unsigned char parent_slot; enum maple_type type; unsigned char slot_len; unsigned int ma_flags; }; struct maple_range_64 mr64; struct maple_arange_64 ma64; struct maple_alloc alloc; }; }; /* * More complicated stores can cause two nodes to become one or three and * potentially alter the height of the tree. Either half of the tree may need * to be rebalanced against the other. The ma_topiary struct is used to track * which nodes have been 'cut' from the tree so that the change can be done * safely at a later date. This is done to support RCU. */ struct ma_topiary { struct maple_enode *head; struct maple_enode *tail; struct maple_tree *mtree; }; void *mtree_load(struct maple_tree *mt, unsigned long index); int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_store_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); /** * mtree_empty() - Determine if a tree has any present entries. * @mt: Maple Tree. * * Context: Any context. * Return: %true if the tree contains only NULL pointers. */ static inline bool mtree_empty(const struct maple_tree *mt) { return mt->ma_root == NULL; } /* Advanced API */ /* * Maple State Status * ma_active means the maple state is pointing to a node and offset and can * continue operating on the tree. * ma_start means we have not searched the tree. * ma_root means we have searched the tree and the entry we found lives in * the root of the tree (ie it has index 0, length 1 and is the only entry in * the tree). * ma_none means we have searched the tree and there is no node in the * tree for this entry. For example, we searched for index 1 in an empty * tree. Or we have a tree which points to a full leaf node and we * searched for an entry which is larger than can be contained in that * leaf node. * ma_pause means the data within the maple state may be stale, restart the * operation * ma_overflow means the search has reached the upper limit of the search * ma_underflow means the search has reached the lower limit of the search * ma_error means there was an error, check the node for the error number. */ enum maple_status { ma_active, ma_start, ma_root, ma_none, ma_pause, ma_overflow, ma_underflow, ma_error, }; /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. * * If state->node has bit 0 set then it references a tree location which is not * a node (eg the root). If bit 1 is set, the rest of the bits are a negative * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the * node type. * * state->alloc either has a request number of nodes or an allocated node. If * stat->alloc has a requested number of nodes, the first bit will be set (0x1) * and the remaining bits are the value. If state->alloc is a node, then the * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for * storing more allocated nodes, a total number of nodes allocated, and the * node_count in this node. node_count is the number of allocated nodes in this * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc * by removing a node from the state->alloc node until state->alloc->node_count * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted * to state->alloc. Nodes are pushed onto state->alloc by putting the current * state->alloc into the pushed node's slot[0]. * * The state also contains the implied min/max of the state->node, the depth of * this search, and the offset. The implied min/max are either from the parent * node or are 0-oo for the root node. The depth is incremented or decremented * every time a node is walked down or up. The offset is the slot/pivot of * interest in the node - either for reading or writing. * * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. * * The status of the state is used to determine how the next action should treat * the state. For instance, if the status is ma_start then the next action * should start at the root of the tree and walk down. If the status is * ma_pause then the node may be stale data and should be discarded. If the * status is ma_overflow, then the last action hit the upper limit. * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ unsigned long index; /* The index we're operating on - range start */ unsigned long last; /* The last index we're operating on - range end */ struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { struct ma_state *mas; struct maple_node *node; /* Decoded mas->node */ unsigned long r_min; /* range min */ unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) #define mas_lock_nested(mas, subclass) \ spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) /* * Special values for ma_state.node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ .index = first, \ .last = end, \ .node = NULL, \ .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ struct ma_wr_state name = { \ .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ } #define MA_TOPIARY(name, tree) \ struct ma_topiary name = { \ .head = NULL, \ .tail = NULL, \ .mtree = tree, \ } void *mas_walk(struct ma_state *mas); void *mas_store(struct ma_state *mas, void *entry); void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) { memset(mas, 0, sizeof(struct ma_state)); mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; mas->status = ma_start; mas->node = NULL; } static inline bool mas_is_active(struct ma_state *mas) { return mas->status == ma_active; } static inline bool mas_is_err(struct ma_state *mas) { return mas->status == ma_error; } /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. * * Resets the error or walk state of the @mas so future walks of the * array will start from the root. Use this if you have dropped the * lock and want to reuse the ma_state. * * Context: Any context. */ static __always_inline void mas_reset(struct ma_state *mas) { mas->status = ma_start; mas->node = NULL; } /** * mas_for_each() - Iterate over a range of the maple tree. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__max: maximum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) /** * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__min: minimum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each_rev(__mas, __entry, __min) \ while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, mt_dump_hex, }; extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mas_dump(const struct ma_state *mas); void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_BUG_ON(__mas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_WR_BUG_ON(__wrmas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MT_WARN_ON(__tree, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WARN_ON(__mas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WR_WARN_ON(__wrmas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #else #define MT_BUG_ON(__tree, __x) BUG_ON(__x) #define MAS_BUG_ON(__mas, __x) BUG_ON(__x) #define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) #define MT_WARN_ON(__tree, __x) WARN_ON(__x) #define MAS_WARN_ON(__mas, __x) WARN_ON(__x) #define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * set the internal maple state values to a sub-range. * Please use mas_set_range() if you do not know where you are in the tree. */ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { /* Ensure the range starts within the current slot */ MAS_WARN_ON(mas, mas_is_active(mas) && (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * Move the operation state to refer to a different range. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { mas_reset(mas); __mas_set_range(mas, start, last); } /** * mas_set() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @index: New index into the Maple Tree. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set(struct ma_state *mas, unsigned long index) { mas_set_range(mas, index, index); } static inline bool mt_external_lock(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; } /** * mt_init_flags() - Initialise an empty maple tree with flags. * @mt: Maple Tree * @flags: maple tree flags. * * If you need to initialise a Maple Tree with special flags (eg, an * allocation tree), use this function. * * Context: Any context. */ static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) { mt->ma_flags = flags; if (!mt_external_lock(mt)) spin_lock_init(&mt->ma_lock); rcu_assign_pointer(mt->ma_root, NULL); } /** * mt_init() - Initialise an empty maple tree. * @mt: Maple Tree * * An empty Maple Tree. * * Context: Any context. */ static inline void mt_init(struct maple_tree *mt) { mt_init_flags(mt, 0); } static inline bool mt_in_rcu(struct maple_tree *mt) { #ifdef CONFIG_MAPLE_RCU_DISABLED return false; #endif return mt->ma_flags & MT_FLAGS_USE_RCU; } /** * mt_clear_in_rcu() - Switch the tree to non-RCU mode. * @mt: The Maple Tree */ static inline void mt_clear_in_rcu(struct maple_tree *mt) { if (!mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags &= ~MT_FLAGS_USE_RCU; mtree_unlock(mt); } } /** * mt_set_in_rcu() - Switch the tree to RCU safe mode. * @mt: The Maple Tree */ static inline void mt_set_in_rcu(struct maple_tree *mt) { if (mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags |= MT_FLAGS_USE_RCU; mtree_unlock(mt); } } static inline unsigned int mt_height(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); /** * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * * This iterator skips all entries, which resolve to a NULL pointer, * e.g. entries which has been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) #endif /*_LINUX_MAPLE_TREE_H */ |
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/auth.c * * Generic RPC client authentication API. * * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ #include <linux/types.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/hash.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> #include <linux/spinlock.h> #include <trace/events/sunrpc.h> #define RPC_CREDCACHE_DEFAULT_HASHBITS (4) struct rpc_cred_cache { struct hlist_head *hashtable; unsigned int hashbits; spinlock_t lock; }; static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops, [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops, [RPC_AUTH_TLS] = (const struct rpc_authops __force __rcu *)&authtls_ops, }; static LIST_HEAD(cred_unused); static unsigned long number_cred_unused; static struct cred machine_cred = { .usage = ATOMIC_INIT(1), }; /* * Return the machine_cred pointer to be used whenever * the a generic machine credential is needed. */ const struct cred *rpc_machine_cred(void) { return &machine_cred; } EXPORT_SYMBOL_GPL(rpc_machine_cred); #define MAX_HASHTABLE_BITS (14) static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) { unsigned long num; unsigned int nbits; int ret; if (!val) goto out_inval; ret = kstrtoul(val, 0, &num); if (ret) goto out_inval; nbits = fls(num - 1); if (nbits > MAX_HASHTABLE_BITS || nbits < 2) goto out_inval; *(unsigned int *)kp->arg = nbits; return 0; out_inval: return -EINVAL; } static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) { unsigned int nbits; nbits = *(unsigned int *)kp->arg; return sprintf(buffer, "%u\n", 1U << nbits); } #define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); static const struct kernel_param_ops param_ops_hashtbl_sz = { .set = param_set_hashtbl_sz, .get = param_get_hashtbl_sz, }; module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); static unsigned long auth_max_cred_cachesize = ULONG_MAX; module_param(auth_max_cred_cachesize, ulong, 0644); MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size"); static u32 pseudoflavor_to_flavor(u32 flavor) { if (flavor > RPC_AUTH_MAXFLAVOR) return RPC_AUTH_GSS; return flavor; } int rpcauth_register(const struct rpc_authops *ops) { const struct rpc_authops *old; rpc_authflavor_t flavor; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops); if (old == NULL || old == ops) return 0; return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_register); int rpcauth_unregister(const struct rpc_authops *ops) { const struct rpc_authops *old; rpc_authflavor_t flavor; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL); if (old == ops || old == NULL) return 0; return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_unregister); static const struct rpc_authops * rpcauth_get_authops(rpc_authflavor_t flavor) { const struct rpc_authops *ops; if (flavor >= RPC_AUTH_MAXFLAVOR) return NULL; rcu_read_lock(); ops = rcu_dereference(auth_flavors[flavor]); if (ops == NULL) { rcu_read_unlock(); request_module("rpc-auth-%u", flavor); rcu_read_lock(); ops = rcu_dereference(auth_flavors[flavor]); if (ops == NULL) goto out; } if (!try_module_get(ops->owner)) ops = NULL; out: rcu_read_unlock(); return ops; } static void rpcauth_put_authops(const struct rpc_authops *ops) { module_put(ops->owner); } /** * rpcauth_get_pseudoflavor - check if security flavor is supported * @flavor: a security flavor * @info: a GSS mech OID, quality of protection, and service value * * Verifies that an appropriate kernel module is available or already loaded. * Returns an equivalent pseudoflavor, or RPC_AUTH_MAXFLAVOR if "flavor" is * not supported locally. */ rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) { const struct rpc_authops *ops = rpcauth_get_authops(flavor); rpc_authflavor_t pseudoflavor; if (!ops) return RPC_AUTH_MAXFLAVOR; pseudoflavor = flavor; if (ops->info2flavor != NULL) pseudoflavor = ops->info2flavor(info); rpcauth_put_authops(ops); return pseudoflavor; } EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); /** * rpcauth_get_gssinfo - find GSS tuple matching a GSS pseudoflavor * @pseudoflavor: GSS pseudoflavor to match * @info: rpcsec_gss_info structure to fill in * * Returns zero and fills in "info" if pseudoflavor matches a * supported mechanism. */ int rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) { rpc_authflavor_t flavor = pseudoflavor_to_flavor(pseudoflavor); const struct rpc_authops *ops; int result; ops = rpcauth_get_authops(flavor); if (ops == NULL) return -ENOENT; result = -ENOENT; if (ops->flavor2info != NULL) result = ops->flavor2info(pseudoflavor, info); rpcauth_put_authops(ops); return result; } EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct rpc_auth *auth = ERR_PTR(-EINVAL); const struct rpc_authops *ops; u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); ops = rpcauth_get_authops(flavor); if (ops == NULL) goto out; auth = ops->create(args, clnt); rpcauth_put_authops(ops); if (IS_ERR(auth)) return auth; if (clnt->cl_auth) rpcauth_release(clnt->cl_auth); clnt->cl_auth = auth; out: return auth; } EXPORT_SYMBOL_GPL(rpcauth_create); void rpcauth_release(struct rpc_auth *auth) { if (!refcount_dec_and_test(&auth->au_count)) return; auth->au_ops->destroy(auth); } static DEFINE_SPINLOCK(rpc_credcache_lock); /* * On success, the caller is responsible for freeing the reference * held by the hashtable */ static bool rpcauth_unhash_cred_locked(struct rpc_cred *cred) { if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) return false; hlist_del_rcu(&cred->cr_hash); return true; } static bool rpcauth_unhash_cred(struct rpc_cred *cred) { spinlock_t *cache_lock; bool ret; if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) return false; cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); ret = rpcauth_unhash_cred_locked(cred); spin_unlock(cache_lock); return ret; } /* * Initialize RPC credential cache */ int rpcauth_init_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *new; unsigned int hashsize; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out_nocache; new->hashbits = auth_hashbits; hashsize = 1U << new->hashbits; new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL); if (!new->hashtable) goto out_nohashtbl; spin_lock_init(&new->lock); auth->au_credcache = new; return 0; out_nohashtbl: kfree(new); out_nocache: return -ENOMEM; } EXPORT_SYMBOL_GPL(rpcauth_init_credcache); char * rpcauth_stringify_acceptor(struct rpc_cred *cred) { if (!cred->cr_ops->crstringify_acceptor) return NULL; return cred->cr_ops->crstringify_acceptor(cred); } EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor); /* * Destroy a list of credentials */ static inline void rpcauth_destroy_credlist(struct list_head *head) { struct rpc_cred *cred; while (!list_empty(head)) { cred = list_entry(head->next, struct rpc_cred, cr_lru); list_del_init(&cred->cr_lru); put_rpccred(cred); } } static void rpcauth_lru_add_locked(struct rpc_cred *cred) { if (!list_empty(&cred->cr_lru)) return; number_cred_unused++; list_add_tail(&cred->cr_lru, &cred_unused); } static void rpcauth_lru_add(struct rpc_cred *cred) { if (!list_empty(&cred->cr_lru)) return; spin_lock(&rpc_credcache_lock); rpcauth_lru_add_locked(cred); spin_unlock(&rpc_credcache_lock); } static void rpcauth_lru_remove_locked(struct rpc_cred *cred) { if (list_empty(&cred->cr_lru)) return; number_cred_unused--; list_del_init(&cred->cr_lru); } static void rpcauth_lru_remove(struct rpc_cred *cred) { if (list_empty(&cred->cr_lru)) return; spin_lock(&rpc_credcache_lock); rpcauth_lru_remove_locked(cred); spin_unlock(&rpc_credcache_lock); } /* * Clear the RPC credential cache, and delete those credentials * that are not referenced. */ void rpcauth_clear_credcache(struct rpc_cred_cache *cache) { LIST_HEAD(free); struct hlist_head *head; struct rpc_cred *cred; unsigned int hashsize = 1U << cache->hashbits; int i; spin_lock(&rpc_credcache_lock); spin_lock(&cache->lock); for (i = 0; i < hashsize; i++) { head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); rpcauth_unhash_cred_locked(cred); /* Note: We now hold a reference to cred */ rpcauth_lru_remove_locked(cred); list_add_tail(&cred->cr_lru, &free); } } spin_unlock(&cache->lock); spin_unlock(&rpc_credcache_lock); rpcauth_destroy_credlist(&free); } /* * Destroy the RPC credential cache */ void rpcauth_destroy_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *cache = auth->au_credcache; if (cache) { auth->au_credcache = NULL; rpcauth_clear_credcache(cache); kfree(cache->hashtable); kfree(cache); } } EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); #define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ) /* * Remove stale credentials. Avoid sleeping inside the loop. */ static long rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { struct rpc_cred *cred, *next; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; long freed = 0; list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) { if (nr_to_scan-- == 0) break; if (refcount_read(&cred->cr_count) > 1) { rpcauth_lru_remove_locked(cred); continue; } /* * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ if (time_in_range(cred->cr_expire, expired, jiffies)) continue; if (!rpcauth_unhash_cred(cred)) continue; rpcauth_lru_remove_locked(cred); freed++; list_add_tail(&cred->cr_lru, free); } return freed ? freed : SHRINK_STOP; } static unsigned long rpcauth_cache_do_shrink(int nr_to_scan) { LIST_HEAD(free); unsigned long freed; spin_lock(&rpc_credcache_lock); freed = rpcauth_prune_expired(&free, nr_to_scan); spin_unlock(&rpc_credcache_lock); rpcauth_destroy_credlist(&free); return freed; } /* * Run memory cache shrinker. */ static unsigned long rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) return SHRINK_STOP; /* nothing left, don't come back */ if (list_empty(&cred_unused)) return SHRINK_STOP; return rpcauth_cache_do_shrink(sc->nr_to_scan); } static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return number_cred_unused * sysctl_vfs_cache_pressure / 100; } static void rpcauth_cache_enforce_limit(void) { unsigned long diff; unsigned int nr_to_scan; if (number_cred_unused <= auth_max_cred_cachesize) return; diff = number_cred_unused - auth_max_cred_cachesize; nr_to_scan = 100; if (diff < nr_to_scan) nr_to_scan = diff; rpcauth_cache_do_shrink(nr_to_scan); } /* * Look up a process' credentials in the authentication cache */ struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, int flags, gfp_t gfp) { LIST_HEAD(free); struct rpc_cred_cache *cache = auth->au_credcache; struct rpc_cred *cred = NULL, *entry, *new; unsigned int nr; nr = auth->au_ops->hash_cred(acred, cache->hashbits); rcu_read_lock(); hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); if (cred) break; } rcu_read_unlock(); if (cred != NULL) goto found; new = auth->au_ops->crcreate(auth, acred, flags, gfp); if (IS_ERR(new)) { cred = new; goto out; } spin_lock(&cache->lock); hlist_for_each_entry(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); if (cred) break; } if (cred == NULL) { cred = new; set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); refcount_inc(&cred->cr_count); hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); } else list_add_tail(&new->cr_lru, &free); spin_unlock(&cache->lock); rpcauth_cache_enforce_limit(); found: if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && cred->cr_ops->cr_init != NULL && !(flags & RPCAUTH_LOOKUP_NEW)) { int res = cred->cr_ops->cr_init(auth, cred); if (res < 0) { put_rpccred(cred); cred = ERR_PTR(res); } } rpcauth_destroy_credlist(&free); out: return cred; } EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *auth, int flags) { struct auth_cred acred; struct rpc_cred *ret; const struct cred *cred = current_cred(); memset(&acred, 0, sizeof(acred)); acred.cred = cred; ret = auth->au_ops->lookup_cred(auth, &acred, flags); return ret; } EXPORT_SYMBOL_GPL(rpcauth_lookupcred); void rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, struct rpc_auth *auth, const struct rpc_credops *ops) { INIT_HLIST_NODE(&cred->cr_hash); INIT_LIST_HEAD(&cred->cr_lru); refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; cred->cr_flags = 0; cred->cr_ops = ops; cred->cr_expire = jiffies; cred->cr_cred = get_cred(acred->cred); } EXPORT_SYMBOL_GPL(rpcauth_init_cred); static struct rpc_cred * rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .cred = get_task_cred(&init_task), }; struct rpc_cred *ret; if (RPC_IS_ASYNC(task)) lookupflags |= RPCAUTH_LOOKUP_ASYNC; ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); put_cred(acred.cred); return ret; } static struct rpc_cred * rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .principal = task->tk_client->cl_principal, .cred = init_task.cred, }; if (!acred.principal) return NULL; if (RPC_IS_ASYNC(task)) lookupflags |= RPCAUTH_LOOKUP_ASYNC; return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } static struct rpc_cred * rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; return rpcauth_lookupcred(auth, lookupflags); } static int rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *new = NULL; int lookupflags = 0; struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .cred = cred, }; if (flags & RPC_TASK_ASYNC) lookupflags |= RPCAUTH_LOOKUP_NEW | RPCAUTH_LOOKUP_ASYNC; if (task->tk_op_cred) /* Task must use exactly this rpc_cred */ new = get_rpccred(task->tk_op_cred); else if (cred != NULL && cred != &machine_cred) new = auth->au_ops->lookup_cred(auth, &acred, lookupflags); else if (cred == &machine_cred) new = rpcauth_bind_machine_cred(task, lookupflags); /* If machine cred couldn't be bound, try a root cred */ if (new) ; else if (cred == &machine_cred) new = rpcauth_bind_root_cred(task, lookupflags); else if (flags & RPC_TASK_NULLCREDS) new = authnull_ops.lookup_cred(NULL, NULL, 0); else new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) return PTR_ERR(new); put_rpccred(req->rq_cred); req->rq_cred = new; return 0; } void put_rpccred(struct rpc_cred *cred) { if (cred == NULL) return; rcu_read_lock(); if (refcount_dec_and_test(&cred->cr_count)) goto destroy; if (refcount_read(&cred->cr_count) != 1 || !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) goto out; if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { cred->cr_expire = jiffies; rpcauth_lru_add(cred); /* Race breaker */ if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))) rpcauth_lru_remove(cred); } else if (rpcauth_unhash_cred(cred)) { rpcauth_lru_remove(cred); if (refcount_dec_and_test(&cred->cr_count)) goto destroy; } out: rcu_read_unlock(); return; destroy: rcu_read_unlock(); cred->cr_ops->crdestroy(cred); } EXPORT_SYMBOL_GPL(put_rpccred); /** * rpcauth_marshcred - Append RPC credential to end of @xdr * @task: controlling RPC task * @xdr: xdr_stream containing initial portion of RPC Call header * * On success, an appropriate verifier is added to @xdr, @xdr is * updated to point past the verifier, and zero is returned. * Otherwise, @xdr is in an undefined state and a negative errno * is returned. */ int rpcauth_marshcred(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crmarshal(task, xdr); } /** * rpcauth_wrap_req_encode - XDR encode the RPC procedure * @task: controlling RPC task * @xdr: stream where on-the-wire bytes are to be marshalled * * On success, @xdr contains the encoded and wrapped message. * Otherwise, @xdr is in an undefined state. */ int rpcauth_wrap_req_encode(struct rpc_task *task, struct xdr_stream *xdr) { kxdreproc_t encode = task->tk_msg.rpc_proc->p_encode; encode(task->tk_rqstp, xdr, task->tk_msg.rpc_argp); return 0; } EXPORT_SYMBOL_GPL(rpcauth_wrap_req_encode); /** * rpcauth_wrap_req - XDR encode and wrap the RPC procedure * @task: controlling RPC task * @xdr: stream where on-the-wire bytes are to be marshalled * * On success, @xdr contains the encoded and wrapped message, * and zero is returned. Otherwise, @xdr is in an undefined * state and a negative errno is returned. */ int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crwrap_req(task, xdr); } /** * rpcauth_checkverf - Validate verifier in RPC Reply header * @task: controlling RPC task * @xdr: xdr_stream containing RPC Reply header * * Return values: * %0: Verifier is valid. @xdr now points past the verifier. * %-EIO: Verifier is corrupted or message ended early. * %-EACCES: Verifier is intact but not valid. * %-EPROTONOSUPPORT: Server does not support the requested auth type. * * When a negative errno is returned, @xdr is left in an undefined * state. */ int rpcauth_checkverf(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crvalidate(task, xdr); } /** * rpcauth_unwrap_resp_decode - Invoke XDR decode function * @task: controlling RPC task * @xdr: stream where the Reply message resides * * Returns zero on success; otherwise a negative errno is returned. */ int rpcauth_unwrap_resp_decode(struct rpc_task *task, struct xdr_stream *xdr) { kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode; return decode(task->tk_rqstp, xdr, task->tk_msg.rpc_resp); } EXPORT_SYMBOL_GPL(rpcauth_unwrap_resp_decode); /** * rpcauth_unwrap_resp - Invoke unwrap and decode function for the cred * @task: controlling RPC task * @xdr: stream where the Reply message resides * * Returns zero on success; otherwise a negative errno is returned. */ int rpcauth_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crunwrap_resp(task, xdr); } bool rpcauth_xmit_need_reencode(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; if (!cred || !cred->cr_ops->crneed_reencode) return false; return cred->cr_ops->crneed_reencode(task); } int rpcauth_refreshcred(struct rpc_task *task) { struct rpc_cred *cred; int err; cred = task->tk_rqstp->rq_cred; if (cred == NULL) { err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags); if (err < 0) goto out; cred = task->tk_rqstp->rq_cred; } err = cred->cr_ops->crrefresh(task); out: if (err < 0) task->tk_status = err; return err; } void rpcauth_invalcred(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; if (cred) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); } int rpcauth_uptodatecred(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; return cred == NULL || test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; } static struct shrinker *rpc_cred_shrinker; int __init rpcauth_init_module(void) { int err; err = rpc_init_authunix(); if (err < 0) goto out1; rpc_cred_shrinker = shrinker_alloc(0, "sunrpc_cred"); if (!rpc_cred_shrinker) { err = -ENOMEM; goto out2; } rpc_cred_shrinker->count_objects = rpcauth_cache_shrink_count; rpc_cred_shrinker->scan_objects = rpcauth_cache_shrink_scan; shrinker_register(rpc_cred_shrinker); return 0; out2: rpc_destroy_authunix(); out1: return err; } void rpcauth_remove_module(void) { rpc_destroy_authunix(); shrinker_free(rpc_cred_shrinker); } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IEEE802154_CORE_H #define __IEEE802154_CORE_H #include <net/cfg802154.h> struct cfg802154_registered_device { const struct cfg802154_ops *ops; struct list_head list; /* wpan_phy index, internal only */ int wpan_phy_idx; /* also protected by devlist_mtx */ int opencount; wait_queue_head_t dev_wait; /* protected by RTNL only */ int num_running_ifaces; /* associated wpan interfaces, protected by rtnl or RCU */ struct list_head wpan_dev_list; int devlist_generation, wpan_dev_id; /* must be last because of the way we do wpan_phy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wpan_phy wpan_phy __aligned(NETDEV_ALIGN); }; static inline struct cfg802154_registered_device * wpan_phy_to_rdev(struct wpan_phy *wpan_phy) { BUG_ON(!wpan_phy); return container_of(wpan_phy, struct cfg802154_registered_device, wpan_phy); } extern struct list_head cfg802154_rdev_list; extern int cfg802154_rdev_list_generation; int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, struct net *net); /* free object */ void cfg802154_dev_free(struct cfg802154_registered_device *rdev); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx); struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx); #endif /* __IEEE802154_CORE_H */ |
55 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAPOPS_H #define _LINUX_SWAPOPS_H #include <linux/radix-tree.h> #include <linux/bug.h> #include <linux/mm_types.h> #ifdef CONFIG_MMU #ifdef CONFIG_SWAP #include <linux/swapfile.h> #endif /* CONFIG_SWAP */ /* * swapcache pages are stored in the swapper_space radix tree. We want to * get good packing density in that tree, so the index should be dense in * the low-order bits. * * We arrange the `type' and `offset' fields so that `type' is at the six * high-order bits of the swp_entry_t and `offset' is right-aligned in the * remaining bits. Although `type' itself needs only five bits, we allow for * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ #ifdef MAX_PHYSMEM_BITS #define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) #else /* MAX_PHYSMEM_BITS */ #define SWP_PFN_BITS min_t(int, \ sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \ SWP_TYPE_SHIFT) #endif /* MAX_PHYSMEM_BITS */ #define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) /** * Migration swap entry specific bitfield definitions. Layout: * * |----------+--------------------| * | swp_type | swp_offset | * |----------+--------+-+-+-------| * | | resv |D|A| PFN | * |----------+--------+-+-+-------| * * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A) * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D) * * Note: A/D bits will be stored in migration entries iff there're enough * free bits in arch specific swp offset. By default we'll ignore A/D bits * when migrating a page. Please refer to migration_entry_supports_ad() * for more information. If there're more bits besides PFN and A/D bits, * they should be reserved and always be zeros. */ #define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS) #define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1) #define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2) #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) static inline bool is_pfn_swap_entry(swp_entry_t entry); /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { if (pte_swp_exclusive(pte)) pte = pte_swp_clear_exclusive(pte); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); if (pte_swp_uffd_wp(pte)) pte = pte_swp_clear_uffd_wp(pte); return pte; } /* * Store a type+offset into a swp_entry_t in an arch-independent format */ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) { swp_entry_t ret; ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK); return ret; } /* * Extract the `type' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline unsigned swp_type(swp_entry_t entry) { return (entry.val >> SWP_TYPE_SHIFT); } /* * Extract the `offset' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline pgoff_t swp_offset(swp_entry_t entry) { return entry.val & SWP_OFFSET_MASK; } /* * This should only be called upon a pfn swap entry to get the PFN stored * in the swap entry. Please refers to is_pfn_swap_entry() for definition * of pfn swap entry. */ static inline unsigned long swp_offset_pfn(swp_entry_t entry) { VM_BUG_ON(!is_pfn_swap_entry(entry)); return swp_offset(entry) & SWP_PFN_MASK; } /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { return !pte_none(pte) && !pte_present(pte); } /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. */ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; pte = pte_swp_clear_flags(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } /* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. */ static inline pte_t swp_entry_to_pte(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pte(arch_entry); } static inline swp_entry_t radix_to_swp_entry(void *arg) { swp_entry_t entry; entry.val = xa_to_value(arg); return entry; } static inline void *swp_to_radix_entry(swp_entry_t entry) { return xa_mk_value(entry.val); } #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_READ, offset); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_WRITE, offset); } static inline bool is_device_private_entry(swp_entry_t entry) { int type = swp_type(entry); return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; } static inline bool is_writable_device_private_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset); } static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ || swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE; } static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE); } #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline bool is_device_private_entry(swp_entry_t entry) { return false; } static inline bool is_writable_device_private_entry(swp_entry_t entry) { return false; } static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { return false; } static inline bool is_writable_device_exclusive_entry(swp_entry_t entry) { return false; } #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION static inline int is_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ || swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || swp_type(entry) == SWP_MIGRATION_WRITE); } static inline int is_writable_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); } static inline int is_readable_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ); } static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); } static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ, offset); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_WRITE, offset); } /* * Returns whether the host has large enough swap offset field to support * carrying over pgtable A/D bits for page migrations. The result is * pretty much arch specific. */ static inline bool migration_entry_supports_ad(void) { #ifdef CONFIG_SWAP return swap_migration_ad_supported; #else /* CONFIG_SWAP */ return false; #endif /* CONFIG_SWAP */ } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_YOUNG); return entry; } static inline bool is_migration_entry_young(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_YOUNG; /* Keep the old behavior of aging page after migration */ return false; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_DIRTY); return entry; } static inline bool is_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_DIRTY; /* Keep the old behavior of clean page after migration */ return false; } extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline int is_migration_entry(swp_entry_t swp) { return 0; } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; } static inline int is_readable_migration_entry(swp_entry_t entry) { return 0; } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { return entry; } static inline bool is_migration_entry_young(swp_entry_t entry) { return false; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } static inline bool is_migration_entry_dirty(swp_entry_t entry) { return false; } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE /* * Support for hardware poisoned pages */ static inline swp_entry_t make_hwpoison_entry(struct page *page) { BUG_ON(!PageLocked(page)); return swp_entry(SWP_HWPOISON, page_to_pfn(page)); } static inline int is_hwpoison_entry(swp_entry_t entry) { return swp_type(entry) == SWP_HWPOISON; } #else static inline swp_entry_t make_hwpoison_entry(struct page *page) { return swp_entry(0, 0); } static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } #endif typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) /* * "Poisoned" here is meant in the very general sense of "future accesses are * invalid", instead of referring very specifically to hardware memory errors. * This marker is meant to represent any of various different causes of this. * * Note that, when encountered by the faulting logic, PTEs with this marker will * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error * logic. */ #define PTE_MARKER_POISONED BIT(1) /* * Indicates that, on fault, this PTE will case a SIGSEGV signal to be * sent. This means guard markers behave in effect as if the region were mapped * PROT_NONE, rather than if they were a memory hole or equivalent. */ #define PTE_MARKER_GUARD BIT(2) #define PTE_MARKER_MASK (BIT(3) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { return swp_entry(SWP_PTE_MARKER, marker); } static inline bool is_pte_marker_entry(swp_entry_t entry) { return swp_type(entry) == SWP_PTE_MARKER; } static inline pte_marker pte_marker_get(swp_entry_t entry) { return swp_offset(entry) & PTE_MARKER_MASK; } static inline bool is_pte_marker(pte_t pte) { return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); } static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); } static inline swp_entry_t make_poisoned_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_POISONED); } static inline int is_poisoned_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_POISONED); } static inline swp_entry_t make_guard_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_GUARD); } static inline int is_guard_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_GUARD); } /* * This is a special version to check pte_none() just to cover the case when * the pte is a pte marker. It existed because in many cases the pte marker * should be seen as a none pte; it's just that we have stored some information * onto the none pte so it becomes not-none any more. * * It should be used when the pte is file-backed, ram-based and backing * userspace pages, like shmem. It is not needed upon pgtables that do not * support pte markers at all. For example, it's not needed on anonymous * memory, kernel-only memory (including when the system is during-boot), * non-ram based generic file-system. It's fine to be used even there, but the * extra pte marker check will be pure overhead. */ static inline int pte_none_mostly(pte_t pte) { return pte_none(pte) || is_pte_marker(pte); } static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the * corresponding page is locked */ BUG_ON(is_migration_entry(entry) && !PageLocked(p)); return p; } static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) { struct folio *folio = pfn_folio(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the * corresponding folio is locked */ BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); return folio; } /* * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. They can either be used to represent unaddressable device * memory, to restrict access to a page undergoing migration or to represent a * pfn which has been hwpoisoned and unmapped. */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { /* Make sure the swp offset can always store the needed fields */ BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); return is_migration_entry(entry) || is_device_private_entry(entry) || is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); } struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page); extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new); extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { swp_entry_t arch_entry; if (pmd_swp_soft_dirty(pmd)) pmd = pmd_swp_clear_soft_dirty(pmd); if (pmd_swp_uffd_wp(pmd)) pmd = pmd_swp_clear_uffd_wp(pmd); arch_entry = __pmd_to_swp_entry(pmd); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pmd(arch_entry); } static inline int is_pmd_migration_entry(pmd_t pmd) { return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { BUILD_BUG(); } static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { BUILD_BUG(); } static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { return swp_entry(0, 0); } static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } static inline int is_pmd_migration_entry(pmd_t pmd) { return 0; } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ |
5 5 5 5 5 5 5 5 3 2 2 2 2 1 1 2 2 3 3 3 3 3 3 3 3 6 3 3 3 3 5 4 5 2 3 3 2 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 | // SPDX-License-Identifier: GPL-2.0-only /* * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. * * Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation * * ChangeLog: * * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> * Changed the compression method from stem compression to "table lookup" * compression (see scripts/kallsyms.c for a more complete description) */ #include <linux/kallsyms.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/kdb.h> #include <linux/err.h> #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ #include <linux/ctype.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/kprobes.h> #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/bsearch.h> #include <linux/btf_ids.h> #include "kallsyms_internal.h" /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, * given the offset to where the symbol is in the compressed stream. */ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result, size_t maxlen) { int len, skipped_first = 0; const char *tptr; const u8 *data; /* Get the compressed symbol length from the first symbol byte. */ data = &kallsyms_names[off]; len = *data; data++; off++; /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */ if ((len & 0x80) != 0) { len = (len & 0x7F) | (*data << 7); data++; off++; } /* * Update the offset to return the offset for the next symbol on * the compressed stream. */ off += len; /* * For every byte on the compressed symbol data, copy the table * entry for that byte. */ while (len) { tptr = &kallsyms_token_table[kallsyms_token_index[*data]]; data++; len--; while (*tptr) { if (skipped_first) { if (maxlen <= 1) goto tail; *result = *tptr; result++; maxlen--; } else skipped_first = 1; tptr++; } } tail: if (maxlen) *result = '\0'; /* Return to offset to the next symbol. */ return off; } /* * Get symbol type information. This is encoded as a single char at the * beginning of the symbol name. */ static char kallsyms_get_symbol_type(unsigned int off) { /* * Get just the first code, look it up in the token table, * and return the first char from this token. */ return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; } /* * Find the offset on the compressed stream given and index in the * kallsyms array. */ static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; int i, len; /* * Use the closest marker we have. We have markers every 256 positions, * so that should be close enough. */ name = &kallsyms_names[kallsyms_markers[pos >> 8]]; /* * Sequentially scan all the symbols up to the point we're searching * for. Every symbol is stored in a [<len>][<len> bytes of data] format, * so we just need to add the len to the current pointer for every * symbol we wish to skip. */ for (i = 0; i < (pos & 0xFF); i++) { len = *name; /* * If MSB is 1, it is a "big" symbol, so we need to look into * the next byte (and skip it, too). */ if ((len & 0x80) != 0) len = ((len & 0x7F) | (name[1] << 7)) + 1; name = name + len + 1; } return name - kallsyms_names; } unsigned long kallsyms_sym_address(int idx) { /* values are unsigned offsets if --absolute-percpu is not in effect */ if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; /* ...otherwise, positive offsets are absolute values */ if (kallsyms_offsets[idx] >= 0) return kallsyms_offsets[idx]; /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } static unsigned int get_symbol_seq(int index) { unsigned int i, seq = 0; for (i = 0; i < 3; i++) seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i]; return seq; } static int kallsyms_lookup_names(const char *name, unsigned int *start, unsigned int *end) { int ret; int low, mid, high; unsigned int seq, off; char namebuf[KSYM_NAME_LEN]; low = 0; high = kallsyms_num_syms - 1; while (low <= high) { mid = low + (high - low) / 2; seq = get_symbol_seq(mid); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = strcmp(name, namebuf); if (ret > 0) low = mid + 1; else if (ret < 0) high = mid - 1; else break; } if (low > high) return -ESRCH; low = mid; while (low) { seq = get_symbol_seq(low - 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(name, namebuf)) break; low--; } *start = low; if (end) { high = mid; while (high < kallsyms_num_syms - 1) { seq = get_symbol_seq(high + 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(name, namebuf)) break; high++; } *end = high; } return 0; } /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { int ret; unsigned int i; /* Skip the search for empty string. */ if (!*name) return 0; ret = kallsyms_lookup_names(name, &i, NULL); if (!ret) return kallsyms_sym_address(get_symbol_seq(i)); return module_kallsyms_lookup_name(name); } /* * Iterate over all symbols in vmlinux. For symbols from modules use * module_kallsyms_on_each_symbol instead. */ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data) { char namebuf[KSYM_NAME_LEN]; unsigned long i; unsigned int off; int ret; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = fn(data, namebuf, kallsyms_sym_address(i)); if (ret != 0) return ret; cond_resched(); } return 0; } int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data) { int ret; unsigned int i, start, end; ret = kallsyms_lookup_names(name, &start, &end); if (ret) return 0; for (i = start; !ret && i <= end; i++) { ret = fn(data, kallsyms_sym_address(get_symbol_seq(i))); cond_resched(); } return ret; } static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; /* Do a binary search on the sorted kallsyms_offsets array. */ low = 0; high = kallsyms_num_syms; while (high - low > 1) { mid = low + (high - low) / 2; if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; } /* * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { if (kallsyms_sym_address(i) > symbol_start) { symbol_end = kallsyms_sym_address(i); break; } } /* If we found no next symbol, we use the end of the section. */ if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) symbol_end = (unsigned long)_end; else symbol_end = (unsigned long)_etext; } if (symbolsize) *symbolsize = symbol_end - symbol_start; if (offset) *offset = addr - symbol_start; return low; } /* * Lookup an address but don't bother to find any names. */ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; if (is_ksym_addr(addr)) { get_symbol_pos(addr, symbolsize, offset); return 1; } return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } static int kallsyms_lookup_buildid(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { int ret; namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; if (is_ksym_addr(addr)) { unsigned long pos; pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; if (modbuildid) *modbuildid = NULL; return strlen(namebuf); } /* See if it's in a module or a BPF JITed image. */ ret = module_address_lookup(addr, symbolsize, offset, modname, modbuildid, namebuf); if (!ret) ret = bpf_address_lookup(addr, symbolsize, offset, modname, namebuf); if (!ret) ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); return ret; } /* * Lookup an address * - modname is set to NULL if it's in the kernel. * - We guarantee that the returned name is valid until we reschedule even if. * It resides in a module. * - We also guarantee that modname will be valid until rescheduled. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { int ret = kallsyms_lookup_buildid(addr, symbolsize, offset, modname, NULL, namebuf); if (!ret) return NULL; return namebuf; } int lookup_symbol_name(unsigned long addr, char *symname) { symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; if (is_ksym_addr(addr)) { unsigned long pos; pos = get_symbol_pos(addr, NULL, NULL); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), symname, KSYM_NAME_LEN); return 0; } /* See if it's in a module. */ return lookup_module_symbol_name(addr, symname); } /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, int symbol_offset, int add_offset, int add_buildid) { char *modname; const unsigned char *buildid; unsigned long offset, size; int len; address += symbol_offset; len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); if (!len) return sprintf(buffer, "0x%lx", address - symbol_offset); offset -= symbol_offset; if (add_offset) len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); if (modname) { len += sprintf(buffer + len, " [%s", modname); #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) if (add_buildid && buildid) { /* build ID should match length of sprintf */ #if IS_ENABLED(CONFIG_MODULES) static_assert(sizeof(typeof_member(struct module, build_id)) == 20); #endif len += sprintf(buffer + len, " %20phN", buildid); } #endif len += sprintf(buffer + len, "]"); } return len; } /** * sprint_symbol - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name, * offset, size and module name to @buffer if possible. If no symbol was found, * just saves its @address as is. * * This function returns the number of bytes stored in @buffer. */ int sprint_symbol(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 1, 0); } EXPORT_SYMBOL_GPL(sprint_symbol); /** * sprint_symbol_build_id - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name, * offset, size, module name and module build ID to @buffer if possible. If no * symbol was found, just saves its @address as is. * * This function returns the number of bytes stored in @buffer. */ int sprint_symbol_build_id(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 1, 1); } EXPORT_SYMBOL_GPL(sprint_symbol_build_id); /** * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name * and module name to @buffer if possible. If no symbol was found, just saves * its @address as is. * * This function returns the number of bytes stored in @buffer. */ int sprint_symbol_no_offset(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 0, 0); } EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); /** * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function is for stack backtrace and does the same thing as * sprint_symbol() but with modified/decreased @address. If there is a * tail-call to the function marked "noreturn", gcc optimized out code after * the call so that the stack-saved return address could point outside of the * caller. This function ensures that kallsyms will find the original caller * by decreasing @address. * * This function returns the number of bytes stored in @buffer. */ int sprint_backtrace(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, -1, 1, 0); } /** * sprint_backtrace_build_id - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function is for stack backtrace and does the same thing as * sprint_symbol() but with modified/decreased @address. If there is a * tail-call to the function marked "noreturn", gcc optimized out code after * the call so that the stack-saved return address could point outside of the * caller. This function ensures that kallsyms will find the original caller * by decreasing @address. This function also appends the module build ID to * the @buffer if @address is within a kernel module. * * This function returns the number of bytes stored in @buffer. */ int sprint_backtrace_build_id(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, -1, 1, 1); } /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; char name[KSYM_NAME_LEN]; char module_name[MODULE_NAME_LEN]; int exported; int show_value; }; static int get_ksymbol_mod(struct kallsym_iter *iter) { int ret = module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); if (ret < 0) { iter->pos_mod_end = iter->pos; return 0; } return 1; } /* * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace * purposes. In that case "__builtin__ftrace" is used as a module name, even * though "__builtin__ftrace" is not a module. */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); if (ret < 0) { iter->pos_ftrace_mod_end = iter->pos; return 0; } return 1; } static int get_ksymbol_bpf(struct kallsym_iter *iter) { int ret; strscpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, iter->name); if (ret < 0) { iter->pos_bpf_end = iter->pos; return 0; } return 1; } /* * This uses "__builtin__kprobes" as a module name for symbols for pages * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a * module. */ static int get_ksymbol_kprobe(struct kallsym_iter *iter) { strscpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); iter->exported = 0; return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, &iter->value, &iter->type, iter->name) < 0 ? 0 : 1; } /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { unsigned off = iter->nameoff; iter->module_name[0] = '\0'; iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name)); return off - iter->nameoff; } static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) { iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; if (new_pos == 0) { iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; iter->pos_bpf_end = 0; } } /* * The end position (last + 1) of each additional kallsyms section is recorded * in iter->pos_..._end as each section is added, and so can be used to * determine which get_ksymbol_...() function to call next. */ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; if ((!iter->pos_mod_end || iter->pos_mod_end > pos) && get_ksymbol_mod(iter)) return 1; if ((!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > pos) && get_ksymbol_ftrace_mod(iter)) return 1; if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && get_ksymbol_bpf(iter)) return 1; return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ static int update_iter(struct kallsym_iter *iter, loff_t pos) { /* Module symbols can be accessed randomly. */ if (pos >= kallsyms_num_syms) return update_iter_mod(iter, pos); /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) reset_iter(iter, pos); iter->nameoff += get_ksymbol_core(iter); iter->pos++; return 1; } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { (*pos)++; if (!update_iter(m->private, *pos)) return NULL; return p; } static void *s_start(struct seq_file *m, loff_t *pos) { if (!update_iter(m->private, *pos)) return NULL; return m->private; } static void s_stop(struct seq_file *m, void *p) { } static int s_show(struct seq_file *m, void *p) { void *value; struct kallsym_iter *iter = m->private; /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; value = iter->show_value ? (void *)iter->value : NULL; if (iter->module_name[0]) { char type; /* * Label it "global" if it is exported, * "local" if not exported. */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); seq_printf(m, "%px %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else seq_printf(m, "%px %c %s\n", value, iter->type, iter->name); return 0; } static const struct seq_operations kallsyms_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show }; #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__ksym { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct kallsym_iter *, ksym); }; static int ksym_prog_seq_show(struct seq_file *m, bool in_stop) { struct bpf_iter__ksym ctx; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = m; prog = bpf_iter_get_info(&meta, in_stop); if (!prog) return 0; ctx.meta = &meta; ctx.ksym = m ? m->private : NULL; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p) { return ksym_prog_seq_show(m, false); } static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p) { if (!p) (void) ksym_prog_seq_show(m, true); else s_stop(m, p); } static const struct seq_operations bpf_iter_ksym_ops = { .start = s_start, .next = s_next, .stop = bpf_iter_ksym_seq_stop, .show = bpf_iter_ksym_seq_show, }; static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux) { struct kallsym_iter *iter = priv_data; reset_iter(iter, 0); /* cache here as in kallsyms_open() case; use current process * credentials to tell BPF iterators if values should be shown. */ iter->show_value = kallsyms_show_value(current_cred()); return 0; } DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym) static const struct bpf_iter_seq_info ksym_iter_seq_info = { .seq_ops = &bpf_iter_ksym_ops, .init_seq_private = bpf_iter_ksym_init, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct kallsym_iter), }; static struct bpf_iter_reg ksym_iter_reg_info = { .target = "ksym", .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ksym, ksym), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &ksym_iter_seq_info, }; BTF_ID_LIST(btf_ksym_iter_id) BTF_ID(struct, kallsym_iter) static int __init bpf_ksym_iter_register(void) { ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id; return bpf_iter_reg_target(&ksym_iter_reg_info); } late_initcall(bpf_ksym_iter_register); #endif /* CONFIG_BPF_SYSCALL */ static int kallsyms_open(struct inode *inode, struct file *file) { /* * We keep iterator in m->private, since normal case is to * s_start from where we left off, so we avoid doing * using get_symbol_offset for every symbol. */ struct kallsym_iter *iter; iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter)); if (!iter) return -ENOMEM; reset_iter(iter, 0); /* * Instead of checking this on every s_show() call, cache * the result here at open time. */ iter->show_value = kallsyms_show_value(file->f_cred); return 0; } #ifdef CONFIG_KGDB_KDB const char *kdb_walk_kallsyms(loff_t *pos) { static struct kallsym_iter kdb_walk_kallsyms_iter; if (*pos == 0) { memset(&kdb_walk_kallsyms_iter, 0, sizeof(kdb_walk_kallsyms_iter)); reset_iter(&kdb_walk_kallsyms_iter, 0); } while (1) { if (!update_iter(&kdb_walk_kallsyms_iter, *pos)) return NULL; ++*pos; /* Some debugging symbols have no name. Ignore them. */ if (kdb_walk_kallsyms_iter.name[0]) return kdb_walk_kallsyms_iter.name; } } #endif /* CONFIG_KGDB_KDB */ static const struct proc_ops kallsyms_proc_ops = { .proc_open = kallsyms_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, }; static int __init kallsyms_init(void) { proc_create("kallsyms", 0444, NULL, &kallsyms_proc_ops); return 0; } device_initcall(kallsyms_init); |
10 10 10 10 9 10 10 10 9 10 10 10 10 10 10 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/mac80211/util.c */ #include "ieee802154_i.h" #include "driver-ops.h" /* privid for wpan_phys to determine whether they belong to us or not */ const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid; /** * ieee802154_wake_queue - wake ieee802154 queue * @hw: main hardware object * * Tranceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles * one frame at a time for each phy, which means we had to stop the queue to * avoid new skb to come during the transmission. The queue then needs to be * woken up after the operation. */ static void ieee802154_wake_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; rcu_read_lock(); clear_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_wake_queue(sdata->dev); } rcu_read_unlock(); } /** * ieee802154_stop_queue - stop ieee802154 queue * @hw: main hardware object * * Tranceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles * one frame at a time for each phy, which means we need to tell upper layers to * stop giving us new skbs while we are busy with the transmitted one. The queue * must then be stopped before transmitting. */ static void ieee802154_stop_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_stop_queue(sdata->dev); } rcu_read_unlock(); } void ieee802154_hold_queue(struct ieee802154_local *local) { unsigned long flags; spin_lock_irqsave(&local->phy->queue_lock, flags); if (!atomic_fetch_inc(&local->phy->hold_txs)) ieee802154_stop_queue(&local->hw); spin_unlock_irqrestore(&local->phy->queue_lock, flags); } void ieee802154_release_queue(struct ieee802154_local *local) { unsigned long flags; spin_lock_irqsave(&local->phy->queue_lock, flags); if (atomic_dec_and_test(&local->phy->hold_txs)) ieee802154_wake_queue(&local->hw); spin_unlock_irqrestore(&local->phy->queue_lock, flags); } void ieee802154_disable_queue(struct ieee802154_local *local) { struct ieee802154_sub_if_data *sdata; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_tx_disable(sdata->dev); } rcu_read_unlock(); } enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer) { struct ieee802154_local *local = container_of(timer, struct ieee802154_local, ifs_timer); ieee802154_release_queue(local); return HRTIMER_NORESTART; } void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, bool ifs_handling) { struct ieee802154_local *local = hw_to_local(hw); local->tx_result = IEEE802154_SUCCESS; if (ifs_handling) { u8 max_sifs_size; /* If transceiver sets CRC on his own we need to use lifs * threshold len above 16 otherwise 18, because it's not * part of skb->len. */ if (hw->flags & IEEE802154_HW_TX_OMIT_CKSUM) max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE - IEEE802154_FCS_LEN; else max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE; if (skb->len > max_sifs_size) hrtimer_start(&local->ifs_timer, hw->phy->lifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); else hrtimer_start(&local->ifs_timer, hw->phy->sifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); } else { ieee802154_release_queue(local); } dev_consume_skb_any(skb); if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_complete); void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, int reason) { struct ieee802154_local *local = hw_to_local(hw); local->tx_result = reason; ieee802154_release_queue(local); dev_kfree_skb_any(skb); if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_error); void ieee802154_xmit_hw_error(struct ieee802154_hw *hw, struct sk_buff *skb) { ieee802154_xmit_error(hw, skb, IEEE802154_SYSTEM_ERROR); } EXPORT_SYMBOL(ieee802154_xmit_hw_error); void ieee802154_stop_device(struct ieee802154_local *local) { flush_workqueue(local->workqueue); hrtimer_cancel(&local->ifs_timer); drv_stop(local); } |
101 104 55 2 146 2 98 146 102 55 207 25 4 169 8 4 20 1 1 140 10 1 174 1 15 15 7 8 170 2 5 170 5 14 2 2 130 132 4 33 20 25 25 1 24 8 9 6 3 9 1 5 4 16 2 16 7 26 23 7 29 26 30 30 30 30 30 30 30 30 18 30 32 32 32 29 2 26 6 18 109 110 1 11 2 107 85 47 40 85 108 131 132 132 130 132 874 875 79 199 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 | // SPDX-License-Identifier: GPL-2.0-only #include <net/sock.h> #include <linux/ethtool_netlink.h> #include <linux/phy_link_topology.h> #include <linux/pm_runtime.h> #include "netlink.h" #include "module_fw.h" static struct genl_family ethtool_genl_family; static bool ethnl_ok __read_mostly; static u32 ethnl_bcast_seq; #define ETHTOOL_FLAGS_BASIC (ETHTOOL_FLAG_COMPACT_BITSETS | \ ETHTOOL_FLAG_OMIT_REPLY) #define ETHTOOL_FLAGS_STATS (ETHTOOL_FLAGS_BASIC | ETHTOOL_FLAG_STATS) const struct nla_policy ethnl_header_policy[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_BASIC), }; const struct nla_policy ethnl_header_policy_stats[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_STATS), }; const struct nla_policy ethnl_header_policy_phy[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_BASIC), [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; const struct nla_policy ethnl_header_policy_phy_stats[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_STATS), [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, enum ethnl_sock_type type) { struct ethnl_sock_priv *sk_priv; sk_priv = genl_sk_priv_get(ðtool_genl_family, NETLINK_CB(skb).sk); if (IS_ERR(sk_priv)) return PTR_ERR(sk_priv); sk_priv->dev = dev; sk_priv->portid = portid; sk_priv->type = type; return 0; } static void ethnl_sock_priv_destroy(void *priv) { struct ethnl_sock_priv *sk_priv = priv; switch (sk_priv->type) { case ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH: ethnl_module_fw_flash_sock_destroy(sk_priv); break; default: break; } } int ethnl_ops_begin(struct net_device *dev) { int ret; if (!dev) return -ENODEV; if (dev->dev.parent) pm_runtime_get_sync(dev->dev.parent); if (!netif_device_present(dev) || dev->reg_state == NETREG_UNREGISTERING) { ret = -ENODEV; goto err; } if (dev->ethtool_ops->begin) { ret = dev->ethtool_ops->begin(dev); if (ret) goto err; } return 0; err: if (dev->dev.parent) pm_runtime_put(dev->dev.parent); return ret; } void ethnl_ops_complete(struct net_device *dev) { if (dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); if (dev->dev.parent) pm_runtime_put(dev->dev.parent); } /** * ethnl_parse_header_dev_get() - parse request header * @req_info: structure to put results into * @header: nest attribute with request header * @net: request netns * @extack: netlink extack for error reporting * @require_dev: fail if no device identified in header * * Parse request header in nested attribute @nest and puts results into * the structure pointed to by @req_info. Extack from @info is used for error * reporting. If req_info->dev is not null on return, reference to it has * been taken. If error is returned, *req_info is null initialized and no * reference is held. * * Return: 0 on success or negative error code */ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *header, struct net *net, struct netlink_ext_ack *extack, bool require_dev) { struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)]; const struct nlattr *devname_attr; struct net_device *dev = NULL; u32 flags = 0; int ret; if (!header) { if (!require_dev) return 0; NL_SET_ERR_MSG(extack, "request header missing"); return -EINVAL; } /* No validation here, command policy should have a nested policy set * for the header, therefore validation should have already been done. */ ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header, NULL, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_HEADER_FLAGS]) flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); dev = netdev_get_by_index(net, ifindex, &req_info->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_DEV_INDEX], "no device matches ifindex"); return -ENODEV; } /* if both ifindex and ifname are passed, they must match */ if (devname_attr && strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) { netdev_put(dev, &req_info->dev_tracker); NL_SET_ERR_MSG_ATTR(extack, header, "ifindex and name do not match"); return -ENODEV; } } else if (devname_attr) { dev = netdev_get_by_name(net, nla_data(devname_attr), &req_info->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, devname_attr, "no device matches name"); return -ENODEV; } } else if (require_dev) { NL_SET_ERR_MSG_ATTR(extack, header, "neither ifindex nor name specified"); return -EINVAL; } if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) { if (dev) { req_info->phy_index = nla_get_u32(tb[ETHTOOL_A_HEADER_PHY_INDEX]); } else { NL_SET_ERR_MSG_ATTR(extack, header, "phy_index set without a netdev"); return -EINVAL; } } req_info->dev = dev; req_info->flags = flags; return 0; } struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, const struct nlattr *header, struct netlink_ext_ack *extack) { struct phy_device *phydev; ASSERT_RTNL(); if (!req_info->dev) return NULL; if (!req_info->phy_index) return req_info->dev->phydev; phydev = phy_link_topo_get_phy(req_info->dev, req_info->phy_index); if (!phydev) { NL_SET_ERR_MSG_ATTR(extack, header, "no phy matching phyindex"); return ERR_PTR(-ENODEV); } return phydev; } /** * ethnl_fill_reply_header() - Put common header into a reply message * @skb: skb with the message * @dev: network device to describe in header * @attrtype: attribute type to use for the nest * * Create a nested attribute with attributes describing given network device. * * Return: 0 on success, error value (-EMSGSIZE only) on error */ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, u16 attrtype) { struct nlattr *nest; if (!dev) return 0; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) || nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name)) goto nla_put_failure; /* If more attributes are put into reply header, ethnl_header_size() * must be updated to account for them. */ nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } /** * ethnl_reply_init() - Create skb for a reply and fill device identification * @payload: payload length (without netlink and genetlink header) * @dev: device the reply is about (may be null) * @cmd: ETHTOOL_MSG_* message type for reply * @hdr_attrtype: attribute type for common header * @info: genetlink info of the received packet we respond to * @ehdrp: place to store payload pointer returned by genlmsg_new() * * Return: pointer to allocated skb on success, NULL on error */ struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp) { struct sk_buff *skb; skb = genlmsg_new(payload, GFP_KERNEL); if (!skb) goto err; *ehdrp = genlmsg_put_reply(skb, info, ðtool_genl_family, 0, cmd); if (!*ehdrp) goto err_free; if (dev) { int ret; ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype); if (ret < 0) goto err_free; } return skb; err_free: nlmsg_free(skb); err: if (info) GENL_SET_ERR_MSG(info, "failed to setup reply message"); return NULL; } void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd) { return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, ðtool_genl_family, 0, cmd); } void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, cmd); } void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd) { return genlmsg_put(skb, portid, seq, ðtool_genl_family, 0, cmd); } int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) { return genlmsg_multicast_netns(ðtool_genl_family, dev_net(dev), skb, 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); } /* GET request helpers */ /** * struct ethnl_dump_ctx - context structure for generic dumpit() callback * @ops: request ops of currently processed message type * @req_info: parsed request header of processed request * @reply_data: data needed to compose the reply * @pos_ifindex: saved iteration position - ifindex * * These parameters are kept in struct netlink_callback as context preserved * between iterations. They are initialized by ethnl_default_start() and used * in ethnl_default_dumpit() and ethnl_default_done(). */ struct ethnl_dump_ctx { const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct ethnl_reply_data *reply_data; unsigned long pos_ifindex; }; static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, [ETHTOOL_MSG_LINKINFO_GET] = ðnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKINFO_SET] = ðnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_GET] = ðnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKMODES_SET] = ðnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKSTATE_GET] = ðnl_linkstate_request_ops, [ETHTOOL_MSG_DEBUG_GET] = ðnl_debug_request_ops, [ETHTOOL_MSG_DEBUG_SET] = ðnl_debug_request_ops, [ETHTOOL_MSG_WOL_GET] = ðnl_wol_request_ops, [ETHTOOL_MSG_WOL_SET] = ðnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_GET] = ðnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_GET] = ðnl_privflags_request_ops, [ETHTOOL_MSG_PRIVFLAGS_SET] = ðnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_GET] = ðnl_rings_request_ops, [ETHTOOL_MSG_RINGS_SET] = ðnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_GET] = ðnl_channels_request_ops, [ETHTOOL_MSG_CHANNELS_SET] = ðnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_GET] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_COALESCE_SET] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = ðnl_pause_request_ops, [ETHTOOL_MSG_PAUSE_SET] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_GET] = ðnl_eee_request_ops, [ETHTOOL_MSG_EEE_SET] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_GET] = ðnl_fec_request_ops, [ETHTOOL_MSG_FEC_SET] = ðnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, [ETHTOOL_MSG_MODULE_SET] = ðnl_module_request_ops, [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, [ETHTOOL_MSG_PSE_SET] = ðnl_pse_request_ops, [ETHTOOL_MSG_RSS_GET] = ðnl_rss_request_ops, [ETHTOOL_MSG_PLCA_GET_CFG] = ðnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_SET_CFG] = ðnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_GET_STATUS] = ðnl_plca_status_request_ops, [ETHTOOL_MSG_MM_GET] = ðnl_mm_request_ops, [ETHTOOL_MSG_MM_SET] = ðnl_mm_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) { return (struct ethnl_dump_ctx *)cb->ctx; } /** * ethnl_default_parse() - Parse request message * @req_info: pointer to structure to put data into * @info: genl_info from the request * @request_ops: struct request_ops for request type * @require_dev: fail if no device identified in header * * Parse universal request header and call request specific ->parse_request() * callback (if defined) to parse the rest of the message. * * Return: 0 on success or negative error code */ static int ethnl_default_parse(struct ethnl_req_info *req_info, const struct genl_info *info, const struct ethnl_request_ops *request_ops, bool require_dev) { struct nlattr **tb = info->attrs; int ret; ret = ethnl_parse_header_dev_get(req_info, tb[request_ops->hdr_attr], genl_info_net(info), info->extack, require_dev); if (ret < 0) return ret; if (request_ops->parse_request) { ret = request_ops->parse_request(req_info, tb, info->extack); if (ret < 0) return ret; } return 0; } /** * ethnl_init_reply_data() - Initialize reply data for GET request * @reply_data: pointer to embedded struct ethnl_reply_data * @ops: instance of struct ethnl_request_ops describing the layout * @dev: network device to initialize the reply for * * Fills the reply data part with zeros and sets the dev member. Must be called * before calling the ->fill_reply() callback (for each iteration when handling * dump requests). */ static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data, const struct ethnl_request_ops *ops, struct net_device *dev) { memset(reply_data, 0, ops->reply_data_size); reply_data->dev = dev; } /* default ->doit() handler for GET type requests */ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) { struct ethnl_reply_data *reply_data = NULL; struct ethnl_req_info *req_info = NULL; const u8 cmd = info->genlhdr->cmd; const struct ethnl_request_ops *ops; int hdr_len, reply_len; struct sk_buff *rskb; void *reply_payload; int ret; ops = ethnl_default_requests[cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) return -EINVAL; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { kfree(req_info); return -ENOMEM; } ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do); if (ret < 0) goto err_dev; ethnl_init_reply_data(reply_data, ops, req_info->dev); rtnl_lock(); ret = ops->prepare_data(req_info, reply_data, info); rtnl_unlock(); if (ret < 0) goto err_cleanup; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; reply_len = ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(), req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); if (!rskb) goto err_cleanup; hdr_len = rskb->len; ret = ops->fill_reply(rskb, req_info, reply_data); if (ret < 0) goto err_msg; WARN_ONCE(rskb->len - hdr_len > reply_len, "ethnl cmd %d: calculated reply length %d, but consumed %d\n", cmd, reply_len, rskb->len - hdr_len); if (ops->cleanup_data) ops->cleanup_data(reply_data); genlmsg_end(rskb, reply_payload); netdev_put(req_info->dev, &req_info->dev_tracker); kfree(reply_data); kfree(req_info); return genlmsg_reply(rskb, info); err_msg: WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); nlmsg_free(rskb); err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); err_dev: netdev_put(req_info->dev, &req_info->dev_tracker); kfree(reply_data); kfree(req_info); return ret; } static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, const struct ethnl_dump_ctx *ctx, const struct genl_info *info) { void *ehdr; int ret; ehdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, ðtool_genl_family, NLM_F_MULTI, ctx->ops->reply_cmd); if (!ehdr) return -EMSGSIZE; ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); rtnl_lock(); ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info); rtnl_unlock(); if (ret < 0) goto out; ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); if (ret < 0) goto out; ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data); out: if (ctx->ops->cleanup_data) ctx->ops->cleanup_data(ctx->reply_data); ctx->reply_data->dev = NULL; if (ret < 0) genlmsg_cancel(skb, ehdr); else genlmsg_end(skb, ehdr); return ret; } /* Default ->dumpit() handler for GET requests. */ static int ethnl_default_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); struct net_device *dev; int ret = 0; rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->pos_ifindex) { dev_hold(dev); rcu_read_unlock(); ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); rcu_read_lock(); dev_put(dev); if (ret < 0 && ret != -EOPNOTSUPP) { if (likely(skb->len)) ret = skb->len; break; } ret = 0; } rcu_read_unlock(); return ret; } /* generic ->start() handler for GET requests */ static int ethnl_default_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genlmsghdr *ghdr; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); ghdr = nlmsg_data(cb->nlh); ops = ethnl_default_requests[ghdr->cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) return -EOPNOTSUPP; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { ret = -ENOMEM; goto free_req_info; } ret = ethnl_default_parse(req_info, &info->info, ops, false); if (req_info->dev) { /* We ignore device specification in dump requests but as the * same parser as for non-dump (doit) requests is used, it * would take reference to the device if it finds one */ netdev_put(req_info->dev, &req_info->dev_tracker); req_info->dev = NULL; } if (ret < 0) goto free_reply_data; ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; return 0; free_reply_data: kfree(reply_data); free_req_info: kfree(req_info); return ret; } /* default ->done() handler for GET requests */ static int ethnl_default_done(struct netlink_callback *cb) { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); kfree(ctx->reply_data); kfree(ctx->req_info); return 0; } static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) { const struct ethnl_request_ops *ops; struct ethnl_req_info req_info = {}; const u8 cmd = info->genlhdr->cmd; int ret; ops = ethnl_default_requests[cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) return -EINVAL; ret = ethnl_parse_header_dev_get(&req_info, info->attrs[ops->hdr_attr], genl_info_net(info), info->extack, true); if (ret < 0) return ret; if (ops->set_validate) { ret = ops->set_validate(&req_info, info); /* 0 means nothing to do */ if (ret <= 0) goto out_dev; } rtnl_lock(); ret = ethnl_ops_begin(req_info.dev); if (ret < 0) goto out_rtnl; ret = ops->set(&req_info, info); if (ret <= 0) goto out_ops; ethtool_notify(req_info.dev, ops->set_ntf_cmd, NULL); ret = 0; out_ops: ethnl_ops_complete(req_info.dev); out_rtnl: rtnl_unlock(); out_dev: ethnl_parse_header_dev_put(&req_info); return ret; } static const struct ethnl_request_ops * ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKINFO_NTF] = ðnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_NTF] = ðnl_linkmodes_request_ops, [ETHTOOL_MSG_DEBUG_NTF] = ðnl_debug_request_ops, [ETHTOOL_MSG_WOL_NTF] = ðnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_NTF] = ðnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_NTF] = ðnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_NTF] = ðnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_NTF] = ðnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_NTF] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_NTF] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_NTF] = ðnl_fec_request_ops, [ETHTOOL_MSG_MODULE_NTF] = ðnl_module_request_ops, [ETHTOOL_MSG_PLCA_NTF] = ðnl_plca_cfg_request_ops, [ETHTOOL_MSG_MM_NTF] = ðnl_mm_request_ops, }; /* default notification handler */ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, const void *data) { struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genl_info info; struct sk_buff *skb; void *reply_payload; int reply_len; int ret; genl_info_init_ntf(&info, ðtool_genl_family, cmd); if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX || !ethnl_default_notify_ops[cmd], "unexpected notification type %u\n", cmd)) return; ops = ethnl_default_notify_ops[cmd]; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { kfree(req_info); return; } req_info->dev = dev; req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; ethnl_init_reply_data(reply_data, ops, dev); ret = ops->prepare_data(req_info, reply_data, &info); if (ret < 0) goto err_cleanup; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; reply_len = ret + ethnl_reply_header_size(); skb = genlmsg_new(reply_len, GFP_KERNEL); if (!skb) goto err_cleanup; reply_payload = ethnl_bcastmsg_put(skb, cmd); if (!reply_payload) goto err_skb; ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr); if (ret < 0) goto err_msg; ret = ops->fill_reply(skb, req_info, reply_data); if (ret < 0) goto err_msg; if (ops->cleanup_data) ops->cleanup_data(reply_data); genlmsg_end(skb, reply_payload); kfree(reply_data); kfree(req_info); ethnl_multicast(skb, dev); return; err_msg: WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); err_skb: nlmsg_free(skb); err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); kfree(reply_data); kfree(req_info); return; } /* notifications */ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, const void *data); static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify, [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) { if (unlikely(!ethnl_ok)) return; ASSERT_RTNL(); if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) && ethnl_notify_handlers[cmd])) ethnl_notify_handlers[cmd](dev, cmd, data); else WARN_ONCE(1, "notification %u not implemented (dev=%s)\n", cmd, netdev_name(dev)); } EXPORT_SYMBOL(ethtool_notify); static void ethnl_notify_features(struct netdev_notifier_info *info) { struct net_device *dev = netdev_notifier_info_to_dev(info); ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); } static int ethnl_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netdev_notifier_info *info = ptr; struct netlink_ext_ack *extack; struct net_device *dev; dev = netdev_notifier_info_to_dev(info); extack = netdev_notifier_info_to_extack(info); switch (event) { case NETDEV_FEAT_CHANGE: ethnl_notify_features(ptr); break; case NETDEV_PRE_UP: if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Can't set port up while flashing module firmware"); return NOTIFY_BAD; } } return NOTIFY_DONE; } static struct notifier_block ethnl_netdev_notifier = { .notifier_call = ethnl_netdev_event, }; /* genetlink setup */ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_STRSET_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_strset_get_policy, .maxattr = ARRAY_SIZE(ethnl_strset_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKINFO_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKINFO_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_linkinfo_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKMODES_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkmodes_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKMODES_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_linkmodes_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKSTATE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkstate_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkstate_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_DEBUG_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_debug_get_policy, .maxattr = ARRAY_SIZE(ethnl_debug_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_DEBUG_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_debug_set_policy, .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_WOL_GET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_wol_get_policy, .maxattr = ARRAY_SIZE(ethnl_wol_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_WOL_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_wol_set_policy, .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEATURES_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_features_get_policy, .maxattr = ARRAY_SIZE(ethnl_features_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEATURES_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_features, .policy = ethnl_features_set_policy, .maxattr = ARRAY_SIZE(ethnl_features_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PRIVFLAGS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_privflags_get_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PRIVFLAGS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_privflags_set_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_RINGS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_rings_get_policy, .maxattr = ARRAY_SIZE(ethnl_rings_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_RINGS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_rings_set_policy, .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_CHANNELS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_channels_get_policy, .maxattr = ARRAY_SIZE(ethnl_channels_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_CHANNELS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_channels_set_policy, .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_COALESCE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_coalesce_get_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_COALESCE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_coalesce_set_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PAUSE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_pause_get_policy, .maxattr = ARRAY_SIZE(ethnl_pause_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PAUSE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_pause_set_policy, .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_EEE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_eee_get_policy, .maxattr = ARRAY_SIZE(ethnl_eee_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_EEE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_eee_set_policy, .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSINFO_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_tsinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_CABLE_TEST_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test, .policy = ethnl_cable_test_act_policy, .maxattr = ARRAY_SIZE(ethnl_cable_test_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test_tdr, .policy = ethnl_cable_test_tdr_act_policy, .maxattr = ARRAY_SIZE(ethnl_cable_test_tdr_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET, .doit = ethnl_tunnel_info_doit, .start = ethnl_tunnel_info_start, .dumpit = ethnl_tunnel_info_dumpit, .policy = ethnl_tunnel_info_get_policy, .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEC_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_fec_get_policy, .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEC_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_fec_set_policy, .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_module_eeprom_get_policy, .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_STATS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_stats_get_policy, .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_phc_vclocks_get_policy, .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_module_get_policy, .maxattr = ARRAY_SIZE(ethnl_module_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_module_set_policy, .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PSE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_pse_get_policy, .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PSE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_pse_set_policy, .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_RSS_GET, .doit = ethnl_default_doit, .start = ethnl_rss_dump_start, .dumpit = ethnl_rss_dumpit, .policy = ethnl_rss_get_policy, .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_GET_CFG, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_plca_get_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_SET_CFG, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_plca_set_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_plca_get_status_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, }, { .cmd = ETHTOOL_MSG_MM_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_mm_get_policy, .maxattr = ARRAY_SIZE(ethnl_mm_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MM_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_mm_set_policy, .maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_FW_FLASH_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_module_fw_flash, .policy = ethnl_module_fw_flash_act_policy, .maxattr = ARRAY_SIZE(ethnl_module_fw_flash_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_PHY_GET, .doit = ethnl_phy_doit, .start = ethnl_phy_start, .dumpit = ethnl_phy_dumpit, .done = ethnl_phy_done, .policy = ethnl_phy_get_policy, .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME }, }; static struct genl_family ethtool_genl_family __ro_after_init = { .name = ETHTOOL_GENL_NAME, .version = ETHTOOL_GENL_VERSION, .netnsok = true, .parallel_ops = true, .ops = ethtool_genl_ops, .n_ops = ARRAY_SIZE(ethtool_genl_ops), .resv_start_op = ETHTOOL_MSG_MODULE_GET + 1, .mcgrps = ethtool_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps), .sock_priv_size = sizeof(struct ethnl_sock_priv), .sock_priv_destroy = ethnl_sock_priv_destroy, }; /* module setup */ static int __init ethnl_init(void) { int ret; ret = genl_register_family(ðtool_genl_family); if (WARN(ret < 0, "ethtool: genetlink family registration failed")) return ret; ethnl_ok = true; ret = register_netdevice_notifier(ðnl_netdev_notifier); WARN(ret < 0, "ethtool: net device notifier registration failed"); return ret; } subsys_initcall(ethnl_init); |
84 84 92 93 2 91 93 83 84 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <asm/ucontext.h> #include <asm/fpu/signal.h> #include <asm/sighandling.h> #include <asm/syscall.h> #include <asm/sigframe.h> #include <asm/signal.h> /* * If regs->ss will cause an IRET fault, change it. Otherwise leave it * alone. Using this generally makes no sense unless * user_64bit_mode(regs) would return true. */ static void force_valid_ss(struct pt_regs *regs) { u32 ar; asm volatile ("lar %[old_ss], %[ar]\n\t" "jz 1f\n\t" /* If invalid: */ "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ "1:" : [ar] "=r" (ar) : [old_ss] "rm" ((u16)regs->ss)); /* * For a valid 64-bit user context, we need DPL 3, type * read-write data or read-write exp-down data, and S and P * set. We can't use VERW because VERW doesn't check the * P bit. */ ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) regs->ss = __USER_DS; } static bool restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, unsigned long uc_flags) { struct sigcontext sc; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1))) return false; regs->bx = sc.bx; regs->cx = sc.cx; regs->dx = sc.dx; regs->si = sc.si; regs->di = sc.di; regs->bp = sc.bp; regs->ax = sc.ax; regs->sp = sc.sp; regs->ip = sc.ip; regs->r8 = sc.r8; regs->r9 = sc.r9; regs->r10 = sc.r10; regs->r11 = sc.r11; regs->r12 = sc.r12; regs->r13 = sc.r13; regs->r14 = sc.r14; regs->r15 = sc.r15; /* Get CS/SS and force CPL3 */ regs->cs = sc.cs | 0x03; regs->ss = sc.ss | 0x03; regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); /* disable syscall checks */ regs->orig_ax = -1; /* * Fix up SS if needed for the benefit of old DOSEMU and * CRIU. */ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) force_valid_ss(regs); return fpu__restore_sig((void __user *)sc.fpstate, 0); } static __always_inline int __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { unsafe_put_user(regs->di, &sc->di, Efault); unsafe_put_user(regs->si, &sc->si, Efault); unsafe_put_user(regs->bp, &sc->bp, Efault); unsafe_put_user(regs->sp, &sc->sp, Efault); unsafe_put_user(regs->bx, &sc->bx, Efault); unsafe_put_user(regs->dx, &sc->dx, Efault); unsafe_put_user(regs->cx, &sc->cx, Efault); unsafe_put_user(regs->ax, &sc->ax, Efault); unsafe_put_user(regs->r8, &sc->r8, Efault); unsafe_put_user(regs->r9, &sc->r9, Efault); unsafe_put_user(regs->r10, &sc->r10, Efault); unsafe_put_user(regs->r11, &sc->r11, Efault); unsafe_put_user(regs->r12, &sc->r12, Efault); unsafe_put_user(regs->r13, &sc->r13, Efault); unsafe_put_user(regs->r14, &sc->r14, Efault); unsafe_put_user(regs->r15, &sc->r15, Efault); unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); unsafe_put_user(current->thread.error_code, &sc->err, Efault); unsafe_put_user(regs->ip, &sc->ip, Efault); unsafe_put_user(regs->flags, &sc->flags, Efault); unsafe_put_user(regs->cs, &sc->cs, Efault); unsafe_put_user(0, &sc->gs, Efault); unsafe_put_user(0, &sc->fs, Efault); unsafe_put_user(regs->ss, &sc->ss, Efault); unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); /* non-iBCS2 extensions.. */ unsafe_put_user(mask, &sc->oldmask, Efault); unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); return 0; Efault: return -EFAULT; } #define unsafe_put_sigcontext(sc, fp, regs, set, label) \ do { \ if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ goto label; \ } while(0); #define unsafe_put_sigmask(set, frame, label) \ unsafe_put_user(*(__u64 *)(set), \ (__u64 __user *)&(frame)->uc.uc_sigmask, \ label) static unsigned long frame_uc_flags(struct pt_regs *regs) { unsigned long flags; if (boot_cpu_has(X86_FEATURE_XSAVE)) flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; else flags = UC_SIGCONTEXT_SS; if (likely(user_64bit_mode(regs))) flags |= UC_STRICT_RESTORE_SS; return flags; } int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *set = sigmask_to_save(); struct rt_sigframe __user *frame; void __user *fp = NULL; unsigned long uc_flags; /* x86-64 should always use SA_RESTORER. */ if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; /* Create the ucontext. */ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); unsafe_put_user(0, &frame->uc.uc_link, Efault); unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); /* Set up to return from userspace. If provided, use a stub already in userspace. */ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); unsafe_put_sigmask(set, frame, Efault); user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } if (setup_signal_shadow_stack(ksig)) return -EFAULT; /* Set up registers for signal handler */ regs->di = ksig->sig; /* In case the signal handler was declared without prototypes */ regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the next argument after the signal number on the stack. */ regs->si = (unsigned long)&frame->info; regs->dx = (unsigned long)&frame->uc; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; regs->sp = (unsigned long)frame; /* * Set up the CS and SS registers to run signal handlers in * 64-bit mode, even if the handler happens to be interrupting * 32-bit or 16-bit code. * * SS is subtle. In 64-bit mode, we don't need any particular * SS descriptor, but we do need SS to be valid. It's possible * that the old SS is entirely bogus -- this can happen if the * signal we're trying to deliver is #GP or #SS caused by a bad * SS value. We also have a compatibility issue here: DOSEMU * relies on the contents of the SS register indicating the * SS value at the time of the signal, even though that code in * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU * avoids relying on sigreturn to restore SS; instead it uses * a trampoline.) So we do our best: if the old SS was valid, * we keep it. Otherwise we replace it. */ regs->cs = __USER_CS; if (unlikely(regs->ss != __USER_DS)) force_valid_ss(regs); return 0; Efault: user_access_end(); return -EFAULT; } /* * Do a signal return; undo the signal stack. */ SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; sigset_t set; unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; set_current_blocked(&set); if (restore_altstack(&frame->uc.uc_stack)) goto badframe; if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_signal_shadow_stack()) goto badframe; return regs->ax; badframe: signal_fault(regs, frame, "rt_sigreturn"); return 0; } #ifdef CONFIG_X86_X32_ABI static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { struct compat_siginfo new; copy_siginfo_to_external32(&new, from); if (from->si_signo == SIGCHLD) { new._sifields._sigchld_x32._utime = from->si_utime; new._sifields._sigchld_x32._stime = from->si_stime; } if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; return 0; } int copy_siginfo_to_user32(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { if (in_x32_syscall()) return x32_copy_siginfo_to_user(to, from); return __copy_siginfo_to_user32(to, from); } int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save(); struct rt_sigframe_x32 __user *frame; unsigned long uc_flags; void __user *restorer; void __user *fp = NULL; if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; frame = get_sigframe(ksig, regs, sizeof(*frame), &fp); uc_flags = frame_uc_flags(regs); if (setup_signal_shadow_stack(ksig)) return -EFAULT; if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; /* Create the ucontext. */ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); unsafe_put_user(0, &frame->uc.uc_link, Efault); unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); unsafe_put_user(0, &frame->uc.uc__pad0, Efault); restorer = ksig->ka.sa.sa_restorer; unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); unsafe_put_sigmask(set, frame, Efault); user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; regs->ip = (unsigned long) ksig->ka.sa.sa_handler; /* We use the x32 calling convention here... */ regs->di = ksig->sig; regs->si = (unsigned long) &frame->info; regs->dx = (unsigned long) &frame->uc; loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); regs->cs = __USER_CS; regs->ss = __USER_DS; return 0; Efault: user_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; unsigned long uc_flags; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; set_current_blocked(&set); if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_signal_shadow_stack()) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; return regs->ax; badframe: signal_fault(regs, frame, "x32 rt_sigreturn"); return 0; } #endif /* CONFIG_X86_X32_ABI */ #ifdef CONFIG_COMPAT void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { if (!act) return; if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) act->sa.sa_flags |= SA_X32_ABI; } #endif /* CONFIG_COMPAT */ /* * If adding a new si_code, there is probably new data in * the siginfo. Make sure folks bumping the si_code * limits also have to look at this code. Make sure any * new fields are handled in copy_siginfo_to_user32()! */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); /* This is part of the ABI and can never change in size: */ static_assert(sizeof(siginfo_t) == 128); /* This is a part of the ABI and can never change in alignment */ static_assert(__alignof__(siginfo_t) == 8); /* * The offsets of all the (unioned) si_fields are fixed * in the ABI, of course. Make sure none of them ever * move and are always at the beginning: */ static_assert(offsetof(siginfo_t, si_signo) == 0); static_assert(offsetof(siginfo_t, si_errno) == 4); static_assert(offsetof(siginfo_t, si_code) == 8); /* * Ensure that the size of each si_field never changes. * If it does, it is a sign that the * copy_siginfo_to_user32() code below needs to updated * along with the size in the CHECK_SI_SIZE(). * * We repeat this check for both the generic and compat * siginfos. * * Note: it is OK for these to grow as long as the whole * structure stays within the padding size (checked * above). */ #define CHECK_SI_OFFSET(name) \ static_assert(offsetof(siginfo_t, _sifields) == \ offsetof(siginfo_t, _sifields.name)) #define CHECK_SI_SIZE(name, size) \ static_assert(sizeof_field(siginfo_t, _sifields.name) == size) CHECK_SI_OFFSET(_kill); CHECK_SI_SIZE (_kill, 2*sizeof(int)); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); CHECK_SI_OFFSET(_timer); CHECK_SI_SIZE (_timer, 6*sizeof(int)); static_assert(offsetof(siginfo_t, si_tid) == 0x10); static_assert(offsetof(siginfo_t, si_overrun) == 0x14); static_assert(offsetof(siginfo_t, si_value) == 0x18); CHECK_SI_OFFSET(_rt); CHECK_SI_SIZE (_rt, 4*sizeof(int)); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); static_assert(offsetof(siginfo_t, si_value) == 0x18); CHECK_SI_OFFSET(_sigchld); CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); static_assert(offsetof(siginfo_t, si_status) == 0x18); static_assert(offsetof(siginfo_t, si_utime) == 0x20); static_assert(offsetof(siginfo_t, si_stime) == 0x28); #ifdef CONFIG_X86_X32_ABI /* no _sigchld_x32 in the generic siginfo_t */ static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) == 7*sizeof(int)); static_assert(offsetof(compat_siginfo_t, _sifields) == offsetof(compat_siginfo_t, _sifields._sigchld_x32)); static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) == 0x18); static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) == 0x20); #endif CHECK_SI_OFFSET(_sigfault); CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); static_assert(offsetof(siginfo_t, si_addr) == 0x10); static_assert(offsetof(siginfo_t, si_trapno) == 0x18); static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); static_assert(offsetof(siginfo_t, si_lower) == 0x20); static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); CHECK_SI_OFFSET(_sigpoll); CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x18); CHECK_SI_OFFSET(_sigsys); CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); static_assert(offsetof(siginfo_t, si_syscall) == 0x18); static_assert(offsetof(siginfo_t, si_arch) == 0x1C); /* any new si_fields should be added here */ |
10 10 1 1 4 4 4 4 4 4 4 4 1 4 4 1 1 4 4 4 4 4 6 6 1 6 6 5 5 1 4 3 6 1 5 5 1 1 5 6 1 5 1 1 1 1 10 2 8 6 1 6 11 11 11 6 6 1 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 | // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame router for HSR and PRP. */ #include "hsr_forward.h" #include <linux/types.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_framereg.h" struct hsr_node; /* The uses I can see for these HSR supervision frames are: * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = * 22") to reset any sequence_nr counters belonging to that node. Useful if * the other node's counter has been reset for some reason. * -- * Or not - resetting the counter and bridging the frame would create a * loop, unfortunately. * * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck * frame is received from a particular node, we know something is wrong. * We just register these (as with normal frames) and throw them away. * * 3) Allow different MAC addresses for the two slave interfaces, using the * MacAddressA field. */ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) { struct ethhdr *eth_hdr; struct hsr_sup_tag *hsr_sup_tag; struct hsrv1_ethhdr_sp *hsr_V1_hdr; struct hsr_sup_tlv *hsr_sup_tlv; u16 total_length = 0; WARN_ON_ONCE(!skb_mac_header_was_set(skb)); eth_hdr = (struct ethhdr *)skb_mac_header(skb); /* Correct addr? */ if (!ether_addr_equal(eth_hdr->h_dest, hsr->sup_multicast_addr)) return false; /* Correct ether type?. */ if (!(eth_hdr->h_proto == htons(ETH_P_PRP) || eth_hdr->h_proto == htons(ETH_P_HSR))) return false; /* Get the supervision header from correct location. */ if (eth_hdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */ total_length = sizeof(struct hsrv1_ethhdr_sp); if (!pskb_may_pull(skb, total_length)) return false; hsr_V1_hdr = (struct hsrv1_ethhdr_sp *)skb_mac_header(skb); if (hsr_V1_hdr->hsr.encap_proto != htons(ETH_P_PRP)) return false; hsr_sup_tag = &hsr_V1_hdr->hsr_sup; } else { total_length = sizeof(struct hsrv0_ethhdr_sp); if (!pskb_may_pull(skb, total_length)) return false; hsr_sup_tag = &((struct hsrv0_ethhdr_sp *)skb_mac_header(skb))->hsr_sup; } if (hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_ANNOUNCE && hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_LIFE_CHECK && hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DD && hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DA) return false; if (hsr_sup_tag->tlv.HSR_TLV_length != 12 && hsr_sup_tag->tlv.HSR_TLV_length != sizeof(struct hsr_sup_payload)) return false; /* Get next tlv */ total_length += hsr_sup_tag->tlv.HSR_TLV_length; if (!pskb_may_pull(skb, total_length)) return false; skb_pull(skb, total_length); hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; skb_push(skb, total_length); /* if this is a redbox supervision frame we need to verify * that more data is available */ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { /* tlv length must be a length of a mac address */ if (hsr_sup_tlv->HSR_TLV_length != sizeof(struct hsr_sup_payload)) return false; /* make sure another tlv follows */ total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tlv->HSR_TLV_length; if (!pskb_may_pull(skb, total_length)) return false; /* get next tlv */ skb_pull(skb, total_length); hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; skb_push(skb, total_length); } /* end of tlvs must follow at the end */ if (hsr_sup_tlv->HSR_TLV_type == HSR_TLV_EOT && hsr_sup_tlv->HSR_TLV_length != 0) return false; return true; } static bool is_proxy_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) { struct hsr_sup_payload *payload; struct ethhdr *eth_hdr; u16 total_length = 0; eth_hdr = (struct ethhdr *)skb_mac_header(skb); /* Get the HSR protocol revision. */ if (eth_hdr->h_proto == htons(ETH_P_HSR)) total_length = sizeof(struct hsrv1_ethhdr_sp); else total_length = sizeof(struct hsrv0_ethhdr_sp); if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_payload))) return false; skb_pull(skb, total_length); payload = (struct hsr_sup_payload *)skb->data; skb_push(skb, total_length); /* For RedBox (HSR-SAN) check if we have received the supervision * frame with MAC addresses from own ProxyNodeTable. */ return hsr_is_node_in_db(&hsr->proxy_node_db, payload->macaddress_A); } static struct sk_buff *create_stripped_skb_hsr(struct sk_buff *skb_in, struct hsr_frame_info *frame) { struct sk_buff *skb; int copylen; unsigned char *dst, *src; skb_pull(skb_in, HSR_HLEN); skb = __pskb_copy(skb_in, skb_headroom(skb_in) - HSR_HLEN, GFP_ATOMIC); skb_push(skb_in, HSR_HLEN); if (!skb) return NULL; skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start -= HSR_HLEN; copylen = 2 * ETH_ALEN; if (frame->is_vlan) copylen += VLAN_HLEN; src = skb_mac_header(skb_in); dst = skb_mac_header(skb); memcpy(dst, src, copylen); skb->protocol = eth_hdr(skb)->h_proto; return skb; } struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { if (!frame->skb_std) { if (frame->skb_hsr) frame->skb_std = create_stripped_skb_hsr(frame->skb_hsr, frame); else netdev_warn_once(port->dev, "Unexpected frame received in hsr_get_untagged_frame()\n"); if (!frame->skb_std) return NULL; } return skb_clone(frame->skb_std, GFP_ATOMIC); } struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { if (!frame->skb_std) { if (frame->skb_prp) { /* trim the skb by len - HSR_HLEN to exclude RCT */ skb_trim(frame->skb_prp, frame->skb_prp->len - HSR_HLEN); frame->skb_std = __pskb_copy(frame->skb_prp, skb_headroom(frame->skb_prp), GFP_ATOMIC); } else { /* Unexpected */ WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", __FILE__, __LINE__, port->dev->name); return NULL; } } return skb_clone(frame->skb_std, GFP_ATOMIC); } static void prp_set_lan_id(struct prp_rct *trailer, struct hsr_port *port) { int lane_id; if (port->type == HSR_PT_SLAVE_A) lane_id = 0; else lane_id = 1; /* Add net_id in the upper 3 bits of lane_id */ lane_id |= port->hsr->net_id; set_prp_lan_id(trailer, lane_id); } /* Tailroom for PRP rct should have been created before calling this */ static struct sk_buff *prp_fill_rct(struct sk_buff *skb, struct hsr_frame_info *frame, struct hsr_port *port) { struct prp_rct *trailer; int min_size = ETH_ZLEN; int lsdu_size; if (!skb) return skb; if (frame->is_vlan) min_size = VLAN_ETH_ZLEN; if (skb_put_padto(skb, min_size)) return NULL; trailer = (struct prp_rct *)skb_put(skb, HSR_HLEN); lsdu_size = skb->len - 14; if (frame->is_vlan) lsdu_size -= 4; prp_set_lan_id(trailer, port); set_prp_LSDU_size(trailer, lsdu_size); trailer->sequence_nr = htons(frame->sequence_nr); trailer->PRP_suffix = htons(ETH_P_PRP); skb->protocol = eth_hdr(skb)->h_proto; return skb; } static void hsr_set_path_id(struct hsr_ethhdr *hsr_ethhdr, struct hsr_port *port) { int path_id; if (port->type == HSR_PT_SLAVE_A) path_id = 0; else path_id = 1; set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id); } static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, struct hsr_port *port, u8 proto_version) { struct hsr_ethhdr *hsr_ethhdr; unsigned char *pc; int lsdu_size; /* pad to minimum packet size which is 60 + 6 (HSR tag) */ if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) return NULL; lsdu_size = skb->len - 14; if (frame->is_vlan) lsdu_size -= 4; pc = skb_mac_header(skb); if (frame->is_vlan) /* This 4-byte shift (size of a vlan tag) does not * mean that the ethhdr starts there. But rather it * provides the proper environment for accessing * the fields, such as hsr_tag etc., just like * when the vlan tag is not there. This is because * the hsr tag is after the vlan tag. */ hsr_ethhdr = (struct hsr_ethhdr *)(pc + VLAN_HLEN); else hsr_ethhdr = (struct hsr_ethhdr *)pc; hsr_set_path_id(hsr_ethhdr, port); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; hsr_ethhdr->ethhdr.h_proto = htons(proto_version ? ETH_P_HSR : ETH_P_PRP); skb->protocol = hsr_ethhdr->ethhdr.h_proto; return skb; } /* If the original frame was an HSR tagged frame, just clone it to be sent * unchanged. Otherwise, create a private frame especially tagged for 'port'. */ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { unsigned char *dst, *src; struct sk_buff *skb; int movelen; if (frame->skb_hsr) { struct hsr_ethhdr *hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr); /* set the lane id properly */ hsr_set_path_id(hsr_ethhdr, port); return skb_clone(frame->skb_hsr, GFP_ATOMIC); } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { return skb_clone(frame->skb_std, GFP_ATOMIC); } /* Create the new skb with enough headroom to fit the HSR tag */ skb = __pskb_copy(frame->skb_std, skb_headroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); if (!skb) return NULL; skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += HSR_HLEN; movelen = ETH_HLEN; if (frame->is_vlan) movelen += VLAN_HLEN; src = skb_mac_header(skb); dst = skb_push(skb, HSR_HLEN); memmove(dst, src, movelen); skb_reset_mac_header(skb); /* skb_put_padto free skb on error and hsr_fill_tag returns NULL in * that case */ return hsr_fill_tag(skb, frame, port, port->hsr->prot_version); } struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { struct sk_buff *skb; if (frame->skb_prp) { struct prp_rct *trailer = skb_get_PRP_rct(frame->skb_prp); if (trailer) { prp_set_lan_id(trailer, port); } else { WARN_ONCE(!trailer, "errored PRP skb"); return NULL; } return skb_clone(frame->skb_prp, GFP_ATOMIC); } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { return skb_clone(frame->skb_std, GFP_ATOMIC); } skb = skb_copy_expand(frame->skb_std, skb_headroom(frame->skb_std), skb_tailroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); return prp_fill_rct(skb, frame, port); } static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, struct hsr_node *node_src) { bool was_multicast_frame; int res, recv_len; was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST); hsr_addr_subst_source(node_src, skb); skb_pull(skb, ETH_HLEN); recv_len = skb->len; res = netif_rx(skb); if (res == NET_RX_DROP) { dev->stats.rx_dropped++; } else { dev->stats.rx_packets++; dev->stats.rx_bytes += recv_len; if (was_multicast_frame) dev->stats.multicast++; } } static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, struct hsr_frame_info *frame) { if (frame->port_rcv->type == HSR_PT_MASTER) { hsr_addr_subst_dest(frame->node_src, skb, port); /* Address substitution (IEC62439-3 pp 26, 50): replace mac * address of outgoing frame with that of the outgoing slave's. */ ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr); } /* When HSR node is used as RedBox - the frame received from HSR ring * requires source MAC address (SA) replacement to one which can be * recognized by SAN devices (otherwise, frames are dropped by switch) */ if (port->type == HSR_PT_INTERLINK) ether_addr_copy(eth_hdr(skb)->h_source, port->hsr->macaddress_redbox); return dev_queue_xmit(skb); } bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) { return ((frame->port_rcv->type == HSR_PT_SLAVE_A && port->type == HSR_PT_SLAVE_B) || (frame->port_rcv->type == HSR_PT_SLAVE_B && port->type == HSR_PT_SLAVE_A)); } bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) { struct sk_buff *skb; if (port->dev->features & NETIF_F_HW_HSR_FWD) return prp_drop_frame(frame, port); /* RedBox specific frames dropping policies * * Do not send HSR supervisory frames to SAN devices */ if (frame->is_supervision && port->type == HSR_PT_INTERLINK) return true; /* Do not forward to other HSR port (A or B) unicast frames which * are addressed to interlink port (and are in the ProxyNodeTable). */ skb = frame->skb_hsr; if (skb && prp_drop_frame(frame, port) && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && hsr_is_node_in_db(&port->hsr->proxy_node_db, eth_hdr(skb)->h_dest)) { return true; } /* Do not forward to port C (Interlink) frames from nodes A and B * if DA is in NodeTable. */ if ((frame->port_rcv->type == HSR_PT_SLAVE_A || frame->port_rcv->type == HSR_PT_SLAVE_B) && port->type == HSR_PT_INTERLINK) { skb = frame->skb_hsr; if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && hsr_is_node_in_db(&port->hsr->node_db, eth_hdr(skb)->h_dest)) { return true; } } /* Do not forward to port A and B unicast frames received on the * interlink port if it is addressed to one of nodes registered in * the ProxyNodeTable. */ if ((port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) && frame->port_rcv->type == HSR_PT_INTERLINK) { skb = frame->skb_std; if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && hsr_is_node_in_db(&port->hsr->proxy_node_db, eth_hdr(skb)->h_dest)) { return true; } } return false; } /* Forward the frame through all devices except: * - Back through the receiving device * - If it's a HSR frame: through a device where it has passed before * - if it's a PRP frame: through another PRP slave device (no bridge) * - To the local HSR master only if the frame is directly addressed to it, or * a non-supervision multicast or broadcast frame. * * HSR slave devices should insert a HSR tag into the frame, or forward the * frame unchanged if it's already tagged. Interlink devices should strip HSR * tags if they're of the non-HSR type (but only after duplicate discard). The * master device always strips HSR tags. */ static void hsr_forward_do(struct hsr_frame_info *frame) { struct hsr_port *port; struct sk_buff *skb; bool sent = false; hsr_for_each_port(frame->port_rcv->hsr, port) { struct hsr_priv *hsr = port->hsr; /* Don't send frame back the way it came */ if (port == frame->port_rcv) continue; /* Don't deliver locally unless we should */ if (port->type == HSR_PT_MASTER && !frame->is_local_dest) continue; /* Deliver frames directly addressed to us to master only */ if (port->type != HSR_PT_MASTER && frame->is_local_exclusive) continue; /* If hardware duplicate generation is enabled, only send out * one port. */ if ((port->dev->features & NETIF_F_HW_HSR_DUP) && sent) continue; /* Don't send frame over port where it has been sent before. * Also for SAN, this shouldn't be done. */ if (!frame->is_from_san && hsr_register_frame_out(port, frame->node_src, frame->sequence_nr)) continue; if (frame->is_supervision && port->type == HSR_PT_MASTER && !frame->is_proxy_supervision) { hsr_handle_sup_frame(frame); continue; } /* Check if frame is to be dropped. Eg. for PRP no forward * between ports, or sending HSR supervision to RedBox. */ if (hsr->proto_ops->drop_frame && hsr->proto_ops->drop_frame(frame, port)) continue; if (port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) skb = hsr->proto_ops->create_tagged_frame(frame, port); else skb = hsr->proto_ops->get_untagged_frame(frame, port); if (!skb) { frame->port_rcv->dev->stats.rx_dropped++; continue; } skb->dev = port->dev; if (port->type == HSR_PT_MASTER) { hsr_deliver_master(skb, port->dev, frame->node_src); } else { if (!hsr_xmit(skb, port, frame)) if (port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) sent = true; } } } static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb, struct hsr_frame_info *frame) { if (hsr_addr_is_self(hsr, eth_hdr(skb)->h_dest)) { frame->is_local_exclusive = true; skb->pkt_type = PACKET_HOST; } else { frame->is_local_exclusive = false; } if (skb->pkt_type == PACKET_HOST || skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST) { frame->is_local_dest = true; } else { frame->is_local_dest = false; } } static void handle_std_frame(struct sk_buff *skb, struct hsr_frame_info *frame) { struct hsr_port *port = frame->port_rcv; struct hsr_priv *hsr = port->hsr; frame->skb_hsr = NULL; frame->skb_prp = NULL; frame->skb_std = skb; if (port->type != HSR_PT_MASTER) frame->is_from_san = true; if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) { /* Sequence nr for the master/interlink node */ lockdep_assert_held(&hsr->seqnr_lock); frame->sequence_nr = hsr->sequence_nr; hsr->sequence_nr++; } } int hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame) { struct hsr_port *port = frame->port_rcv; struct hsr_priv *hsr = port->hsr; /* HSRv0 supervisory frames double as a tag so treat them as tagged. */ if ((!hsr->prot_version && proto == htons(ETH_P_PRP)) || proto == htons(ETH_P_HSR)) { /* Check if skb contains hsr_ethhdr */ if (skb->mac_len < sizeof(struct hsr_ethhdr)) return -EINVAL; /* HSR tagged frame :- Data or Supervision */ frame->skb_std = NULL; frame->skb_prp = NULL; frame->skb_hsr = skb; frame->sequence_nr = hsr_get_skb_sequence_nr(skb); return 0; } /* Standard frame or PRP from master port */ handle_std_frame(skb, frame); return 0; } int prp_fill_frame_info(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame) { /* Supervision frame */ struct prp_rct *rct = skb_get_PRP_rct(skb); if (rct && prp_check_lsdu_size(skb, rct, frame->is_supervision)) { frame->skb_hsr = NULL; frame->skb_std = NULL; frame->skb_prp = skb; frame->sequence_nr = prp_get_skb_sequence_nr(rct); return 0; } handle_std_frame(skb, frame); return 0; } static int fill_frame_info(struct hsr_frame_info *frame, struct sk_buff *skb, struct hsr_port *port) { struct hsr_priv *hsr = port->hsr; struct hsr_vlan_ethhdr *vlan_hdr; struct list_head *n_db; struct ethhdr *ethhdr; __be16 proto; int ret; /* Check if skb contains ethhdr */ if (skb->mac_len < sizeof(struct ethhdr)) return -EINVAL; memset(frame, 0, sizeof(*frame)); frame->is_supervision = is_supervision_frame(port->hsr, skb); if (frame->is_supervision && hsr->redbox) frame->is_proxy_supervision = is_proxy_supervision_frame(port->hsr, skb); n_db = &hsr->node_db; if (port->type == HSR_PT_INTERLINK) n_db = &hsr->proxy_node_db; frame->node_src = hsr_get_node(port, n_db, skb, frame->is_supervision, port->type); if (!frame->node_src) return -1; /* Unknown node and !is_supervision, or no mem */ ethhdr = (struct ethhdr *)skb_mac_header(skb); frame->is_vlan = false; proto = ethhdr->h_proto; if (proto == htons(ETH_P_8021Q)) frame->is_vlan = true; if (frame->is_vlan) { if (skb->mac_len < offsetofend(struct hsr_vlan_ethhdr, vlanhdr)) return -EINVAL; vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; } frame->is_from_san = false; frame->port_rcv = port; ret = hsr->proto_ops->fill_frame_info(proto, skb, frame); if (ret) return ret; check_local_dest(port->hsr, skb, frame); return 0; } /* Must be called holding rcu read lock (because of the port parameter) */ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) { struct hsr_frame_info frame; rcu_read_lock(); if (fill_frame_info(&frame, skb, port) < 0) goto out_drop; hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); hsr_forward_do(&frame); rcu_read_unlock(); /* Gets called for ingress frames as well as egress from master port. * So check and increment stats for master port only here. */ if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) { port->dev->stats.tx_packets++; port->dev->stats.tx_bytes += skb->len; } kfree_skb(frame.skb_hsr); kfree_skb(frame.skb_prp); kfree_skb(frame.skb_std); return; out_drop: rcu_read_unlock(); port->dev->stats.tx_dropped++; kfree_skb(skb); } |
23 23 22 4 4 4 24 24 4 4 24 24 24 24 24 24 24 24 24 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 | // SPDX-License-Identifier: GPL-2.0 /* * event tracer * * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * * - Added format output of fields of the trace point. * This was based off of work by Tom Zanussi <tzanussi@gmail.com>. * */ #define pr_fmt(fmt) fmt #include <linux/workqueue.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/sort.h> #include <linux/slab.h> #include <linux/delay.h> #include <trace/events/sched.h> #include <trace/syscall.h> #include <asm/setup.h> #include "trace_output.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); static bool eventdir_initialized; static LIST_HEAD(module_strings); struct module_string { struct list_head next; struct module *module; char *str; }; #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) static struct kmem_cache *field_cachep; static struct kmem_cache *file_cachep; static inline int system_refcount(struct event_subsystem *system) { return system->ref_count; } static int system_refcount_inc(struct event_subsystem *system) { return system->ref_count++; } static int system_refcount_dec(struct event_subsystem *system) { return --system->ref_count; } /* Double loops, do not use break, only goto's work */ #define do_for_each_event_file(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ list_for_each_entry(file, &tr->events, list) #define do_for_each_event_file_safe(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ struct trace_event_file *___n; \ list_for_each_entry_safe(file, ___n, &tr->events, list) #define while_for_each_event_file() \ } static struct ftrace_event_field * __find_event_field(struct list_head *head, char *name) { struct ftrace_event_field *field; list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; } return NULL; } struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name) { struct ftrace_event_field *field; struct list_head *head; head = trace_get_fields(call); field = __find_event_field(head, name); if (field) return field; field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, int is_signed, int filter_type, int len) { struct ftrace_event_field *field; field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) return -ENOMEM; field->name = name; field->type = type; if (filter_type == FILTER_OTHER) field->filter_type = filter_assign_type(type); else field->filter_type = filter_type; field->offset = offset; field->size = size; field->is_signed = is_signed; field->len = len; list_add(&field->link, head); return 0; } int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct list_head *head; if (WARN_ON(!call->class)) return 0; head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, is_signed, filter_type, 0); } EXPORT_SYMBOL_GPL(trace_define_field); static int trace_define_field_ext(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type, int len) { struct list_head *head; if (WARN_ON(!call->class)) return 0; head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, is_signed, filter_type, len); } #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ filter_type, 0); \ if (ret) \ return ret; #define __common_field(type, item) \ ret = __trace_define_field(&ftrace_common_fields, #type, \ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ is_signed_type(type), FILTER_OTHER, 0); \ if (ret) \ return ret; static int trace_define_generic_fields(void) { int ret; __generic_field(int, CPU, FILTER_CPU); __generic_field(int, cpu, FILTER_CPU); __generic_field(int, common_cpu, FILTER_CPU); __generic_field(char *, COMM, FILTER_COMM); __generic_field(char *, comm, FILTER_COMM); __generic_field(char *, stacktrace, FILTER_STACKTRACE); __generic_field(char *, STACKTRACE, FILTER_STACKTRACE); return ret; } static int trace_define_common_fields(void) { int ret; struct trace_entry ent; __common_field(unsigned short, type); __common_field(unsigned char, flags); /* Holds both preempt_count and migrate_disable */ __common_field(unsigned char, preempt_count); __common_field(int, pid); return ret; } static void trace_destroy_fields(struct trace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; head = trace_get_fields(call); list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); kmem_cache_free(field_cachep, field); } } /* * run-time version of trace_event_get_offsets_<call>() that returns the last * accessible offset of trace fields excluding __dynamic_array bytes */ int trace_event_get_offsets(struct trace_event_call *call) { struct ftrace_event_field *tail; struct list_head *head; head = trace_get_fields(call); /* * head->next points to the last field with the largest offset, * since it was added last by trace_define_field() */ tail = list_first_entry(head, struct ftrace_event_field, link); return tail->offset + tail->size; } /* * Check if the referenced field is an array and return true, * as arrays are OK to dereference. */ static bool test_field(const char *fmt, struct trace_event_call *call) { struct trace_event_fields *field = call->class->fields_array; const char *array_descriptor; const char *p = fmt; int len; if (!(len = str_has_prefix(fmt, "REC->"))) return false; fmt += len; for (p = fmt; *p; p++) { if (!isalnum(*p) && *p != '_') break; } len = p - fmt; for (; field->type; field++) { if (strncmp(field->name, fmt, len) || field->name[len]) continue; array_descriptor = strchr(field->type, '['); /* This is an array and is OK to dereference. */ return array_descriptor != NULL; } return false; } /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and * much later referenced after the pointer was freed. Dereferencing * pointers are OK, if it is dereferenced into the event itself. */ static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; bool first = true; const char *fmt, *c, *r, *a; int parens = 0; char in_quote = 0; int start_arg = 0; int arg = 0; int i; fmt = call->print_fmt; if (!fmt) return; for (i = 0; fmt[i]; i++) { switch (fmt[i]) { case '\\': i++; if (!fmt[i]) return; continue; case '"': case '\'': /* * The print fmt starts with a string that * is processed first to find %p* usage, * then after the first string, the print fmt * contains arguments that are used to check * if the dereferenced %p* usage is safe. */ if (first) { if (fmt[i] == '\'') continue; if (in_quote) { arg = 0; first = false; /* * If there was no %p* uses * the fmt is OK. */ if (!dereference_flags) return; } } if (in_quote) { if (in_quote == fmt[i]) in_quote = 0; } else { in_quote = fmt[i]; } continue; case '%': if (!first || !in_quote) continue; i++; if (!fmt[i]) return; switch (fmt[i]) { case '%': continue; case 'p': /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': case 'b': case 'M': case 'm': case 'I': case 'i': case 'E': case 'U': case 'V': case 'N': case 'a': case 'd': case 'D': case 'g': case 't': case 'C': case 'O': case 'f': if (WARN_ONCE(arg == 63, "Too many args for event: %s", trace_event_name(call))) return; dereference_flags |= 1ULL << arg; } break; default: { bool star = false; int j; /* Increment arg if %*s exists. */ for (j = 0; fmt[i + j]; j++) { if (isdigit(fmt[i + j]) || fmt[i + j] == '.') continue; if (fmt[i + j] == '*') { star = true; continue; } if ((fmt[i + j] == 's') && star) arg++; break; } break; } /* default */ } /* switch */ arg++; continue; case '(': if (in_quote) continue; parens++; continue; case ')': if (in_quote) continue; parens--; if (WARN_ONCE(parens < 0, "Paren mismatch for event: %s\narg='%s'\n%*s", trace_event_name(call), fmt + start_arg, (i - start_arg) + 5, "^")) return; continue; case ',': if (in_quote || parens) continue; i++; while (isspace(fmt[i])) i++; start_arg = i; if (!(dereference_flags & (1ULL << arg))) goto next_arg; /* Find the REC-> in the argument */ c = strchr(fmt + i, ','); r = strstr(fmt + i, "REC->"); if (r && (!c || r < c)) { /* * Addresses of events on the buffer, * or an array on the buffer is * OK to dereference. * There's ways to fool this, but * this is to catch common mistakes, * not malicious code. */ a = strchr(fmt + i, '&'); if ((a && (a < r)) || test_field(r, call)) dereference_flags &= ~(1ULL << arg); } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && (!c || r < c)) { dereference_flags &= ~(1ULL << arg); } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && (!c || r < c)) { dereference_flags &= ~(1ULL << arg); } next_arg: i--; arg++; } } /* * If you triggered the below warning, the trace event reported * uses an unsafe dereference pointer %p*. As the data stored * at the trace event time may no longer exist when the trace * event is printed, dereferencing to the original source is * unsafe. The source of the dereference must be copied into the * event itself, and the dereference must access the copy instead. */ if (WARN_ON_ONCE(dereference_flags)) { arg = 1; while (!(dereference_flags & 1)) { dereference_flags >>= 1; arg++; } pr_warn("event %s has unsafe dereference of argument %d\n", trace_event_name(call), arg); pr_warn("print_fmt: %s\n", fmt); } } int trace_event_raw_init(struct trace_event_call *call) { int id; id = register_trace_event(&call->event); if (!id) return -ENODEV; test_event_printk(call); return 0; } EXPORT_SYMBOL_GPL(trace_event_raw_init); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; struct trace_array_cpu *data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_raw(tr->filtered_pids); no_pid_list = rcu_dereference_raw(tr->filtered_no_pids); if (!pid_list && !no_pid_list) return false; data = this_cpu_ptr(tr->array_buffer.data); return data->ignore_pid; } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len) { struct trace_event_call *event_call = trace_file->event_call; if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) && trace_event_ignore_this_pid(trace_file)) return NULL; /* * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables * preemption (adding one to the preempt_count). Since we are * interested in the preempt_count at the time the tracepoint was * hit, we need to subtract one to offset the increment. */ fbuffer->trace_ctx = tracing_gen_ctx_dec(); fbuffer->trace_file = trace_file; fbuffer->event = trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, event_call->event.type, len, fbuffer->trace_ctx); if (!fbuffer->event) return NULL; fbuffer->regs = NULL; fbuffer->entry = ring_buffer_event_data(fbuffer->event); return fbuffer->entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); int trace_event_reg(struct trace_event_call *call, enum trace_reg type, void *data) { struct trace_event_file *file = data; WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); switch (type) { case TRACE_REG_REGISTER: return tracepoint_probe_register(call->tp, call->class->probe, file); case TRACE_REG_UNREGISTER: tracepoint_probe_unregister(call->tp, call->class->probe, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: return tracepoint_probe_register(call->tp, call->class->perf_probe, call); case TRACE_REG_PERF_UNREGISTER: tracepoint_probe_unregister(call->tp, call->class->perf_probe, call); return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: case TRACE_REG_PERF_DEL: return 0; #endif } return 0; } EXPORT_SYMBOL_GPL(trace_event_reg); void trace_event_enable_cmd_record(bool enable) { struct trace_event_file *file; struct trace_array *tr; lockdep_assert_held(&event_mutex); do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); } void trace_event_enable_tgid_record(bool enable) { struct trace_event_file *file; struct trace_array *tr; lockdep_assert_held(&event_mutex); do_for_each_event_file(tr, file) { if (!(file->flags & EVENT_FILE_FL_ENABLED)) continue; if (enable) { tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } else { tracing_stop_tgid_record(); clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } } while_for_each_event_file(); } static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; int ret = 0; int disable; switch (enable) { case 0: /* * When soft_disable is set and enable is cleared, the sm_ref * reference counter is decremented. If it reaches 0, we want * to clear the SOFT_DISABLED flag but leave the event in the * state that it was. That is, if the event was enabled and * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED * is set we do not want the event to be enabled before we * clear the bit. * * When soft_disable is not set but the SOFT_MODE flag is, * we do nothing. Do not disable the tracepoint, otherwise * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. */ if (soft_disable) { if (atomic_dec_return(&file->sm_ref) > 0) break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); /* Disable use of trace_buffered_event */ trace_buffered_event_disable(); } else disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); if (file->flags & EVENT_FILE_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { tracing_stop_tgid_record(); clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & EVENT_FILE_FL_SOFT_MODE) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: /* * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do * nothing (but set SOFT_MODE). If the event is disabled, we * set SOFT_DISABLED before enabling the event tracepoint, so * it still seems to be disabled. */ if (!soft_disable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else { if (atomic_inc_return(&file->sm_ref) > 1) break; set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); /* Enable use of trace_buffered_event */ trace_buffered_event_enable(); } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { bool cmd = false, tgid = false; /* Keep the event disabled, when going to SOFT_MODE. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { tgid = true; tracing_start_tgid_record(); set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { if (cmd) tracing_stop_cmdline_record(); if (tgid) tracing_stop_tgid_record(); pr_info("event trace: Could not enable event " "%s\n", trace_event_name(call)); break; } set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); } break; } return ret; } int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { return __ftrace_event_enable_disable(file, enable, soft_disable); } static int ftrace_event_enable_disable(struct trace_event_file *file, int enable) { return __ftrace_event_enable_disable(file, enable, 0); } static void ftrace_clear_events(struct trace_array *tr) { struct trace_event_file *file; mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { ftrace_event_enable_disable(file, 0); } mutex_unlock(&event_mutex); } static void event_filter_pid_sched_process_exit(void *data, struct task_struct *task) { struct trace_pid_list *pid_list; struct trace_array *tr = data; pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); pid_list = rcu_dereference_raw(tr->filtered_no_pids); trace_filter_add_remove_task(pid_list, NULL, task); } static void event_filter_pid_sched_process_fork(void *data, struct task_struct *self, struct task_struct *task) { struct trace_pid_list *pid_list; struct trace_array *tr = data; pid_list = rcu_dereference_sched(tr->filtered_pids); trace_filter_add_remove_task(pid_list, self, task); pid_list = rcu_dereference_sched(tr->filtered_no_pids); trace_filter_add_remove_task(pid_list, self, task); } void trace_event_follow_fork(struct trace_array *tr, bool enable) { if (enable) { register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, tr, INT_MIN); register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit, tr, INT_MAX); } else { unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, tr); unregister_trace_sched_process_free(event_filter_pid_sched_process_exit, tr); } } static void event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; bool ret; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); /* * Sched switch is funny, as we only want to ignore it * in the notrace case if both prev and next should be ignored. */ ret = trace_ignore_this_task(NULL, no_pid_list, prev) && trace_ignore_this_task(NULL, no_pid_list, next); this_cpu_write(tr->array_buffer.data->ignore_pid, ret || (trace_ignore_this_task(pid_list, NULL, prev) && trace_ignore_this_task(pid_list, NULL, next))); } static void event_filter_pid_sched_switch_probe_post(void *data, bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, next)); } static void event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are already tracing */ if (!this_cpu_read(tr->array_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, task)); } static void event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) { struct trace_array *tr = data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; /* Nothing to do if we are not tracing */ if (this_cpu_read(tr->array_buffer.data->ignore_pid)) return; pid_list = rcu_dereference_sched(tr->filtered_pids); no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); /* Set tracing if current is enabled */ this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, current)); } static void unregister_pid_events(struct trace_array *tr) { unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); } static void __ftrace_clear_event_pids(struct trace_array *tr, int type) { struct trace_pid_list *pid_list; struct trace_pid_list *no_pid_list; struct trace_event_file *file; int cpu; pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); /* Make sure there's something to do */ if (!pid_type_enabled(type, pid_list, no_pid_list)) return; if (!still_need_pid_events(type, pid_list, no_pid_list)) { unregister_pid_events(tr); list_for_each_entry(file, &tr->events, list) { clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } for_each_possible_cpu(cpu) per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false; } if (type & TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, NULL); if (type & TRACE_NO_PIDS) rcu_assign_pointer(tr->filtered_no_pids, NULL); /* Wait till all users are no longer using pid filtering */ tracepoint_synchronize_unregister(); if ((type & TRACE_PIDS) && pid_list) trace_pid_list_free(pid_list); if ((type & TRACE_NO_PIDS) && no_pid_list) trace_pid_list_free(no_pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr, int type) { mutex_lock(&event_mutex); __ftrace_clear_event_pids(tr, type); mutex_unlock(&event_mutex); } static void __put_system(struct event_subsystem *system) { struct event_filter *filter = system->filter; WARN_ON_ONCE(system_refcount(system) == 0); if (system_refcount_dec(system)) return; list_del(&system->list); if (filter) { kfree(filter->filter_string); kfree(filter); } kfree_const(system->name); kfree(system); } static void __get_system(struct event_subsystem *system) { WARN_ON_ONCE(system_refcount(system) == 0); system_refcount_inc(system); } static void __get_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); dir->ref_count++; __get_system(dir->subsystem); } static void __put_system_dir(struct trace_subsystem_dir *dir) { WARN_ON_ONCE(dir->ref_count == 0); /* If the subsystem is about to be freed, the dir must be too */ WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); __put_system(dir->subsystem); if (!--dir->ref_count) kfree(dir); } static void put_system(struct trace_subsystem_dir *dir) { mutex_lock(&event_mutex); __put_system_dir(dir); mutex_unlock(&event_mutex); } static void remove_subsystem(struct trace_subsystem_dir *dir) { if (!dir) return; if (!--dir->nr_events) { eventfs_remove_dir(dir->ei); list_del(&dir->list); __put_system_dir(dir); } } void event_file_get(struct trace_event_file *file) { refcount_inc(&file->ref); } void event_file_put(struct trace_event_file *file) { if (WARN_ON_ONCE(!refcount_read(&file->ref))) { if (file->flags & EVENT_FILE_FL_FREED) kmem_cache_free(file_cachep, file); return; } if (refcount_dec_and_test(&file->ref)) { /* Count should only go to zero when it is freed */ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) return; kmem_cache_free(file_cachep, file); } } static void remove_event_file_dir(struct trace_event_file *file) { eventfs_remove_dir(file->ei); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); file->flags |= EVENT_FILE_FL_FREED; event_file_put(file); } /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, const char *sub, const char *event, int set) { struct trace_event_file *file; struct trace_event_call *call; const char *name; int ret = -EINVAL; int eret = 0; list_for_each_entry(file, &tr->events, list) { call = file->event_call; name = trace_event_name(call); if (!name || !call->class || !call->class->reg) continue; if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) continue; if (match && strcmp(match, name) != 0 && strcmp(match, call->class->system) != 0) continue; if (sub && strcmp(sub, call->class->system) != 0) continue; if (event && strcmp(event, name) != 0) continue; ret = ftrace_event_enable_disable(file, set); /* * Save the first error and return that. Some events * may still have been enabled, but let the user * know that something went wrong. */ if (ret && !eret) eret = ret; ret = eret; } return ret; } static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, const char *sub, const char *event, int set) { int ret; mutex_lock(&event_mutex); ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); mutex_unlock(&event_mutex); return ret; } int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; int ret; if (!tr) return -ENOENT; /* * The buf format can be <subsystem>:<event-name> * *:<event-name> means any event by that name. * :<event-name> is the same. * * <subsystem>:* means all events in that subsystem * <subsystem>: means the same. * * <name> (no ':') means all events in a subsystem with * the name <name> or any event that matches <name> */ match = strsep(&buf, ":"); if (buf) { sub = match; event = buf; match = NULL; if (!strlen(sub) || strcmp(sub, "*") == 0) sub = NULL; if (!strlen(event) || strcmp(event, "*") == 0) event = NULL; } ret = __ftrace_set_clr_event(tr, match, sub, event, set); /* Put back the colon to allow this to be called again */ if (buf) *(buf - 1) = ':'; return ret; } /** * trace_set_clr_event - enable or disable an event * @system: system name to match (NULL for any system) * @event: event name to match (NULL for all events, within system) * @set: 1 to enable, 0 to disable * * This is a way for other parts of the kernel to enable or disable * event recording. * * Returns 0 on success, -EINVAL if the parameters do not match any * registered events. */ int trace_set_clr_event(const char *system, const char *event, int set) { struct trace_array *tr = top_trace_array(); if (!tr) return -ENODEV; return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); /** * trace_array_set_clr_event - enable or disable an event for a trace array. * @tr: concerned trace array. * @system: system name to match (NULL for any system) * @event: event name to match (NULL for all events, within system) * @enable: true to enable, false to disable * * This is a way for other parts of the kernel to enable or disable * event recording. * * Returns 0 on success, -EINVAL if the parameters do not match any * registered events. */ int trace_array_set_clr_event(struct trace_array *tr, const char *system, const char *event, bool enable) { int set; if (!tr) return -ENOENT; set = (enable == true) ? 1 : 0; return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_array_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; struct seq_file *m = file->private_data; struct trace_array *tr = m->private; ssize_t read, ret; if (!cnt) return 0; ret = tracing_update_buffers(tr); if (ret < 0) return ret; if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { int set = 1; if (*parser.buffer == '!') set = 0; ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; } ret = read; out_put: trace_parser_put(&parser); return ret; } static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_file *file = v; struct trace_event_call *call; struct trace_array *tr = m->private; (*pos)++; list_for_each_entry_continue(file, &tr->events, list) { call = file->event_call; /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->class && call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } return NULL; } static void *t_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = t_next(m, file, &l); if (!file) break; } return file; } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_file *file = v; struct trace_array *tr = m->private; (*pos)++; list_for_each_entry_continue(file, &tr->events, list) { if (file->flags & EVENT_FILE_FL_ENABLED) return file; } return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *file; struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); file = list_entry(&tr->events, struct trace_event_file, list); for (l = 0; l <= *pos; ) { file = s_next(m, file, &l); if (!file) break; } return file; } static int t_show(struct seq_file *m, void *v) { struct trace_event_file *file = v; struct trace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); seq_printf(m, "%s\n", trace_event_name(call)); return 0; } static void t_stop(struct seq_file *m, void *p) { mutex_unlock(&event_mutex); } static void * __next(struct seq_file *m, void *v, loff_t *pos, int type) { struct trace_array *tr = m->private; struct trace_pid_list *pid_list; if (type == TRACE_PIDS) pid_list = rcu_dereference_sched(tr->filtered_pids); else pid_list = rcu_dereference_sched(tr->filtered_no_pids); return trace_pid_next(pid_list, v, pos); } static void * p_next(struct seq_file *m, void *v, loff_t *pos) { return __next(m, v, pos, TRACE_PIDS); } static void * np_next(struct seq_file *m, void *v, loff_t *pos) { return __next(m, v, pos, TRACE_NO_PIDS); } static void *__start(struct seq_file *m, loff_t *pos, int type) __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; /* * Grab the mutex, to keep calls to p_next() having the same * tr->filtered_pids as p_start() has. * If we just passed the tr->filtered_pids around, then RCU would * have been enough, but doing that makes things more complex. */ mutex_lock(&event_mutex); rcu_read_lock_sched(); if (type == TRACE_PIDS) pid_list = rcu_dereference_sched(tr->filtered_pids); else pid_list = rcu_dereference_sched(tr->filtered_no_pids); if (!pid_list) return NULL; return trace_pid_start(pid_list, pos); } static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { return __start(m, pos, TRACE_PIDS); } static void *np_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { return __start(m, pos, TRACE_NO_PIDS); } static void p_stop(struct seq_file *m, void *p) __releases(RCU) { rcu_read_unlock_sched(); mutex_unlock(&event_mutex); } static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; unsigned long flags; char buf[4] = "0"; mutex_lock(&event_mutex); file = event_file_file(filp); if (likely(file)) flags = file->flags; mutex_unlock(&event_mutex); if (!file) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); if (flags & EVENT_FILE_FL_SOFT_DISABLED || flags & EVENT_FILE_FL_SOFT_MODE) strcat(buf, "*"); strcat(buf, "\n"); return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; unsigned long val; int ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; switch (val) { case 0: case 1: ret = -ENODEV; mutex_lock(&event_mutex); file = event_file_file(filp); if (likely(file)) { ret = tracing_update_buffers(file->tr); if (ret < 0) { mutex_unlock(&event_mutex); return ret; } ret = ftrace_event_enable_disable(file, val); } mutex_unlock(&event_mutex); break; default: return -EINVAL; } *ppos += cnt; return ret ? ret : cnt; } static ssize_t system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_event_call *call; struct trace_event_file *file; struct trace_array *tr = dir->tr; char buf[2]; int set = 0; int ret; mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) continue; /* * We need to find out if all the events are set * or if all events or cleared, or if we have * a mixture. */ set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED)); /* * If we have a mixture, no need to look further. */ if (set == 3) break; } mutex_unlock(&event_mutex); buf[0] = set_to_char[set]; buf[1] = '\n'; ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); return ret; } static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; ssize_t ret; ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; ret = tracing_update_buffers(dir->tr); if (ret < 0) return ret; if (val != 0 && val != 1) return -EINVAL; /* * Opening of "enable" adds a ref count to system, * so the name is safe to use. */ if (system) name = system->name; ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); if (ret) goto out; ret = cnt; out: *ppos += cnt; return ret; } enum { FORMAT_HEADER = 1, FORMAT_FIELD_SEPERATOR = 2, FORMAT_PRINTFMT = 3, }; static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_event_file *file = event_file_data(m->private); struct trace_event_call *call = file->event_call; struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: node = common_head; break; case FORMAT_FIELD_SEPERATOR: node = head; break; case FORMAT_PRINTFMT: /* all done */ return NULL; } node = node->prev; if (node == common_head) return (void *)FORMAT_FIELD_SEPERATOR; else if (node == head) return (void *)FORMAT_PRINTFMT; else return node; } static int f_show(struct seq_file *m, void *v) { struct trace_event_file *file = event_file_data(m->private); struct trace_event_call *call = file->event_call; struct ftrace_event_field *field; const char *array_descriptor; switch ((unsigned long)v) { case FORMAT_HEADER: seq_printf(m, "name: %s\n", trace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); seq_puts(m, "format:\n"); return 0; case FORMAT_FIELD_SEPERATOR: seq_putc(m, '\n'); return 0; case FORMAT_PRINTFMT: seq_printf(m, "\nprint fmt: %s\n", call->print_fmt); return 0; } field = list_entry(v, struct ftrace_event_field, link); /* * Smartly shows the array type(except dynamic array). * Normal: * field:TYPE VAR * If TYPE := TYPE[LEN], it is shown: * field:TYPE VAR[LEN] */ array_descriptor = strchr(field->type, '['); if (str_has_prefix(field->type, "__data_loc")) array_descriptor = NULL; if (!array_descriptor) seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); else if (field->len) seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, field->len, field->offset, field->size, !!field->is_signed); else seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, field->offset, field->size, !!field->is_signed); return 0; } static void *f_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *file; void *p = (void *)FORMAT_HEADER; loff_t l = 0; /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); file = event_file_file(m->private); if (!file) return ERR_PTR(-ENODEV); while (l < *pos && p) p = f_next(m, p, &l); return p; } static void f_stop(struct seq_file *m, void *p) { mutex_unlock(&event_mutex); } static const struct seq_operations trace_format_seq_ops = { .start = f_start, .next = f_next, .stop = f_stop, .show = f_show, }; static int trace_format_open(struct inode *inode, struct file *file) { struct seq_file *m; int ret; /* Do we want to hide event format files on tracefs lockdown? */ ret = seq_open(file, &trace_format_seq_ops); if (ret < 0) return ret; m = file->private_data; m->private = file; return 0; } #ifdef CONFIG_PERF_EVENTS static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { int id = (long)event_file_data(filp); char buf[32]; int len; if (unlikely(!id)) return -ENODEV; len = sprintf(buf, "%d\n", id); return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } #endif static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; struct trace_seq *s; int r = -ENODEV; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); mutex_lock(&event_mutex); file = event_file_file(filp); if (file) print_event_filter(file, s); mutex_unlock(&event_mutex); if (file) r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_event_file *file; char *buf; int err = -ENODEV; if (cnt >= PAGE_SIZE) return -EINVAL; buf = memdup_user_nul(ubuf, cnt); if (IS_ERR(buf)) return PTR_ERR(buf); mutex_lock(&event_mutex); file = event_file_file(filp); if (file) { if (file->flags & EVENT_FILE_FL_FREED) err = -ENODEV; else err = apply_event_filter(file, buf); } mutex_unlock(&event_mutex); kfree(buf); if (err < 0) return err; *ppos += cnt; return cnt; } static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct trace_subsystem_dir *dir = NULL, *iter_dir; struct trace_array *tr = NULL, *iter_tr; struct event_subsystem *system = NULL; int ret; if (tracing_is_disabled()) return -ENODEV; /* Make sure the system still exists */ mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) { list_for_each_entry(iter_dir, &iter_tr->systems, list) { if (iter_dir == inode->i_private) { /* Don't open systems with no events */ tr = iter_tr; dir = iter_dir; if (dir->nr_events) { __get_system_dir(dir); system = dir->subsystem; } goto exit_loop; } } } exit_loop: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); if (!system) return -ENODEV; /* Still need to increment the ref count of the system */ if (trace_array_get(tr) < 0) { put_system(dir); return -ENODEV; } ret = tracing_open_generic(inode, filp); if (ret < 0) { trace_array_put(tr); put_system(dir); } return ret; } static int system_tr_open(struct inode *inode, struct file *filp) { struct trace_subsystem_dir *dir; struct trace_array *tr = inode->i_private; int ret; /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); if (!dir) return -ENOMEM; ret = tracing_open_generic_tr(inode, filp); if (ret < 0) { kfree(dir); return ret; } dir->tr = tr; filp->private_data = dir; return 0; } static int subsystem_release(struct inode *inode, struct file *file) { struct trace_subsystem_dir *dir = file->private_data; trace_array_put(dir->tr); /* * If dir->subsystem is NULL, then this is a temporary * descriptor that was made for a trace_array to enable * all subsystems. */ if (dir->subsystem) put_system(dir); else kfree(dir); return 0; } static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_subsystem_dir *dir = filp->private_data; char *buf; int err; if (cnt >= PAGE_SIZE) return -EINVAL; buf = memdup_user_nul(ubuf, cnt); if (IS_ERR(buf)) return PTR_ERR(buf); err = apply_subsystem_event_filter(dir, buf); kfree(buf); if (err < 0) return err; *ppos += cnt; return cnt; } static ssize_t show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); ring_buffer_print_page_header(tr->array_buffer.buffer, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static ssize_t show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_seq *s; int r; if (*ppos) return 0; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); ring_buffer_print_entry_header(s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); kfree(s); return r; } static void ignore_task_cpu(void *data) { struct trace_array *tr = data; struct trace_pid_list *pid_list; struct trace_pid_list *no_pid_list; /* * This function is called by on_each_cpu() while the * event_mutex is held. */ pid_list = rcu_dereference_protected(tr->filtered_pids, mutex_is_locked(&event_mutex)); no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, mutex_is_locked(&event_mutex)); this_cpu_write(tr->array_buffer.data->ignore_pid, trace_ignore_this_task(pid_list, no_pid_list, current)); } static void register_pid_events(struct trace_array *tr) { /* * Register a probe that is called before all other probes * to set ignore_pid if next or prev do not match. * Register a probe this is called after all other probes * to only keep ignore_pid set if next pid matches. */ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, tr, INT_MAX); register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, tr, 0); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr, 0); register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr, 0); register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr, INT_MAX); register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr, 0); } static ssize_t event_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos, int type) { struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; struct trace_pid_list *other_pids = NULL; struct trace_pid_list *pid_list; struct trace_event_file *file; ssize_t ret; if (!cnt) return 0; ret = tracing_update_buffers(tr); if (ret < 0) return ret; mutex_lock(&event_mutex); if (type == TRACE_PIDS) { filtered_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); other_pids = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); } else { filtered_pids = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); other_pids = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) goto out; if (type == TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, pid_list); else rcu_assign_pointer(tr->filtered_no_pids, pid_list); list_for_each_entry(file, &tr->events, list) { set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } if (filtered_pids) { tracepoint_synchronize_unregister(); trace_pid_list_free(filtered_pids); } else if (pid_list && !other_pids) { register_pid_events(tr); } /* * Ignoring of pids is done at task switch. But we have to * check for those tasks that are currently running. * Always do this in case a pid was appended or removed. */ on_each_cpu(ignore_task_cpu, tr, 1); out: mutex_unlock(&event_mutex); if (ret > 0) *ppos += ret; return ret; } static ssize_t ftrace_event_pid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { return event_pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS); } static ssize_t ftrace_event_npid_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { return event_pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS); } static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); static int ftrace_event_set_npid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, .show = t_show, .stop = t_stop, }; static const struct seq_operations show_set_event_seq_ops = { .start = s_start, .next = s_next, .show = t_show, .stop = t_stop, }; static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, .show = trace_pid_show, .stop = p_stop, }; static const struct seq_operations show_set_no_pid_seq_ops = { .start = np_start, .next = np_next, .show = trace_pid_show, .stop = p_stop, }; static const struct file_operations ftrace_avail_fops = { .open = ftrace_event_avail_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_set_open, .read = seq_read, .write = ftrace_event_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_set_event_pid_fops = { .open = ftrace_event_set_pid_open, .read = seq_read, .write = ftrace_event_pid_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_set_event_notrace_pid_fops = { .open = ftrace_event_set_npid_open, .read = seq_read, .write = ftrace_event_npid_write, .llseek = seq_lseek, .release = ftrace_event_release, }; static const struct file_operations ftrace_enable_fops = { .open = tracing_open_file_tr, .read = event_enable_read, .write = event_enable_write, .release = tracing_release_file_tr, .llseek = default_llseek, }; static const struct file_operations ftrace_event_format_fops = { .open = trace_format_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; #ifdef CONFIG_PERF_EVENTS static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, .llseek = default_llseek, }; #endif static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_file_tr, .read = event_filter_read, .write = event_filter_write, .release = tracing_release_file_tr, .llseek = default_llseek, }; static const struct file_operations ftrace_subsystem_filter_fops = { .open = subsystem_open, .read = subsystem_filter_read, .write = subsystem_filter_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_system_enable_fops = { .open = subsystem_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_tr_enable_fops = { .open = system_tr_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, .release = subsystem_release, }; static const struct file_operations ftrace_show_header_page_fops = { .open = tracing_open_generic_tr, .read = show_header_page_file, .llseek = default_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations ftrace_show_header_event_fops = { .open = tracing_open_generic_tr, .read = show_header_event_file, .llseek = default_llseek, .release = tracing_release_generic_tr, }; static int ftrace_event_open(struct inode *inode, struct file *file, const struct seq_operations *seq_ops) { struct seq_file *m; int ret; ret = security_locked_down(LOCKDOWN_TRACEFS); if (ret) return ret; ret = seq_open(file, seq_ops); if (ret < 0) return ret; m = file->private_data; /* copy tr over to seq ops */ m->private = inode->i_private; return ret; } static int ftrace_event_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; trace_array_put(tr); return seq_release(inode, file); } static int ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; /* Checks for tracefs lockdown */ return ftrace_event_open(inode, file, seq_ops); } static int ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_events(tr); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static int ftrace_event_set_pid_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_pid_seq_ops; struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_event_pids(tr, TRACE_PIDS); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static int ftrace_event_set_npid_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_no_pid_seq_ops; struct trace_array *tr = inode->i_private; int ret; ret = tracing_check_open_get_tr(tr); if (ret) return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) ftrace_clear_event_pids(tr, TRACE_NO_PIDS); ret = ftrace_event_open(inode, file, seq_ops); if (ret < 0) trace_array_put(tr); return ret; } static struct event_subsystem * create_new_subsystem(const char *name) { struct event_subsystem *system; /* need to create new entry */ system = kmalloc(sizeof(*system), GFP_KERNEL); if (!system) return NULL; system->ref_count = 1; /* Only allocate if dynamic (kprobes and modules) */ system->name = kstrdup_const(name, GFP_KERNEL); if (!system->name) goto out_free; system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); if (!system->filter) goto out_free; list_add(&system->list, &event_subsystems); return system; out_free: kfree_const(system->name); kfree(system); return NULL; } static int system_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { if (strcmp(name, "filter") == 0) *fops = &ftrace_subsystem_filter_fops; else if (strcmp(name, "enable") == 0) *fops = &ftrace_system_enable_fops; else return 0; *mode = TRACE_MODE_WRITE; return 1; } static struct eventfs_inode * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct eventfs_inode *parent) { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; struct eventfs_inode *ei; int nr_entries; static struct eventfs_entry system_entries[] = { { .name = "filter", .callback = system_callback, }, { .name = "enable", .callback = system_callback, } }; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { system = dir->subsystem; if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; return dir->ei; } } /* Now see if the system itself exists. */ system = NULL; list_for_each_entry(iter, &event_subsystems, list) { if (strcmp(iter->name, name) == 0) { system = iter; break; } } dir = kmalloc(sizeof(*dir), GFP_KERNEL); if (!dir) goto out_fail; if (!system) { system = create_new_subsystem(name); if (!system) goto out_free; } else __get_system(system); /* ftrace only has directories no files */ if (strcmp(name, "ftrace") == 0) nr_entries = 0; else nr_entries = ARRAY_SIZE(system_entries); ei = eventfs_create_dir(name, parent, system_entries, nr_entries, dir); if (IS_ERR(ei)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } dir->ei = ei; dir->tr = tr; dir->ref_count = 1; dir->nr_events = 1; dir->subsystem = system; file->system = dir; list_add(&dir->list, &tr->systems); return dir->ei; out_free: kfree(dir); out_fail: /* Only print this message if failed on memory allocation */ if (!dir || !system) pr_warn("No memory to create event subsystem %s\n", name); return NULL; } static int event_define_fields(struct trace_event_call *call) { struct list_head *head; int ret = 0; /* * Other events may have the same class. Only update * the fields if they are not already defined. */ head = trace_get_fields(call); if (list_empty(head)) { struct trace_event_fields *field = call->class->fields_array; unsigned int offset = sizeof(struct trace_entry); for (; field->type; field++) { if (field->type == TRACE_FUNCTION_TYPE) { field->define_fields(call); break; } offset = ALIGN(offset, field->align); ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, field->is_signed, field->filter_type, field->len); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; } offset += field->size; } } return ret; } static int event_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { struct trace_event_file *file = *data; struct trace_event_call *call = file->event_call; if (strcmp(name, "format") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_event_format_fops; return 1; } /* * Only event directories that can be enabled should have * triggers or filters, with the exception of the "print" * event that can have a "trigger" file. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { if (call->class->reg && strcmp(name, "enable") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_enable_fops; return 1; } if (strcmp(name, "filter") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_event_filter_fops; return 1; } } if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || strcmp(trace_event_name(call), "print") == 0) { if (strcmp(name, "trigger") == 0) { *mode = TRACE_MODE_WRITE; *fops = &event_trigger_fops; return 1; } } #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg && strcmp(name, "id") == 0) { *mode = TRACE_MODE_READ; *data = (void *)(long)call->event.type; *fops = &ftrace_event_id_fops; return 1; } #endif #ifdef CONFIG_HIST_TRIGGERS if (strcmp(name, "hist") == 0) { *mode = TRACE_MODE_READ; *fops = &event_hist_fops; return 1; } #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG if (strcmp(name, "hist_debug") == 0) { *mode = TRACE_MODE_READ; *fops = &event_hist_debug_fops; return 1; } #endif #ifdef CONFIG_TRACE_EVENT_INJECT if (call->event.type && call->class->reg && strcmp(name, "inject") == 0) { *mode = 0200; *fops = &event_inject_fops; return 1; } #endif return 0; } /* The file is incremented on creation and freeing the enable file decrements it */ static void event_release(const char *name, void *data) { struct trace_event_file *file = data; event_file_put(file); } static int event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; struct eventfs_inode *e_events; struct eventfs_inode *ei; const char *name; int nr_entries; int ret; static struct eventfs_entry event_entries[] = { { .name = "enable", .callback = event_callback, .release = event_release, }, { .name = "filter", .callback = event_callback, }, { .name = "trigger", .callback = event_callback, }, { .name = "format", .callback = event_callback, }, #ifdef CONFIG_PERF_EVENTS { .name = "id", .callback = event_callback, }, #endif #ifdef CONFIG_HIST_TRIGGERS { .name = "hist", .callback = event_callback, }, #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG { .name = "hist_debug", .callback = event_callback, }, #endif #ifdef CONFIG_TRACE_EVENT_INJECT { .name = "inject", .callback = event_callback, }, #endif }; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". This should * never happen. */ if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) return -ENODEV; e_events = event_subsystem_dir(tr, call->class->system, file, parent); if (!e_events) return -ENOMEM; nr_entries = ARRAY_SIZE(event_entries); name = trace_event_name(call); ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file); if (IS_ERR(ei)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } file->ei = ei; ret = event_define_fields(call); if (ret < 0) { pr_warn("Could not initialize trace point events/%s\n", name); return ret; } /* Gets decremented on freeing of the "enable" file */ event_file_get(file); return 0; } static void remove_event_from_tracers(struct trace_event_call *call) { struct trace_event_file *file; struct trace_array *tr; do_for_each_event_file_safe(tr, file) { if (file->event_call != call) continue; remove_event_file_dir(file); /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); } static void event_remove(struct trace_event_call *call) { struct trace_array *tr; struct trace_event_file *file; do_for_each_event_file(tr, file) { if (file->event_call != call) continue; if (file->flags & EVENT_FILE_FL_WAS_ENABLED) tr->clear_trace = true; ftrace_event_enable_disable(file, 0); /* * The do_for_each_event_file() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); if (call->event.funcs) __unregister_trace_event(&call->event); remove_event_from_tracers(call); list_del(&call->list); } static int event_init(struct trace_event_call *call) { int ret = 0; const char *name; name = trace_event_name(call); if (WARN_ON(!name)) return -EINVAL; if (call->class->raw_init) { ret = call->class->raw_init(call); if (ret < 0 && ret != -ENOSYS) pr_warn("Could not initialize trace events/%s\n", name); } return ret; } static int __register_event(struct trace_event_call *call, struct module *mod) { int ret; ret = event_init(call); if (ret < 0) return ret; list_add(&call->list, &ftrace_events); if (call->flags & TRACE_EVENT_FL_DYNAMIC) atomic_set(&call->refcnt, 0); else call->module = mod; return 0; } static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; /* Find the length of the eval value as a string */ elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) return NULL; snprintf(ptr, elen + 1, "%ld", map->eval_value); /* Get the rest of the string of ptr */ rlen = strlen(ptr + len); memmove(ptr + elen, ptr + len, rlen); /* Make sure we end the new string */ ptr[elen + rlen] = 0; return ptr + elen; } static void update_event_printk(struct trace_event_call *call, struct trace_eval_map *map) { char *ptr; int quote = 0; int len = strlen(map->eval_string); for (ptr = call->print_fmt; *ptr; ptr++) { if (*ptr == '\\') { ptr++; /* paranoid */ if (!*ptr) break; continue; } if (*ptr == '"') { quote ^= 1; continue; } if (quote) continue; if (isdigit(*ptr)) { /* skip numbers */ do { ptr++; /* Check for alpha chars like ULL */ } while (isalnum(*ptr)); if (!*ptr) break; /* * A number must have some kind of delimiter after * it, and we can ignore that too. */ continue; } if (isalpha(*ptr) || *ptr == '_') { if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { ptr = eval_replace(ptr, map, len); /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) return; /* * No need to decrement here, as eval_replace() * returns the pointer to the character passed * the eval, and two evals can not be placed * back to back without something in between. * We can skip that something in between. */ continue; } skip_more: do { ptr++; } while (isalnum(*ptr) || *ptr == '_'); if (!*ptr) break; /* * If what comes after this variable is a '.' or * '->' then we can continue to ignore that string. */ if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { ptr += *ptr == '.' ? 1 : 2; if (!*ptr) break; goto skip_more; } /* * Once again, we can skip the delimiter that came * after the string. */ continue; } } } static void add_str_to_module(struct module *module, char *str) { struct module_string *modstr; modstr = kmalloc(sizeof(*modstr), GFP_KERNEL); /* * If we failed to allocate memory here, then we'll just * let the str memory leak when the module is removed. * If this fails to allocate, there's worse problems than * a leaked string on module removal. */ if (WARN_ON_ONCE(!modstr)) return; modstr->module = module; modstr->str = str; list_add(&modstr->next, &module_strings); } static void update_event_fields(struct trace_event_call *call, struct trace_eval_map *map) { struct ftrace_event_field *field; struct list_head *head; char *ptr; char *str; int len = strlen(map->eval_string); /* Dynamic events should never have field maps */ if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) return; head = trace_get_fields(call); list_for_each_entry(field, head, link) { ptr = strchr(field->type, '['); if (!ptr) continue; ptr++; if (!isalpha(*ptr) && *ptr != '_') continue; if (strncmp(map->eval_string, ptr, len) != 0) continue; str = kstrdup(field->type, GFP_KERNEL); if (WARN_ON_ONCE(!str)) return; ptr = str + (ptr - field->type); ptr = eval_replace(ptr, map, len); /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) { kfree(str); continue; } /* * If the event is part of a module, then we need to free the string * when the module is removed. Otherwise, it will stay allocated * until a reboot. */ if (call->module) add_str_to_module(call->module, str); field->type = str; } } void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; bool first = false; int last_i; int i; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { first = true; last_i = 0; last_system = call->class->system; } /* * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the * previous call is high. As an optimization, we skip searching * for a map[] that matches the call's system if the last call * was from the same system. That's what last_i is for. If the * call has the same system as the previous call, then last_i * will be the index of the first map[] that has a matching * system. */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ if (first) { last_i = i; first = false; } update_event_printk(call, map[i]); update_event_fields(call, map[i]); } } cond_resched(); } up_write(&trace_event_sem); } static bool event_in_systems(struct trace_event_call *call, const char *systems) { const char *system; const char *p; if (!systems) return true; system = call->class->system; p = strstr(systems, system); if (!p) return false; if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',') return false; p += strlen(system); return !*p || isspace(*p) || *p == ','; } static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; struct trace_event_file *file; unsigned int first; if (!event_in_systems(call, tr->system_names)) return NULL; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return ERR_PTR(-ENOMEM); pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, lockdep_is_held(&event_mutex)); if (!trace_pid_list_first(pid_list, &first) || !trace_pid_list_first(no_pid_list, &first)) file->flags |= EVENT_FILE_FL_PID_FILTER; file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); refcount_set(&file->ref, 1); return file; } #define MAX_BOOT_TRIGGERS 32 static struct boot_triggers { const char *event; char *trigger; } bootup_triggers[MAX_BOOT_TRIGGERS]; static char bootup_trigger_buf[COMMAND_LINE_SIZE]; static int nr_boot_triggers; static __init int setup_trace_triggers(char *str) { char *trigger; char *buf; int i; strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event triggers"); buf = bootup_trigger_buf; for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { trigger = strsep(&buf, ","); if (!trigger) break; bootup_triggers[i].event = strsep(&trigger, "."); bootup_triggers[i].trigger = trigger; if (!bootup_triggers[i].trigger) break; } nr_boot_triggers = i; return 1; } __setup("trace_trigger=", setup_trace_triggers); /* Add an event to a trace directory */ static int __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; file = trace_create_new_event(call, tr); /* * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed * allocation, or NULL if the event is not part of the tr->system_names. * When the event is not part of the tr->system_names, return zero, not * an error. */ if (!file) return 0; if (IS_ERR(file)) return PTR_ERR(file); if (eventdir_initialized) return event_create_dir(tr->event_dir, file); else return event_define_fields(call); } static void trace_early_triggers(struct trace_event_file *file, const char *name) { int ret; int i; for (i = 0; i < nr_boot_triggers; i++) { if (strcmp(name, bootup_triggers[i].event)) continue; mutex_lock(&event_mutex); ret = trigger_process_regex(file, bootup_triggers[i].trigger); mutex_unlock(&event_mutex); if (ret) pr_err("Failed to register trigger '%s' on event %s\n", bootup_triggers[i].trigger, bootup_triggers[i].event); } } /* * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ static int __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { struct trace_event_file *file; int ret; file = trace_create_new_event(call, tr); /* * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed * allocation, or NULL if the event is not part of the tr->system_names. * When the event is not part of the tr->system_names, return zero, not * an error. */ if (!file) return 0; if (IS_ERR(file)) return PTR_ERR(file); ret = event_define_fields(call); if (ret) return ret; trace_early_triggers(file, trace_event_name(call)); return 0; } struct ftrace_module_file_ops; static void __add_event_to_tracers(struct trace_event_call *call); /* Add an additional event_call dynamically */ int trace_add_event_call(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); mutex_lock(&trace_types_lock); ret = __register_event(call, NULL); if (ret >= 0) __add_event_to_tracers(call); mutex_unlock(&trace_types_lock); return ret; } EXPORT_SYMBOL_GPL(trace_add_event_call); /* * Must be called under locking of trace_types_lock, event_mutex and * trace_event_sem. */ static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); } static int probe_remove_event_call(struct trace_event_call *call) { struct trace_array *tr; struct trace_event_file *file; #ifdef CONFIG_PERF_EVENTS if (call->perf_refcount) return -EBUSY; #endif do_for_each_event_file(tr, file) { if (file->event_call != call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) goto busy; if (file->flags & EVENT_FILE_FL_WAS_ENABLED) tr->clear_trace = true; /* * The do_for_each_event_file_safe() is * a double loop. After finding the call for this * trace_array, we use break to jump to the next * trace_array. */ break; } while_for_each_event_file(); __trace_remove_event_call(call); return 0; busy: /* No need to clear the trace now */ list_for_each_entry(tr, &ftrace_trace_arrays, list) { tr->clear_trace = false; } return -EBUSY; } /* Remove an event_call */ int trace_remove_event_call(struct trace_event_call *call) { int ret; lockdep_assert_held(&event_mutex); mutex_lock(&trace_types_lock); down_write(&trace_event_sem); ret = probe_remove_event_call(call); up_write(&trace_event_sem); mutex_unlock(&trace_types_lock); return ret; } EXPORT_SYMBOL_GPL(trace_remove_event_call); #define for_each_event(event, start, end) \ for (event = start; \ (unsigned long)event < (unsigned long)end; \ event++) #ifdef CONFIG_MODULES static void trace_module_add_events(struct module *mod) { struct trace_event_call **call, **start, **end; if (!mod->num_trace_events) return; /* Don't add infrastructure for mods without tracepoints */ if (trace_module_has_bad_taint(mod)) { pr_err("%s: module has bad taint, not creating trace events\n", mod->name); return; } start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; for_each_event(call, start, end) { __register_event(*call, mod); __add_event_to_tracers(*call); } } static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; struct module_string *modstr, *m; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module) continue; if (call->module == mod) __trace_remove_event_call(call); } /* Check for any strings allocade for this module */ list_for_each_entry_safe(modstr, m, &module_strings, next) { if (modstr->module != mod) continue; list_del(&modstr->next); kfree(modstr->str); kfree(modstr); } up_write(&trace_event_sem); /* * It is safest to reset the ring buffer if the module being unloaded * registered any events that were used. The only worry is if * a new module gets loaded, and takes on the same id as the events * of this module. When printing out the buffer, traced events left * over from this module may be passed to the new module events and * unexpected results may occur. */ tracing_reset_all_online_cpus_unlocked(); } static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); switch (val) { case MODULE_STATE_COMING: trace_module_add_events(mod); break; case MODULE_STATE_GOING: trace_module_remove_events(mod); break; } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); return NOTIFY_OK; } static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 1, /* higher than trace.c module notify */ }; #endif /* CONFIG_MODULES */ /* Create a new event directory structure for a trace directory. */ static void __trace_add_event_dirs(struct trace_array *tr) { struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create directory for event %s\n", trace_event_name(call)); } } /* Returns any file that matches the system and event */ struct trace_event_file * __find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; struct trace_event_call *call; const char *name; list_for_each_entry(file, &tr->events, list) { call = file->event_call; name = trace_event_name(call); if (!name || !call->class) continue; if (strcmp(event, name) == 0 && strcmp(system, call->class->system) == 0) return file; } return NULL; } /* Returns valid trace event files that match system and event */ struct trace_event_file * find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; file = __find_event_file(tr, system, event); if (!file || !file->event_call->class->reg || file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) return NULL; return file; } /** * trace_get_event_file - Find and return a trace event file * @instance: The name of the trace instance containing the event * @system: The name of the system containing the event * @event: The name of the event * * Return a trace event file given the trace instance name, trace * system, and trace event name. If the instance name is NULL, it * refers to the top-level trace array. * * This function will look it up and return it if found, after calling * trace_array_get() to prevent the instance from going away, and * increment the event's module refcount to prevent it from being * removed. * * To release the file, call trace_put_event_file(), which will call * trace_array_put() and decrement the event's module refcount. * * Return: The trace event on success, ERR_PTR otherwise. */ struct trace_event_file *trace_get_event_file(const char *instance, const char *system, const char *event) { struct trace_array *tr = top_trace_array(); struct trace_event_file *file = NULL; int ret = -EINVAL; if (instance) { tr = trace_array_find_get(instance); if (!tr) return ERR_PTR(-ENOENT); } else { ret = trace_array_get(tr); if (ret) return ERR_PTR(ret); } mutex_lock(&event_mutex); file = find_event_file(tr, system, event); if (!file) { trace_array_put(tr); ret = -EINVAL; goto out; } /* Don't let event modules unload while in use */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); ret = -EBUSY; goto out; } ret = 0; out: mutex_unlock(&event_mutex); if (ret) file = ERR_PTR(ret); return file; } EXPORT_SYMBOL_GPL(trace_get_event_file); /** * trace_put_event_file - Release a file from trace_get_event_file() * @file: The trace event file * * If a file was retrieved using trace_get_event_file(), this should * be called when it's no longer needed. It will cancel the previous * trace_array_get() called by that function, and decrement the * event's module refcount. */ void trace_put_event_file(struct trace_event_file *file) { mutex_lock(&event_mutex); trace_event_put_ref(file->event_call); mutex_unlock(&event_mutex); trace_array_put(file->tr); } EXPORT_SYMBOL_GPL(trace_put_event_file); #ifdef CONFIG_DYNAMIC_FTRACE /* Avoid typos */ #define ENABLE_EVENT_STR "enable_event" #define DISABLE_EVENT_STR "disable_event" struct event_probe_data { struct trace_event_file *file; unsigned long count; int ref; bool enable; }; static void update_event_probe(struct event_probe_data *data) { if (data->enable) clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); else set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); } static void event_enable_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; edata = *pdata; update_event_probe(edata); } static void event_enable_count_probe(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (!pdata || !*pdata) return; edata = *pdata; if (!edata->count) return; /* Skip if the event is in a state we want to switch to */ if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; if (edata->count != -1) (edata->count)--; update_event_probe(edata); } static int event_enable_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; void **pdata; pdata = ftrace_func_mapper_find_ip(mapper, ip); if (WARN_ON_ONCE(!pdata || !*pdata)) return 0; edata = *pdata; seq_printf(m, "%ps:", (void *)ip); seq_printf(m, "%s:%s:%s", edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, edata->file->event_call->class->system, trace_event_name(edata->file->event_call)); if (edata->count == -1) seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", edata->count); return 0; } static int event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *init_data, void **data) { struct ftrace_func_mapper *mapper = *data; struct event_probe_data *edata = init_data; int ret; if (!mapper) { mapper = allocate_ftrace_func_mapper(); if (!mapper) return -ENODEV; *data = mapper; } ret = ftrace_func_mapper_add_ip(mapper, ip, edata); if (ret < 0) return ret; edata->ref++; return 0; } static int free_probe_data(void *data) { struct event_probe_data *edata = data; edata->ref--; if (!edata->ref) { /* Remove the SOFT_MODE flag */ __ftrace_event_enable_disable(edata->file, 0, 1); trace_event_put_ref(edata->file->event_call); kfree(edata); } return 0; } static void event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *data) { struct ftrace_func_mapper *mapper = data; struct event_probe_data *edata; if (!ip) { if (!mapper) return; free_ftrace_func_mapper(mapper, free_probe_data); return; } edata = ftrace_func_mapper_remove_ip(mapper, ip); if (WARN_ON_ONCE(!edata)) return; if (WARN_ON_ONCE(edata->ref <= 0)) return; free_probe_data(edata); } static struct ftrace_probe_ops event_enable_probe_ops = { .func = event_enable_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_enable_count_probe_ops = { .func = event_enable_count_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_disable_probe_ops = { .func = event_enable_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static struct ftrace_probe_ops event_disable_count_probe_ops = { .func = event_enable_count_probe, .print = event_enable_print, .init = event_enable_init, .free = event_enable_free, }; static int event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; const char *system; const char *event; char *number; bool enable; int ret; if (!tr) return -ENODEV; /* hash funcs only work with set_ftrace_filter */ if (!enabled || !param) return -EINVAL; system = strsep(¶m, ":"); if (!param) return -EINVAL; event = strsep(¶m, ":"); mutex_lock(&event_mutex); ret = -EINVAL; file = find_event_file(tr, system, event); if (!file) goto out; enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; if (enable) ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; else ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; if (glob[0] == '!') { ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); goto out; } ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; data->enable = enable; data->count = -1; data->file = file; if (!param) goto out_reg; number = strsep(¶m, ":"); ret = -EINVAL; if (!strlen(number)) goto out_free; /* * We use the callback data field (which is a pointer) * as our counter. */ ret = kstrtoul(number, 0, &data->count); if (ret) goto out_free; out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { ret = -EBUSY; goto out_free; } ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ if (!ret) { ret = -ENOENT; goto out_disable; } else if (ret < 0) goto out_disable; /* Just return zero, not the number of enabled functions */ ret = 0; out: mutex_unlock(&event_mutex); return ret; out_disable: __ftrace_event_enable_disable(file, 0, 1); out_put: trace_event_put_ref(file->event_call); out_free: kfree(data); goto out; } static struct ftrace_func_command event_enable_cmd = { .name = ENABLE_EVENT_STR, .func = event_enable_func, }; static struct ftrace_func_command event_disable_cmd = { .name = DISABLE_EVENT_STR, .func = event_enable_func, }; static __init int register_event_cmds(void) { int ret; ret = register_ftrace_command(&event_enable_cmd); if (WARN_ON(ret < 0)) return ret; ret = register_ftrace_command(&event_disable_cmd); if (WARN_ON(ret < 0)) unregister_ftrace_command(&event_enable_cmd); return ret; } #else static inline int register_event_cmds(void) { return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* * The top level array and trace arrays created by boot-time tracing * have already had its trace_event_file descriptors created in order * to allow for early events to be recorded. * This function is called after the tracefs has been initialized, * and we now have to create the files associated to the events. */ static void __trace_early_add_event_dirs(struct trace_array *tr) { struct trace_event_file *file; int ret; list_for_each_entry(file, &tr->events, list) { ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warn("Could not create directory for event %s\n", trace_event_name(file->event_call)); } } /* * For early boot up, the top trace array and the trace arrays created * by boot-time tracing require to have a list of events that can be * enabled. This must be done before the filesystem is set up in order * to allow events to be traced early. */ void __trace_early_add_events(struct trace_array *tr) { struct trace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { /* Early boot up should not have any modules loaded */ if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) && WARN_ON_ONCE(call->module)) continue; ret = __trace_early_add_new_event(call, tr); if (ret < 0) pr_warn("Could not create early event %s\n", trace_event_name(call)); } } /* Remove the event directory structure for a trace directory. */ static void __trace_remove_event_dirs(struct trace_array *tr) { struct trace_event_file *file, *next; list_for_each_entry_safe(file, next, &tr->events, list) remove_event_file_dir(file); } static void __add_event_to_tracers(struct trace_event_call *call) { struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) __trace_add_new_event(call, tr); } extern struct trace_event_call *__start_ftrace_events[]; extern struct trace_event_call *__stop_ftrace_events[]; static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event tracing"); return 1; } __setup("trace_event=", setup_trace_event); static int events_callback(const char *name, umode_t *mode, void **data, const struct file_operations **fops) { if (strcmp(name, "enable") == 0) { *mode = TRACE_MODE_WRITE; *fops = &ftrace_tr_enable_fops; return 1; } if (strcmp(name, "header_page") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_show_header_page_fops; } else if (strcmp(name, "header_event") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_show_header_event_fops; } else return 0; return 1; } /* Expects to have event_mutex held when called */ static int create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { struct eventfs_inode *e_events; struct dentry *entry; int nr_entries; static struct eventfs_entry events_entries[] = { { .name = "enable", .callback = events_callback, }, { .name = "header_page", .callback = events_callback, }, { .name = "header_event", .callback = events_callback, }, }; entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) return -ENOMEM; nr_entries = ARRAY_SIZE(events_entries); e_events = eventfs_create_events_dir("events", parent, events_entries, nr_entries, tr); if (IS_ERR(e_events)) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } /* There are not as crucial, just warn if they are not created */ trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_pid_fops); trace_create_file("set_event_notrace_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_notrace_pid_fops); tr->event_dir = e_events; return 0; } /** * event_trace_add_tracer - add a instance of a trace_array to events * @parent: The parent dentry to place the files/directories for events in * @tr: The trace array associated with these events * * When a new instance is created, it needs to set up its events * directory, as well as other files associated with events. It also * creates the event hierarchy in the @parent/events directory. * * Returns 0 on success. * * Must be called with event_mutex held. */ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; lockdep_assert_held(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) goto out; down_write(&trace_event_sem); /* If tr already has the event list, it is initialized in early boot. */ if (unlikely(!list_empty(&tr->events))) __trace_early_add_event_dirs(tr); else __trace_add_event_dirs(tr); up_write(&trace_event_sem); out: return ret; } /* * The top trace array already had its file descriptors created. * Now the files themselves need to be created. */ static __init int early_event_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; mutex_lock(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) goto out_unlock; down_write(&trace_event_sem); __trace_early_add_event_dirs(tr); up_write(&trace_event_sem); out_unlock: mutex_unlock(&event_mutex); return ret; } /* Must be called with event_mutex held */ int event_trace_del_tracer(struct trace_array *tr) { lockdep_assert_held(&event_mutex); /* Disable any event triggers and associated soft-disabled events */ clear_event_triggers(tr); /* Clear the pid list */ __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); /* Make sure no more events are being executed */ tracepoint_synchronize_unregister(); down_write(&trace_event_sem); __trace_remove_event_dirs(tr); eventfs_remove_events_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; return 0; } static __init int event_trace_memsetup(void) { field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC); return 0; } __init void early_enable_events(struct trace_array *tr, char *buf, bool disable_first) { char *token; int ret; while (true) { token = strsep(&buf, ","); if (!token) break; if (*token) { /* Restarting syscalls requires that we stop them first */ if (disable_first) ftrace_set_clr_event(tr, token, 0); ret = ftrace_set_clr_event(tr, token, 1); if (ret) pr_warn("Failed to enable trace event: %s\n", token); } /* Put back the comma to allow this to be called again */ if (buf) *(buf - 1) = ','; } } static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); struct trace_event_call **iter, *call; int ret; if (!tr) return -ENODEV; for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { call = *iter; ret = event_init(call); if (!ret) list_add(&call->list, &ftrace_events); } register_trigger_cmds(); /* * We need the top trace array to have a working set of trace * points at early init, before the debug files and directories * are created. Create the file entries now, and attach them * to the actual file dentries later. */ __trace_early_add_events(tr); early_enable_events(tr, bootup_event_buf, false); trace_printk_start_comm(); register_event_cmds(); return 0; } /* * event_trace_enable() is called from trace_event_init() first to * initialize events and perhaps start any events that are on the * command line. Unfortunately, there are some events that will not * start this early, like the system call tracepoints that need * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But * event_trace_enable() is called before pid 1 starts, and this flag * is never set, making the syscall tracepoint never get reached, but * the event is enabled regardless (and not doing anything). */ static __init int event_trace_enable_again(void) { struct trace_array *tr; tr = top_trace_array(); if (!tr) return -ENODEV; early_enable_events(tr, bootup_event_buf, true); return 0; } early_initcall(event_trace_enable_again); /* Init fields which doesn't related to the tracefs */ static __init int event_trace_init_fields(void) { if (trace_define_generic_fields()) pr_warn("tracing: Failed to allocated generic fields"); if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); return 0; } __init int event_trace_init(void) { struct trace_array *tr; int ret; tr = top_trace_array(); if (!tr) return -ENODEV; trace_create_file("available_events", TRACE_MODE_READ, NULL, tr, &ftrace_avail_fops); ret = early_event_add_tracer(NULL, tr); if (ret) return ret; #ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) pr_warn("Failed to register trace events module notifier\n"); #endif eventdir_initialized = true; return 0; } void __init trace_event_init(void) { event_trace_memsetup(); init_ftrace_syscalls(); event_trace_enable(); event_trace_init_fields(); } #ifdef CONFIG_EVENT_TRACE_STARTUP_TEST static DEFINE_SPINLOCK(test_spinlock); static DEFINE_SPINLOCK(test_spinlock_irq); static DEFINE_MUTEX(test_mutex); static __init void test_work(struct work_struct *dummy) { spin_lock(&test_spinlock); spin_lock_irq(&test_spinlock_irq); udelay(1); spin_unlock_irq(&test_spinlock_irq); spin_unlock(&test_spinlock); mutex_lock(&test_mutex); msleep(1); mutex_unlock(&test_mutex); } static __init int event_test_thread(void *unused) { void *test_malloc; test_malloc = kmalloc(1234, GFP_KERNEL); if (!test_malloc) pr_info("failed to kmalloc\n"); schedule_on_each_cpu(test_work); kfree(test_malloc); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return 0; } /* * Do various things that may trigger events. */ static __init void event_test_stuff(void) { struct task_struct *test_thread; test_thread = kthread_run(event_test_thread, NULL, "test-events"); msleep(1); kthread_stop(test_thread); } /* * For every trace event defined, we will test each trace point separately, * and then by groups, and finally all trace points. */ static __init void event_trace_self_tests(void) { struct trace_subsystem_dir *dir; struct trace_event_file *file; struct trace_event_call *call; struct event_subsystem *system; struct trace_array *tr; int ret; tr = top_trace_array(); if (!tr) return; pr_info("Running tests on trace events:\n"); list_for_each_entry(file, &tr->events, list) { call = file->event_call; /* Only test those that have a probe */ if (!call->class || !call->class->probe) continue; /* * Testing syscall events here is pretty useless, but * we still do it if configured. But this is time consuming. * What we really need is a user thread to perform the * syscalls as we test. */ #ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS if (call->class->system && strcmp(call->class->system, "syscalls") == 0) continue; #endif pr_info("Testing event %s: ", trace_event_name(call)); /* * If an event is already enabled, someone is using * it and the self test should not be on. */ if (file->flags & EVENT_FILE_FL_ENABLED) { pr_warn("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } ftrace_event_enable_disable(file, 1); event_test_stuff(); ftrace_event_enable_disable(file, 0); pr_cont("OK\n"); } /* Now test at the sub system level */ pr_info("Running tests on trace event systems:\n"); list_for_each_entry(dir, &tr->systems, list) { system = dir->subsystem; /* the ftrace system is special, skip it */ if (strcmp(system->name, "ftrace") == 0) continue; pr_info("Testing event system %s: ", system->name); ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling system %s\n", system->name); continue; } event_test_stuff(); ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling system %s\n", system->name); continue; } pr_cont("OK\n"); } /* Test with all events enabled */ pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling all events\n"); return; } event_test_stuff(); /* reset sysname */ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling all events\n"); return; } pr_cont("OK\n"); } #ifdef CONFIG_FUNCTION_TRACER static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *regs) { struct trace_buffer *buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; unsigned int trace_ctx; long disabled; int cpu; trace_ctx = tracing_gen_ctx(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, TRACE_FN, sizeof(*entry), trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; event_trigger_unlock_commit(&event_trace_file, buffer, event, entry, trace_ctx); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, }; static __init void event_trace_self_test_with_function(void) { int ret; event_trace_file.tr = top_trace_array(); if (WARN_ON(!event_trace_file.tr)) return; ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); return; } pr_info("Running tests again, along with the function tracer\n"); event_trace_self_tests(); unregister_ftrace_function(&trace_ops); } #else static __init void event_trace_self_test_with_function(void) { } #endif static __init int event_trace_self_tests_init(void) { if (!tracing_selftest_disabled) { event_trace_self_tests(); event_trace_self_test_with_function(); } return 0; } late_initcall(event_trace_self_tests_init); #endif |
28 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/if_team.h - Network team device driver header * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #ifndef _LINUX_IF_TEAM_H_ #define _LINUX_IF_TEAM_H_ #include <linux/netpoll.h> #include <net/sch_generic.h> #include <linux/types.h> #include <uapi/linux/if_team.h> struct team_pcpu_stats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_dropped; u32 tx_dropped; u32 rx_nohandler; }; struct team; struct team_port { struct net_device *dev; struct hlist_node hlist; /* node in enabled ports hash list */ struct list_head list; /* node in ordinary list */ struct team *team; int index; /* index of enabled port. If disabled, it's set to -1 */ bool linkup; /* either state.linkup or user.linkup */ struct { bool linkup; u32 speed; u8 duplex; } state; /* Values set by userspace */ struct { bool linkup; bool linkup_enabled; } user; /* Custom gennetlink interface related flags */ bool changed; bool removed; /* * A place for storing original values of the device before it * become a port. */ struct { unsigned char dev_addr[MAX_ADDR_LEN]; unsigned int mtu; } orig; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif s32 priority; /* lower number ~ higher priority */ u16 queue_id; struct list_head qom_list; /* node in queue override mapping list */ struct rcu_head rcu; long mode_priv[]; }; static inline struct team_port *team_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static inline bool team_port_enabled(struct team_port *port) { return port->index != -1; } static inline bool team_port_txable(struct team_port *port) { return port->linkup && team_port_enabled(port); } static inline bool team_port_dev_txable(const struct net_device *port_dev) { struct team_port *port; bool txable; rcu_read_lock(); port = team_port_get_rcu(port_dev); txable = port ? team_port_txable(port) : false; rcu_read_unlock(); return txable; } #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { } #endif struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); rx_handler_result_t (*receive)(struct team *team, struct team_port *port, struct sk_buff *skb); bool (*transmit)(struct team *team, struct sk_buff *skb); int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); void (*port_enabled)(struct team *team, struct team_port *port); void (*port_disabled)(struct team *team, struct team_port *port); }; extern int team_modeop_port_enter(struct team *team, struct team_port *port); extern void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port); enum team_option_type { TEAM_OPTION_TYPE_U32, TEAM_OPTION_TYPE_STRING, TEAM_OPTION_TYPE_BINARY, TEAM_OPTION_TYPE_BOOL, TEAM_OPTION_TYPE_S32, }; struct team_option_inst_info { u32 array_index; struct team_port *port; /* != NULL if per-port */ }; struct team_gsetter_ctx { union { u32 u32_val; const char *str_val; struct { const void *ptr; u32 len; } bin_val; bool bool_val; s32 s32_val; } data; struct team_option_inst_info *info; }; struct team_option { struct list_head list; const char *name; bool per_port; unsigned int array_size; /* != 0 means the option is array */ enum team_option_type type; void (*init)(struct team *team, struct team_option_inst_info *info); void (*getter)(struct team *team, struct team_gsetter_ctx *ctx); int (*setter)(struct team *team, struct team_gsetter_ctx *ctx); }; extern void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info); extern void team_options_change_check(struct team *team); struct team_mode { const char *kind; struct module *owner; size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 #define TEAM_PORT_HASHENTRIES (1 << TEAM_PORT_HASHBITS) #define TEAM_MODE_PRIV_LONGS 4 #define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS) struct team { struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; const struct header_ops *header_ops_cache; struct mutex lock; /* used for overall locking, e.g. port lists write */ /* * List of enabled ports and their count */ int en_port_count; struct hlist_head en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ struct list_head option_list; struct list_head option_inst_list; /* list of option instances */ const struct team_mode *mode; struct team_mode_ops ops; bool user_carrier_enabled; bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } notify_peers; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; struct lock_class_key team_lock_key; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); skb->dev = port->dev; if (unlikely(netpoll_tx_running(team->dev))) { team_netpoll_send_skb(port, skb); return 0; } return dev_queue_xmit(skb); } static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { return &team->en_port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)]; } static inline struct team_port *team_get_port_by_index(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline int team_num_to_port_index(struct team *team, unsigned int num) { int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; return num % en_port_count; } static inline struct team_port *team_get_port_by_index_rcu(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry_rcu(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline struct team_port * team_get_first_port_txable_rcu(struct team *team, struct team_port *port) { struct team_port *cur; if (likely(team_port_txable(port))) return port; cur = port; list_for_each_entry_continue_rcu(cur, &team->port_list, list) if (team_port_txable(cur)) return cur; list_for_each_entry_rcu(cur, &team->port_list, list) { if (cur == port) break; if (team_port_txable(cur)) return cur; } return NULL; } extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); extern void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count); extern int team_mode_register(const struct team_mode *mode); extern void team_mode_unregister(const struct team_mode *mode); #define TEAM_DEFAULT_NUM_TX_QUEUES 16 #define TEAM_DEFAULT_NUM_RX_QUEUES 16 #define MODULE_ALIAS_TEAM_MODE(kind) MODULE_ALIAS("team-mode-" kind) #endif /* _LINUX_IF_TEAM_H_ */ |
2 2 2 2 52 34 5 14 16 46 8 5 2 1 1 2 1 2 1 1 1 1 9 2 1 1 1 1 1 1 1 4 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 12 1 1 1 1 1 1 1 2 3 2 1 1 1 1 1 1 2 2 1 1 6 6 1 4 2 4 3 1 2 1 1 2 1 2 3 3 5 5 5 5 20 37 37 2 2 3 1 2 3 1 4 4 1 1 1 1 1 2 1 1 2 2 8 3 9 1 8 1 3 3 1 1 1 2 1 2 6 1 5 3 1 4 4 1 1 1 1 4 2 2 2 2 6 8 1 7 6 6 6 1 2 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 | // SPDX-License-Identifier: GPL-2.0-only /* * Netlink interface for IEEE 802.15.4 stack * * Copyright 2007, 2008 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Maxim Osipov <maxim.osipov@siemens.com> */ #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/ieee802154.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/sock.h> #include <linux/nl802154.h> #include <linux/export.h> #include <net/af_ieee802154.h> #include <net/ieee802154_netdev.h> #include <net/cfg802154.h> #include "ieee802154.h" static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr, int padattr) { return nla_put_u64_64bit(msg, type, swab64((__force u64)hwaddr), padattr); } static __le64 nla_get_hwaddr(const struct nlattr *nla) { return ieee802154_devaddr_from_raw(nla_data(nla)); } static int nla_put_shortaddr(struct sk_buff *msg, int type, __le16 addr) { return nla_put_u16(msg, type, le16_to_cpu(addr)); } static __le16 nla_get_shortaddr(const struct nlattr *nla) { return cpu_to_le16(nla_get_u16(nla)); } static int ieee802154_nl_start_confirm(struct net_device *dev, u8 status) { struct sk_buff *msg; pr_debug("%s\n", __func__); msg = ieee802154_nl_create(0, IEEE802154_START_CONF); if (!msg) return -ENOBUFS; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, dev->dev_addr) || nla_put_u8(msg, IEEE802154_ATTR_STATUS, status)) goto nla_put_failure; return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP); nla_put_failure: nlmsg_free(msg); return -ENOBUFS; } static int ieee802154_nl_fill_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct net_device *dev) { void *hdr; struct wpan_phy *phy; struct ieee802154_mlme_ops *ops; __le16 short_addr, pan_id; pr_debug("%s\n", __func__); hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, IEEE802154_LIST_IFACE); if (!hdr) goto out; ops = ieee802154_mlme_ops(dev); phy = dev->ieee802154_ptr->wpan_phy; BUG_ON(!phy); get_device(&phy->dev); rtnl_lock(); short_addr = dev->ieee802154_ptr->short_addr; pan_id = dev->ieee802154_ptr->pan_id; rtnl_unlock(); if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put(msg, IEEE802154_ATTR_HW_ADDR, IEEE802154_ADDR_LEN, dev->dev_addr) || nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, short_addr) || nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, pan_id)) goto nla_put_failure; if (ops->get_mac_params) { struct ieee802154_mac_params params; rtnl_lock(); ops->get_mac_params(dev, ¶ms); rtnl_unlock(); if (nla_put_s8(msg, IEEE802154_ATTR_TXPOWER, params.transmit_power / 100) || nla_put_u8(msg, IEEE802154_ATTR_LBT_ENABLED, params.lbt) || nla_put_u8(msg, IEEE802154_ATTR_CCA_MODE, params.cca.mode) || nla_put_s32(msg, IEEE802154_ATTR_CCA_ED_LEVEL, params.cca_ed_level / 100) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_RETRIES, params.csma_retries) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_MIN_BE, params.min_be) || nla_put_u8(msg, IEEE802154_ATTR_CSMA_MAX_BE, params.max_be) || nla_put_s8(msg, IEEE802154_ATTR_FRAME_RETRIES, params.frame_retries)) goto nla_put_failure; } wpan_phy_put(phy); genlmsg_end(msg, hdr); return 0; nla_put_failure: wpan_phy_put(phy); genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } /* Requests from userspace */ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) { struct net_device *dev; if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { char name[IFNAMSIZ + 1]; nla_strscpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], sizeof(name)); dev = dev_get_by_name(&init_net, name); } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) { dev = dev_get_by_index(&init_net, nla_get_u32(info->attrs[IEEE802154_ATTR_DEV_INDEX])); } else { return NULL; } if (!dev) return NULL; if (dev->type != ARPHRD_IEEE802154) { dev_put(dev); return NULL; } return dev; } int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct ieee802154_addr addr; u8 page; int ret = -EOPNOTSUPP; if (!info->attrs[IEEE802154_ATTR_CHANNEL] || !info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || (!info->attrs[IEEE802154_ATTR_COORD_HW_ADDR] && !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]) || !info->attrs[IEEE802154_ATTR_CAPABILITY]) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->assoc_req) goto out; if (info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]) { addr.mode = IEEE802154_ADDR_LONG; addr.extended_addr = nla_get_hwaddr( info->attrs[IEEE802154_ATTR_COORD_HW_ADDR]); } else { addr.mode = IEEE802154_ADDR_SHORT; addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); } addr.pan_id = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0); ret = ieee802154_mlme_ops(dev)->assoc_req(dev, &addr, nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]), page, nla_get_u8(info->attrs[IEEE802154_ATTR_CAPABILITY])); out: dev_put(dev); return ret; } int ieee802154_associate_resp(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct ieee802154_addr addr; int ret = -EOPNOTSUPP; if (!info->attrs[IEEE802154_ATTR_STATUS] || !info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] || !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->assoc_resp) goto out; addr.mode = IEEE802154_ADDR_LONG; addr.extended_addr = nla_get_hwaddr( info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]); rtnl_lock(); addr.pan_id = dev->ieee802154_ptr->pan_id; rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->assoc_resp(dev, &addr, nla_get_shortaddr(info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]), nla_get_u8(info->attrs[IEEE802154_ATTR_STATUS])); out: dev_put(dev); return ret; } int ieee802154_disassociate_req(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct ieee802154_addr addr; int ret = -EOPNOTSUPP; if ((!info->attrs[IEEE802154_ATTR_DEST_HW_ADDR] && !info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]) || !info->attrs[IEEE802154_ATTR_REASON]) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->disassoc_req) goto out; if (info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]) { addr.mode = IEEE802154_ADDR_LONG; addr.extended_addr = nla_get_hwaddr( info->attrs[IEEE802154_ATTR_DEST_HW_ADDR]); } else { addr.mode = IEEE802154_ADDR_SHORT; addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_DEST_SHORT_ADDR]); } rtnl_lock(); addr.pan_id = dev->ieee802154_ptr->pan_id; rtnl_unlock(); ret = ieee802154_mlme_ops(dev)->disassoc_req(dev, &addr, nla_get_u8(info->attrs[IEEE802154_ATTR_REASON])); out: dev_put(dev); return ret; } /* PANid, channel, beacon_order = 15, superframe_order = 15, * PAN_coordinator, battery_life_extension = 0, * coord_realignment = 0, security_enable = 0 */ int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct ieee802154_addr addr; u8 channel, bcn_ord, sf_ord; u8 page; int pan_coord, blx, coord_realign; int ret = -EBUSY; if (!info->attrs[IEEE802154_ATTR_COORD_PAN_ID] || !info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR] || !info->attrs[IEEE802154_ATTR_CHANNEL] || !info->attrs[IEEE802154_ATTR_BCN_ORD] || !info->attrs[IEEE802154_ATTR_SF_ORD] || !info->attrs[IEEE802154_ATTR_PAN_COORD] || !info->attrs[IEEE802154_ATTR_BAT_EXT] || !info->attrs[IEEE802154_ATTR_COORD_REALIGN] ) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (netif_running(dev)) goto out; if (!ieee802154_mlme_ops(dev)->start_req) { ret = -EOPNOTSUPP; goto out; } addr.mode = IEEE802154_ADDR_SHORT; addr.short_addr = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_SHORT_ADDR]); addr.pan_id = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); channel = nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]); bcn_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_BCN_ORD]); sf_ord = nla_get_u8(info->attrs[IEEE802154_ATTR_SF_ORD]); pan_coord = nla_get_u8(info->attrs[IEEE802154_ATTR_PAN_COORD]); blx = nla_get_u8(info->attrs[IEEE802154_ATTR_BAT_EXT]); coord_realign = nla_get_u8(info->attrs[IEEE802154_ATTR_COORD_REALIGN]); page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0); if (addr.short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { ieee802154_nl_start_confirm(dev, IEEE802154_NO_SHORT_ADDRESS); dev_put(dev); return -EINVAL; } rtnl_lock(); ret = ieee802154_mlme_ops(dev)->start_req(dev, &addr, channel, page, bcn_ord, sf_ord, pan_coord, blx, coord_realign); rtnl_unlock(); /* FIXME: add validation for unused parameters to be sane * for SoftMAC */ ieee802154_nl_start_confirm(dev, IEEE802154_SUCCESS); out: dev_put(dev); return ret; } int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; int ret = -EOPNOTSUPP; u8 type; u32 channels; u8 duration; u8 page; if (!info->attrs[IEEE802154_ATTR_SCAN_TYPE] || !info->attrs[IEEE802154_ATTR_CHANNELS] || !info->attrs[IEEE802154_ATTR_DURATION]) return -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->scan_req) goto out; type = nla_get_u8(info->attrs[IEEE802154_ATTR_SCAN_TYPE]); channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]); duration = nla_get_u8(info->attrs[IEEE802154_ATTR_DURATION]); page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0); ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, page, duration); out: dev_put(dev); return ret; } int ieee802154_list_iface(struct sk_buff *skb, struct genl_info *info) { /* Request for interface name, index, type, IEEE address, * PAN Id, short address */ struct sk_buff *msg; struct net_device *dev = NULL; int rc = -ENOBUFS; pr_debug("%s\n", __func__); dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out_dev; rc = ieee802154_nl_fill_iface(msg, info->snd_portid, info->snd_seq, 0, dev); if (rc < 0) goto out_free; dev_put(dev); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_dev: dev_put(dev); return rc; } int ieee802154_dump_iface(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *dev; int idx; int s_idx = cb->args[0]; pr_debug("%s\n", __func__); idx = 0; for_each_netdev(net, dev) { if (idx < s_idx || dev->type != ARPHRD_IEEE802154) goto cont; if (ieee802154_nl_fill_iface(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, dev) < 0) break; cont: idx++; } cb->args[0] = idx; return skb->len; } int ieee802154_set_macparams(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev = NULL; struct ieee802154_mlme_ops *ops; struct ieee802154_mac_params params; struct wpan_phy *phy; int rc = -EINVAL; pr_debug("%s\n", __func__); dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; ops = ieee802154_mlme_ops(dev); if (!ops->get_mac_params || !ops->set_mac_params) { rc = -EOPNOTSUPP; goto out; } if (netif_running(dev)) { rc = -EBUSY; goto out; } if (!info->attrs[IEEE802154_ATTR_LBT_ENABLED] && !info->attrs[IEEE802154_ATTR_CCA_MODE] && !info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL] && !info->attrs[IEEE802154_ATTR_CSMA_RETRIES] && !info->attrs[IEEE802154_ATTR_CSMA_MIN_BE] && !info->attrs[IEEE802154_ATTR_CSMA_MAX_BE] && !info->attrs[IEEE802154_ATTR_FRAME_RETRIES]) goto out; phy = dev->ieee802154_ptr->wpan_phy; get_device(&phy->dev); rtnl_lock(); ops->get_mac_params(dev, ¶ms); if (info->attrs[IEEE802154_ATTR_TXPOWER]) params.transmit_power = nla_get_s8(info->attrs[IEEE802154_ATTR_TXPOWER]) * 100; if (info->attrs[IEEE802154_ATTR_LBT_ENABLED]) params.lbt = nla_get_u8(info->attrs[IEEE802154_ATTR_LBT_ENABLED]); if (info->attrs[IEEE802154_ATTR_CCA_MODE]) params.cca.mode = nla_get_u8(info->attrs[IEEE802154_ATTR_CCA_MODE]); if (info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) params.cca_ed_level = nla_get_s32(info->attrs[IEEE802154_ATTR_CCA_ED_LEVEL]) * 100; if (info->attrs[IEEE802154_ATTR_CSMA_RETRIES]) params.csma_retries = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_RETRIES]); if (info->attrs[IEEE802154_ATTR_CSMA_MIN_BE]) params.min_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MIN_BE]); if (info->attrs[IEEE802154_ATTR_CSMA_MAX_BE]) params.max_be = nla_get_u8(info->attrs[IEEE802154_ATTR_CSMA_MAX_BE]); if (info->attrs[IEEE802154_ATTR_FRAME_RETRIES]) params.frame_retries = nla_get_s8(info->attrs[IEEE802154_ATTR_FRAME_RETRIES]); rc = ops->set_mac_params(dev, ¶ms); rtnl_unlock(); wpan_phy_put(phy); dev_put(dev); return 0; out: dev_put(dev); return rc; } static int ieee802154_llsec_parse_key_id(struct genl_info *info, struct ieee802154_llsec_key_id *desc) { memset(desc, 0, sizeof(*desc)); if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) return -EINVAL; desc->mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]); if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) { if (!info->attrs[IEEE802154_ATTR_PAN_ID]) return -EINVAL; desc->device_addr.pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]); if (info->attrs[IEEE802154_ATTR_SHORT_ADDR]) { desc->device_addr.mode = IEEE802154_ADDR_SHORT; desc->device_addr.short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]); } else { if (!info->attrs[IEEE802154_ATTR_HW_ADDR]) return -EINVAL; desc->device_addr.mode = IEEE802154_ADDR_LONG; desc->device_addr.extended_addr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); } } if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]) return -EINVAL; if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]) return -EINVAL; if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]) return -EINVAL; if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT) desc->id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_ID]); switch (desc->mode) { case IEEE802154_SCF_KEY_SHORT_INDEX: { u32 source = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT]); desc->short_source = cpu_to_le32(source); break; } case IEEE802154_SCF_KEY_HW_INDEX: desc->extended_source = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED]); break; } return 0; } static int ieee802154_llsec_fill_key_id(struct sk_buff *msg, const struct ieee802154_llsec_key_id *desc) { if (nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_MODE, desc->mode)) return -EMSGSIZE; if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) { if (nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->device_addr.pan_id)) return -EMSGSIZE; if (desc->device_addr.mode == IEEE802154_ADDR_SHORT && nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, desc->device_addr.short_addr)) return -EMSGSIZE; if (desc->device_addr.mode == IEEE802154_ADDR_LONG && nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->device_addr.extended_addr, IEEE802154_ATTR_PAD)) return -EMSGSIZE; } if (desc->mode != IEEE802154_SCF_KEY_IMPLICIT && nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_ID, desc->id)) return -EMSGSIZE; if (desc->mode == IEEE802154_SCF_KEY_SHORT_INDEX && nla_put_u32(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_SHORT, le32_to_cpu(desc->short_source))) return -EMSGSIZE; if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX && nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED, desc->extended_source, IEEE802154_ATTR_PAD)) return -EMSGSIZE; return 0; } int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct net_device *dev = NULL; int rc = -ENOBUFS; struct ieee802154_mlme_ops *ops; void *hdr; struct ieee802154_llsec_params params; pr_debug("%s\n", __func__); dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; ops = ieee802154_mlme_ops(dev); if (!ops->llsec) { rc = -EOPNOTSUPP; goto out_dev; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out_dev; hdr = genlmsg_put(msg, 0, info->snd_seq, &nl802154_family, 0, IEEE802154_LLSEC_GETPARAMS); if (!hdr) goto out_free; rc = ops->llsec->get_params(dev, ¶ms); if (rc < 0) goto out_free; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_ENABLED, params.enabled) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVEL, params.out_level) || nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, be32_to_cpu(params.frame_counter)) || ieee802154_llsec_fill_key_id(msg, ¶ms.out_key)) { rc = -ENOBUFS; goto out_free; } dev_put(dev); return ieee802154_nl_reply(msg, info); out_free: nlmsg_free(msg); out_dev: dev_put(dev); return rc; } int ieee802154_llsec_setparams(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev = NULL; int rc = -EINVAL; struct ieee802154_mlme_ops *ops; struct ieee802154_llsec_params params; int changed = 0; pr_debug("%s\n", __func__); dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!info->attrs[IEEE802154_ATTR_LLSEC_ENABLED] && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE] && !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) goto out; ops = ieee802154_mlme_ops(dev); if (!ops->llsec) { rc = -EOPNOTSUPP; goto out; } if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL] && nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) > 7) goto out; if (info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]) { params.enabled = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_ENABLED]); changed |= IEEE802154_LLSEC_PARAM_ENABLED; } if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]) { if (ieee802154_llsec_parse_key_id(info, ¶ms.out_key)) goto out; changed |= IEEE802154_LLSEC_PARAM_OUT_KEY; } if (info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]) { params.out_level = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVEL]); changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL; } if (info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]) { u32 fc = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); params.frame_counter = cpu_to_be32(fc); changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER; } rc = ops->llsec->set_params(dev, ¶ms, changed); dev_put(dev); return rc; out: dev_put(dev); return rc; } struct llsec_dump_data { struct sk_buff *skb; int s_idx, s_idx2; int portid; int nlmsg_seq; struct net_device *dev; struct ieee802154_mlme_ops *ops; struct ieee802154_llsec_table *table; }; static int ieee802154_llsec_dump_table(struct sk_buff *skb, struct netlink_callback *cb, int (*step)(struct llsec_dump_data *)) { struct net *net = sock_net(skb->sk); struct net_device *dev; struct llsec_dump_data data; int idx = 0; int first_dev = cb->args[0]; int rc; for_each_netdev(net, dev) { if (idx < first_dev || dev->type != ARPHRD_IEEE802154) goto skip; data.ops = ieee802154_mlme_ops(dev); if (!data.ops->llsec) goto skip; data.skb = skb; data.s_idx = cb->args[1]; data.s_idx2 = cb->args[2]; data.dev = dev; data.portid = NETLINK_CB(cb->skb).portid; data.nlmsg_seq = cb->nlh->nlmsg_seq; data.ops->llsec->lock_table(dev); data.ops->llsec->get_table(data.dev, &data.table); rc = step(&data); data.ops->llsec->unlock_table(dev); if (rc < 0) break; skip: idx++; } cb->args[0] = idx; return skb->len; } static int ieee802154_nl_llsec_change(struct sk_buff *skb, struct genl_info *info, int (*fn)(struct net_device*, struct genl_info*)) { struct net_device *dev = NULL; int rc = -EINVAL; dev = ieee802154_nl_get_dev(info); if (!dev) return -ENODEV; if (!ieee802154_mlme_ops(dev)->llsec) rc = -EOPNOTSUPP; else rc = fn(dev, info); dev_put(dev); return rc; } static int ieee802154_llsec_parse_key(struct genl_info *info, struct ieee802154_llsec_key *key) { u8 frames; u32 commands[256 / 32]; memset(key, 0, sizeof(*key)); if (!info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES] || !info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES]) return -EINVAL; frames = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES]); if ((frames & BIT(IEEE802154_FC_TYPE_MAC_CMD)) && !info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) return -EINVAL; if (info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS]) { nla_memcpy(commands, info->attrs[IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS], 256 / 8); if (commands[0] || commands[1] || commands[2] || commands[3] || commands[4] || commands[5] || commands[6] || commands[7] >= BIT(IEEE802154_CMD_GTS_REQ + 1)) return -EINVAL; key->cmd_frame_ids = commands[7]; } key->frame_types = frames; nla_memcpy(key->key, info->attrs[IEEE802154_ATTR_LLSEC_KEY_BYTES], IEEE802154_LLSEC_KEY_SIZE); return 0; } static int llsec_add_key(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_key key; struct ieee802154_llsec_key_id id; if (ieee802154_llsec_parse_key(info, &key) || ieee802154_llsec_parse_key_id(info, &id)) return -EINVAL; return ops->llsec->add_key(dev, &id, &key); } int ieee802154_llsec_add_key(struct sk_buff *skb, struct genl_info *info) { if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != (NLM_F_CREATE | NLM_F_EXCL)) return -EINVAL; return ieee802154_nl_llsec_change(skb, info, llsec_add_key); } static int llsec_remove_key(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_key_id id; if (ieee802154_llsec_parse_key_id(info, &id)) return -EINVAL; return ops->llsec->del_key(dev, &id); } int ieee802154_llsec_del_key(struct sk_buff *skb, struct genl_info *info) { return ieee802154_nl_llsec_change(skb, info, llsec_remove_key); } static int ieee802154_nl_fill_key(struct sk_buff *msg, u32 portid, u32 seq, const struct ieee802154_llsec_key_entry *key, const struct net_device *dev) { void *hdr; u32 commands[256 / 32]; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, IEEE802154_LLSEC_LIST_KEY); if (!hdr) goto out; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || ieee802154_llsec_fill_key_id(msg, &key->id) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_FRAME_TYPES, key->key->frame_types)) goto nla_put_failure; if (key->key->frame_types & BIT(IEEE802154_FC_TYPE_MAC_CMD)) { memset(commands, 0, sizeof(commands)); commands[7] = key->key->cmd_frame_ids; if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_USAGE_COMMANDS, sizeof(commands), commands)) goto nla_put_failure; } if (nla_put(msg, IEEE802154_ATTR_LLSEC_KEY_BYTES, IEEE802154_LLSEC_KEY_SIZE, key->key->key)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } static int llsec_iter_keys(struct llsec_dump_data *data) { struct ieee802154_llsec_key_entry *pos; int rc = 0, idx = 0; list_for_each_entry(pos, &data->table->keys, list) { if (idx++ < data->s_idx) continue; if (ieee802154_nl_fill_key(data->skb, data->portid, data->nlmsg_seq, pos, data->dev)) { rc = -EMSGSIZE; break; } data->s_idx++; } return rc; } int ieee802154_llsec_dump_keys(struct sk_buff *skb, struct netlink_callback *cb) { return ieee802154_llsec_dump_table(skb, cb, llsec_iter_keys); } static int llsec_parse_dev(struct genl_info *info, struct ieee802154_llsec_device *dev) { memset(dev, 0, sizeof(*dev)); if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] || !info->attrs[IEEE802154_ATTR_HW_ADDR] || !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] || !info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] || (!!info->attrs[IEEE802154_ATTR_PAN_ID] != !!info->attrs[IEEE802154_ATTR_SHORT_ADDR])) return -EINVAL; if (info->attrs[IEEE802154_ATTR_PAN_ID]) { dev->pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]); dev->short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]); } else { dev->short_addr = cpu_to_le16(IEEE802154_ADDR_UNDEF); } dev->hwaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); dev->frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); dev->seclevel_exempt = !!nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]); dev->key_mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE]); if (dev->key_mode >= __IEEE802154_LLSEC_DEVKEY_MAX) return -EINVAL; return 0; } static int llsec_add_dev(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_device desc; if (llsec_parse_dev(info, &desc)) return -EINVAL; return ops->llsec->add_dev(dev, &desc); } int ieee802154_llsec_add_dev(struct sk_buff *skb, struct genl_info *info) { if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != (NLM_F_CREATE | NLM_F_EXCL)) return -EINVAL; return ieee802154_nl_llsec_change(skb, info, llsec_add_dev); } static int llsec_del_dev(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); __le64 devaddr; if (!info->attrs[IEEE802154_ATTR_HW_ADDR]) return -EINVAL; devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); return ops->llsec->del_dev(dev, devaddr); } int ieee802154_llsec_del_dev(struct sk_buff *skb, struct genl_info *info) { return ieee802154_nl_llsec_change(skb, info, llsec_del_dev); } static int ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq, const struct ieee802154_llsec_device *desc, const struct net_device *dev) { void *hdr; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, IEEE802154_LLSEC_LIST_DEV); if (!hdr) goto out; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) || nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, desc->short_addr) || nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr, IEEE802154_ATTR_PAD) || nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, desc->frame_counter) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, desc->seclevel_exempt) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_KEY_MODE, desc->key_mode)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } static int llsec_iter_devs(struct llsec_dump_data *data) { struct ieee802154_llsec_device *pos; int rc = 0, idx = 0; list_for_each_entry(pos, &data->table->devices, list) { if (idx++ < data->s_idx) continue; if (ieee802154_nl_fill_dev(data->skb, data->portid, data->nlmsg_seq, pos, data->dev)) { rc = -EMSGSIZE; break; } data->s_idx++; } return rc; } int ieee802154_llsec_dump_devs(struct sk_buff *skb, struct netlink_callback *cb) { return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devs); } static int llsec_add_devkey(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_device_key key; __le64 devaddr; if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER] || !info->attrs[IEEE802154_ATTR_HW_ADDR] || ieee802154_llsec_parse_key_id(info, &key.key_id)) return -EINVAL; devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); key.frame_counter = nla_get_u32(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_COUNTER]); return ops->llsec->add_devkey(dev, devaddr, &key); } int ieee802154_llsec_add_devkey(struct sk_buff *skb, struct genl_info *info) { if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != (NLM_F_CREATE | NLM_F_EXCL)) return -EINVAL; return ieee802154_nl_llsec_change(skb, info, llsec_add_devkey); } static int llsec_del_devkey(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_device_key key; __le64 devaddr; if (!info->attrs[IEEE802154_ATTR_HW_ADDR] || ieee802154_llsec_parse_key_id(info, &key.key_id)) return -EINVAL; devaddr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]); return ops->llsec->del_devkey(dev, devaddr, &key); } int ieee802154_llsec_del_devkey(struct sk_buff *skb, struct genl_info *info) { return ieee802154_nl_llsec_change(skb, info, llsec_del_devkey); } static int ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq, __le64 devaddr, const struct ieee802154_llsec_device_key *devkey, const struct net_device *dev) { void *hdr; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, IEEE802154_LLSEC_LIST_DEVKEY); if (!hdr) goto out; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr, IEEE802154_ATTR_PAD) || nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, devkey->frame_counter) || ieee802154_llsec_fill_key_id(msg, &devkey->key_id)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } static int llsec_iter_devkeys(struct llsec_dump_data *data) { struct ieee802154_llsec_device *dpos; struct ieee802154_llsec_device_key *kpos; int idx = 0, idx2; list_for_each_entry(dpos, &data->table->devices, list) { if (idx++ < data->s_idx) continue; idx2 = 0; list_for_each_entry(kpos, &dpos->keys, list) { if (idx2++ < data->s_idx2) continue; if (ieee802154_nl_fill_devkey(data->skb, data->portid, data->nlmsg_seq, dpos->hwaddr, kpos, data->dev)) { return -EMSGSIZE; } data->s_idx2++; } data->s_idx++; } return 0; } int ieee802154_llsec_dump_devkeys(struct sk_buff *skb, struct netlink_callback *cb) { return ieee802154_llsec_dump_table(skb, cb, llsec_iter_devkeys); } static int llsec_parse_seclevel(struct genl_info *info, struct ieee802154_llsec_seclevel *sl) { memset(sl, 0, sizeof(*sl)); if (!info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE] || !info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS] || !info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]) return -EINVAL; sl->frame_type = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_FRAME_TYPE]); if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD) { if (!info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]) return -EINVAL; sl->cmd_frame_id = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_CMD_FRAME_ID]); } sl->sec_levels = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_SECLEVELS]); sl->device_override = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE]); return 0; } static int llsec_add_seclevel(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_seclevel sl; if (llsec_parse_seclevel(info, &sl)) return -EINVAL; return ops->llsec->add_seclevel(dev, &sl); } int ieee802154_llsec_add_seclevel(struct sk_buff *skb, struct genl_info *info) { if ((info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) != (NLM_F_CREATE | NLM_F_EXCL)) return -EINVAL; return ieee802154_nl_llsec_change(skb, info, llsec_add_seclevel); } static int llsec_del_seclevel(struct net_device *dev, struct genl_info *info) { struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct ieee802154_llsec_seclevel sl; if (llsec_parse_seclevel(info, &sl)) return -EINVAL; return ops->llsec->del_seclevel(dev, &sl); } int ieee802154_llsec_del_seclevel(struct sk_buff *skb, struct genl_info *info) { return ieee802154_nl_llsec_change(skb, info, llsec_del_seclevel); } static int ieee802154_nl_fill_seclevel(struct sk_buff *msg, u32 portid, u32 seq, const struct ieee802154_llsec_seclevel *sl, const struct net_device *dev) { void *hdr; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, NLM_F_MULTI, IEEE802154_LLSEC_LIST_SECLEVEL); if (!hdr) goto out; if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_FRAME_TYPE, sl->frame_type) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVELS, sl->sec_levels) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, sl->device_override)) goto nla_put_failure; if (sl->frame_type == IEEE802154_FC_TYPE_MAC_CMD && nla_put_u8(msg, IEEE802154_ATTR_LLSEC_CMD_FRAME_ID, sl->cmd_frame_id)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); out: return -EMSGSIZE; } static int llsec_iter_seclevels(struct llsec_dump_data *data) { struct ieee802154_llsec_seclevel *pos; int rc = 0, idx = 0; list_for_each_entry(pos, &data->table->security_levels, list) { if (idx++ < data->s_idx) continue; if (ieee802154_nl_fill_seclevel(data->skb, data->portid, data->nlmsg_seq, pos, data->dev)) { rc = -EMSGSIZE; break; } data->s_idx++; } return rc; } int ieee802154_llsec_dump_seclevels(struct sk_buff *skb, struct netlink_callback *cb) { return ieee802154_llsec_dump_table(skb, cb, llsec_iter_seclevels); } |
16 16 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | #include <linux/dcache.h> #include "internal.h" unsigned name_to_int(const struct qstr *qstr) { const char *name = qstr->name; int len = qstr->len; unsigned n = 0; if (len > 1 && *name == '0') goto out; do { unsigned c = *name++ - '0'; if (c > 9) goto out; if (n >= (~0U-9)/10) goto out; n *= 10; n += c; } while (--len > 0); return n; out: return ~0U; } |
8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | // SPDX-License-Identifier: GPL-2.0-only /* ipv6header match - matches IPv6 packets based on whether they contain certain headers */ /* Original idea: Brad Chapman * Rewritten by: Andras Kis-Szabo <kisza@sch.bme.hu> */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv6/ip6t_ipv6header.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 header types match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static bool ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct ip6t_ipv6header_info *info = par->matchinfo; unsigned int temp; int len; u8 nexthdr; unsigned int ptr; /* Make sure this isn't an evil packet */ /* type of the 1st exthdr */ nexthdr = ipv6_hdr(skb)->nexthdr; /* pointer to the 1st exthdr */ ptr = sizeof(struct ipv6hdr); /* available length */ len = skb->len - ptr; temp = 0; while (nf_ip6_ext_hdr(nexthdr)) { const struct ipv6_opt_hdr *hp; struct ipv6_opt_hdr _hdr; int hdrlen; /* No more exthdr -> evaluate */ if (nexthdr == NEXTHDR_NONE) { temp |= MASK_NONE; break; } /* Is there enough space for the next ext header? */ if (len < (int)sizeof(struct ipv6_opt_hdr)) return false; /* ESP -> evaluate */ if (nexthdr == NEXTHDR_ESP) { temp |= MASK_ESP; break; } hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); if (!hp) { par->hotdrop = true; return false; } /* Calculate the header length */ if (nexthdr == NEXTHDR_FRAGMENT) hdrlen = 8; else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); /* set the flag */ switch (nexthdr) { case NEXTHDR_HOP: temp |= MASK_HOPOPTS; break; case NEXTHDR_ROUTING: temp |= MASK_ROUTING; break; case NEXTHDR_FRAGMENT: temp |= MASK_FRAGMENT; break; case NEXTHDR_AUTH: temp |= MASK_AH; break; case NEXTHDR_DEST: temp |= MASK_DSTOPTS; break; default: return false; } nexthdr = hp->nexthdr; len -= hdrlen; ptr += hdrlen; if (ptr > skb->len) break; } if (nexthdr != NEXTHDR_NONE && nexthdr != NEXTHDR_ESP) temp |= MASK_PROTO; if (info->modeflag) return !((temp ^ info->matchflags ^ info->invflags) & info->matchflags); else { if (info->invflags) return temp != info->matchflags; else return temp == info->matchflags; } } static int ipv6header_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_ipv6header_info *info = par->matchinfo; /* invflags is 0 or 0xff in hard mode */ if ((!info->modeflag) && info->invflags != 0x00 && info->invflags != 0xFF) return -EINVAL; return 0; } static struct xt_match ipv6header_mt6_reg __read_mostly = { .name = "ipv6header", .family = NFPROTO_IPV6, .match = ipv6header_mt6, .matchsize = sizeof(struct ip6t_ipv6header_info), .checkentry = ipv6header_mt6_check, .destroy = NULL, .me = THIS_MODULE, }; static int __init ipv6header_mt6_init(void) { return xt_register_match(&ipv6header_mt6_reg); } static void __exit ipv6header_mt6_exit(void) { xt_unregister_match(&ipv6header_mt6_reg); } module_init(ipv6header_mt6_init); module_exit(ipv6header_mt6_exit); |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 | /* SPDX-License-Identifier: GPL-2.0 */ /* * This file define the new driver API for Wireless Extensions * * Version : 8 16.3.07 * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 2001-2007 Jean Tourrilhes, All Rights Reserved. */ #ifndef _IW_HANDLER_H #define _IW_HANDLER_H /************************** DOCUMENTATION **************************/ /* * Initial driver API (1996 -> onward) : * ----------------------------------- * The initial API just sends the IOCTL request received from user space * to the driver (via the driver ioctl handler). The driver has to * handle all the rest... * * The initial API also defines a specific handler in struct net_device * to handle wireless statistics. * * The initial APIs served us well and has proven a reasonably good design. * However, there are a few shortcomings : * o No events, everything is a request to the driver. * o Large ioctl function in driver with gigantic switch statement * (i.e. spaghetti code). * o Driver has to mess up with copy_to/from_user, and in many cases * does it unproperly. Common mistakes are : * * buffer overflows (no checks or off by one checks) * * call copy_to/from_user with irq disabled * o The user space interface is tied to ioctl because of the use * copy_to/from_user. * * New driver API (2002 -> onward) : * ------------------------------- * The new driver API is just a bunch of standard functions (handlers), * each handling a specific Wireless Extension. The driver just export * the list of handler it supports, and those will be called appropriately. * * I tried to keep the main advantage of the previous API (simplicity, * efficiency and light weight), and also I provide a good dose of backward * compatibility (most structures are the same, driver can use both API * simultaneously, ...). * Hopefully, I've also addressed the shortcoming of the initial API. * * The advantage of the new API are : * o Handling of Extensions in driver broken in small contained functions * o Tighter checks of ioctl before calling the driver * o Flexible commit strategy (at least, the start of it) * o Backward compatibility (can be mixed with old API) * o Driver doesn't have to worry about memory and user-space issues * The last point is important for the following reasons : * o You are now able to call the new driver API from any API you * want (including from within other parts of the kernel). * o Common mistakes are avoided (buffer overflow, user space copy * with irq disabled and so on). * * The Drawback of the new API are : * o bloat (especially kernel) * o need to migrate existing drivers to new API * My initial testing shows that the new API adds around 3kB to the kernel * and save between 0 and 5kB from a typical driver. * Also, as all structures and data types are unchanged, the migration is * quite straightforward (but tedious). * * --- * * The new driver API is defined below in this file. User space should * not be aware of what's happening down there... * * A new kernel wrapper is in charge of validating the IOCTLs and calling * the appropriate driver handler. This is implemented in : * # net/core/wireless.c * * The driver export the list of handlers in : * # include/linux/netdevice.h (one place) * * The new driver API is available for WIRELESS_EXT >= 13. * Good luck with migration to the new API ;-) */ /* ---------------------- THE IMPLEMENTATION ---------------------- */ /* * Some of the choice I've made are pretty controversial. Defining an * API is very much weighting compromises. This goes into some of the * details and the thinking behind the implementation. * * Implementation goals : * -------------------- * The implementation goals were as follow : * o Obvious : you should not need a PhD to understand what's happening, * the benefit is easier maintenance. * o Flexible : it should accommodate a wide variety of driver * implementations and be as flexible as the old API. * o Lean : it should be efficient memory wise to minimise the impact * on kernel footprint. * o Transparent to user space : the large number of user space * applications that use Wireless Extensions should not need * any modifications. * * Array of functions versus Struct of functions * --------------------------------------------- * 1) Having an array of functions allow the kernel code to access the * handler in a single lookup, which is much more efficient (think hash * table here). * 2) The only drawback is that driver writer may put their handler in * the wrong slot. This is trivial to test (I set the frequency, the * bitrate changes). Once the handler is in the proper slot, it will be * there forever, because the array is only extended at the end. * 3) Backward/forward compatibility : adding new handler just require * extending the array, so you can put newer driver in older kernel * without having to patch the kernel code (and vice versa). * * All handler are of the same generic type * ---------------------------------------- * That's a feature !!! * 1) Having a generic handler allow to have generic code, which is more * efficient. If each of the handler was individually typed I would need * to add a big switch in the kernel (== more bloat). This solution is * more scalable, adding new Wireless Extensions doesn't add new code. * 2) You can use the same handler in different slots of the array. For * hardware, it may be more efficient or logical to handle multiple * Wireless Extensions with a single function, and the API allow you to * do that. (An example would be a single record on the card to control * both bitrate and frequency, the handler would read the old record, * modify it according to info->cmd and rewrite it). * * Functions prototype uses union iwreq_data * ----------------------------------------- * Some would have preferred functions defined this way : * static int mydriver_ioctl_setrate(struct net_device *dev, * long rate, int auto) * 1) The kernel code doesn't "validate" the content of iwreq_data, and * can't do it (different hardware may have different notion of what a * valid frequency is), so we don't pretend that we do it. * 2) The above form is not extendable. If I want to add a flag (for * example to distinguish setting max rate and basic rate), I would * break the prototype. Using iwreq_data is more flexible. * 3) Also, the above form is not generic (see above). * 4) I don't expect driver developer using the wrong field of the * union (Doh !), so static typechecking doesn't add much value. * 5) Lastly, you can skip the union by doing : * static int mydriver_ioctl_setrate(struct net_device *dev, * struct iw_request_info *info, * struct iw_param *rrq, * char *extra) * And then adding the handler in the array like this : * (iw_handler) mydriver_ioctl_setrate, // SIOCSIWRATE * * Using functions and not a registry * ---------------------------------- * Another implementation option would have been for every instance to * define a registry (a struct containing all the Wireless Extensions) * and only have a function to commit the registry to the hardware. * 1) This approach can be emulated by the current code, but not * vice versa. * 2) Some drivers don't keep any configuration in the driver, for them * adding such a registry would be a significant bloat. * 3) The code to translate from Wireless Extension to native format is * needed anyway, so it would not reduce significantely the amount of code. * 4) The current approach only selectively translate Wireless Extensions * to native format and only selectively set, whereas the registry approach * would require to translate all WE and set all parameters for any single * change. * 5) For many Wireless Extensions, the GET operation return the current * dynamic value, not the value that was set. * * This header is <net/iw_handler.h> * --------------------------------- * 1) This header is kernel space only and should not be exported to * user space. Headers in "include/linux/" are exported, headers in * "include/net/" are not. * * Mixed 32/64 bit issues * ---------------------- * The Wireless Extensions are designed to be 64 bit clean, by using only * datatypes with explicit storage size. * There are some issues related to kernel and user space using different * memory model, and in particular 64bit kernel with 32bit user space. * The problem is related to struct iw_point, that contains a pointer * that *may* need to be translated. * This is quite messy. The new API doesn't solve this problem (it can't), * but is a step in the right direction : * 1) Meta data about each ioctl is easily available, so we know what type * of translation is needed. * 2) The move of data between kernel and user space is only done in a single * place in the kernel, so adding specific hooks in there is possible. * 3) In the long term, it allows to move away from using ioctl as the * user space API. * * So many comments and so few code * -------------------------------- * That's a feature. Comments won't bloat the resulting kernel binary. */ /***************************** INCLUDES *****************************/ #include <linux/wireless.h> /* IOCTL user space API */ #include <linux/if_ether.h> /***************************** VERSION *****************************/ /* * This constant is used to know which version of the driver API is * available. Hopefully, this will be pretty stable and no changes * will be needed... * I just plan to increment with each new version. */ #define IW_HANDLER_VERSION 8 /* * Changes : * * V2 to V3 * -------- * - Move event definition in <linux/wireless.h> * - Add Wireless Event support : * o wireless_send_event() prototype * o iwe_stream_add_event/point() inline functions * V3 to V4 * -------- * - Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes * * V4 to V5 * -------- * - Add new spy support : struct iw_spy_data & prototypes * * V5 to V6 * -------- * - Change the way we get to spy_data method for added safety * - Remove spy #ifdef, they are always on -> cleaner code * - Add IW_DESCR_FLAG_NOMAX flag for very large requests * - Start migrating get_wireless_stats to struct iw_handler_def * * V6 to V7 * -------- * - Add struct ieee80211_device pointer in struct iw_public_data * - Remove (struct iw_point *)->pointer from events and streams * - Remove spy_offset from struct iw_handler_def * - Add "check" version of event macros for ieee802.11 stack * * V7 to V8 * ---------- * - Prevent leaking of kernel space in stream on 64 bits. */ /**************************** CONSTANTS ****************************/ /* Enhanced spy support available */ #define IW_WIRELESS_SPY #define IW_WIRELESS_THRSPY /* Special error message for the driver to indicate that we * should do a commit after return from the iw_handler */ #define EIWCOMMIT EINPROGRESS /* Flags available in struct iw_request_info */ #define IW_REQUEST_FLAG_COMPAT 0x0001 /* Compat ioctl call */ /* Type of headers we know about (basically union iwreq_data) */ #define IW_HEADER_TYPE_NULL 0 /* Not available */ #define IW_HEADER_TYPE_CHAR 2 /* char [IFNAMSIZ] */ #define IW_HEADER_TYPE_UINT 4 /* __u32 */ #define IW_HEADER_TYPE_FREQ 5 /* struct iw_freq */ #define IW_HEADER_TYPE_ADDR 6 /* struct sockaddr */ #define IW_HEADER_TYPE_POINT 8 /* struct iw_point */ #define IW_HEADER_TYPE_PARAM 9 /* struct iw_param */ #define IW_HEADER_TYPE_QUAL 10 /* struct iw_quality */ /* Handling flags */ /* Most are not implemented. I just use them as a reminder of some * cool features we might need one day ;-) */ #define IW_DESCR_FLAG_NONE 0x0000 /* Obvious */ /* Wrapper level flags */ #define IW_DESCR_FLAG_DUMP 0x0001 /* Not part of the dump command */ #define IW_DESCR_FLAG_EVENT 0x0002 /* Generate an event on SET */ #define IW_DESCR_FLAG_RESTRICT 0x0004 /* GET : request is ROOT only */ /* SET : Omit payload from generated iwevent */ #define IW_DESCR_FLAG_NOMAX 0x0008 /* GET : no limit on request size */ /****************************** TYPES ******************************/ /* ----------------------- WIRELESS HANDLER ----------------------- */ /* * A wireless handler is just a standard function, that looks like the * ioctl handler. * We also define there how a handler list look like... As the Wireless * Extension space is quite dense, we use a simple array, which is faster * (that's the perfect hash table ;-). */ /* * Meta data about the request passed to the iw_handler. * Most handlers can safely ignore what's in there. * The 'cmd' field might come handy if you want to use the same handler * for multiple command... * This struct is also my long term insurance. I can add new fields here * without breaking the prototype of iw_handler... */ struct iw_request_info { __u16 cmd; /* Wireless Extension command */ __u16 flags; /* More to come ;-) */ }; struct net_device; /* * This is how a function handling a Wireless Extension should look * like (both get and set, standard and private). */ typedef int (*iw_handler)(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* * This define all the handler that the driver export. * As you need only one per driver type, please use a static const * shared by all driver instances... Same for the members... * This will be linked from net_device in <linux/netdevice.h> */ struct iw_handler_def { /* Array of handlers for standard ioctls * We will call dev->wireless_handlers->standard[ioctl - SIOCIWFIRST] */ const iw_handler * standard; /* Number of handlers defined (more precisely, index of the * last defined handler + 1) */ __u16 num_standard; #ifdef CONFIG_WEXT_PRIV __u16 num_private; /* Number of private arg description */ __u16 num_private_args; /* Array of handlers for private ioctls * Will call dev->wireless_handlers->private[ioctl - SIOCIWFIRSTPRIV] */ const iw_handler * private; /* Arguments of private handler. This one is just a list, so you * can put it in any order you want and should not leave holes... * We will automatically export that to user space... */ const struct iw_priv_args * private_args; #endif /* New location of get_wireless_stats, to de-bloat struct net_device. * The old pointer in struct net_device will be gradually phased * out, and drivers are encouraged to use this one... */ struct iw_statistics* (*get_wireless_stats)(struct net_device *dev); }; /* ---------------------- IOCTL DESCRIPTION ---------------------- */ /* * One of the main goal of the new interface is to deal entirely with * user space/kernel space memory move. * For that, we need to know : * o if iwreq is a pointer or contain the full data * o what is the size of the data to copy * * For private IOCTLs, we use the same rules as used by iwpriv and * defined in struct iw_priv_args. * * For standard IOCTLs, things are quite different and we need to * use the structures below. Actually, this struct is also more * efficient, but that's another story... */ /* * Describe how a standard IOCTL looks like. */ struct iw_ioctl_description { __u8 header_type; /* NULL, iw_point or other */ __u8 flags; /* Special handling of the request */ __u16 token_size; /* Granularity of payload */ __u16 min_tokens; /* Min acceptable token number */ __u16 max_tokens; /* Max acceptable token number */ }; /* Need to think of short header translation table. Later. */ /* --------------------- ENHANCED SPY SUPPORT --------------------- */ /* * In the old days, the driver was handling spy support all by itself. * Now, the driver can delegate this task to Wireless Extensions. * It needs to include this struct in its private part and use the * standard spy iw_handler. */ /* * Instance specific spy data, i.e. addresses spied and quality for them. */ struct iw_spy_data { /* --- Standard spy support --- */ int spy_number; u_char spy_address[IW_MAX_SPY][ETH_ALEN]; struct iw_quality spy_stat[IW_MAX_SPY]; /* --- Enhanced spy support (event) */ struct iw_quality spy_thr_low; /* Low threshold */ struct iw_quality spy_thr_high; /* High threshold */ u_char spy_thr_under[IW_MAX_SPY]; }; /**************************** PROTOTYPES ****************************/ /* * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). * Those may be called by driver modules. */ /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); #ifdef CONFIG_WEXT_CORE /* flush all previous wext events - if work is done from netdev notifiers */ void wireless_nlevent_flush(void); #else static inline void wireless_nlevent_flush(void) {} #endif /* We may need a function to send a stream of events to user space. * More on that later... */ /************************* INLINE FUNCTIONS *************************/ /* * Function that are so simple that it's more efficient inlining them */ static inline int iwe_stream_lcp_len(struct iw_request_info *info) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) return IW_EV_COMPAT_LCP_LEN; #endif return IW_EV_LCP_LEN; } static inline int iwe_stream_point_len(struct iw_request_info *info) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) return IW_EV_COMPAT_POINT_LEN; #endif return IW_EV_POINT_LEN; } static inline int iwe_stream_event_len_adjust(struct iw_request_info *info, int event_len) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) { event_len -= IW_EV_LCP_LEN; event_len += IW_EV_COMPAT_LCP_LEN; } #endif return event_len; } /*------------------------------------------------------------------*/ /* * Wrapper to add an Wireless Event to a stream of events. */ char *iwe_stream_add_event(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len); static inline char * iwe_stream_add_event_check(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len) { char *res = iwe_stream_add_event(info, stream, ends, iwe, event_len); if (res == stream) return ERR_PTR(-E2BIG); return res; } /*------------------------------------------------------------------*/ /* * Wrapper to add an short Wireless Event containing a pointer to a * stream of events. */ char *iwe_stream_add_point(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra); static inline char * iwe_stream_add_point_check(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra) { char *res = iwe_stream_add_point(info, stream, ends, iwe, extra); if (res == stream) return ERR_PTR(-E2BIG); return res; } /*------------------------------------------------------------------*/ /* * Wrapper to add a value to a Wireless Event in a stream of events. * Be careful, this one is tricky to use properly : * At the first run, you need to have (value = event + IW_EV_LCP_LEN). */ char *iwe_stream_add_value(struct iw_request_info *info, char *event, char *value, char *ends, struct iw_event *iwe, int event_len); #endif /* _IW_HANDLER_H */ |
19 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SR-IPv6 implementation * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #ifndef _NET_SEG6_H #define _NET_SEG6_H #include <linux/net.h> #include <linux/ipv6.h> #include <linux/seg6.h> #include <linux/rhashtable-types.h> static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) { __be32 diff[] = { ~from, to }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from, __be32 *to) { __be32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], to[0], to[1], to[2], to[3], }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } struct seg6_pernet_data { struct mutex lock; struct in6_addr __rcu *tun_src; #ifdef CONFIG_IPV6_SEG6_HMAC struct rhashtable hmac_infos; #endif }; static inline struct seg6_pernet_data *seg6_pernet(struct net *net) { #if IS_ENABLED(CONFIG_IPV6) return net->ipv6.seg6_data; #else return NULL; #endif } extern int seg6_init(void); extern void seg6_exit(void); #ifdef CONFIG_IPV6_SEG6_LWTUNNEL extern int seg6_iptunnel_init(void); extern void seg6_iptunnel_exit(void); extern int seg6_local_init(void); extern void seg6_local_exit(void); #else static inline int seg6_iptunnel_init(void) { return 0; } static inline void seg6_iptunnel_exit(void) {} static inline int seg6_local_init(void) { return 0; } static inline void seg6_local_exit(void) {} #endif extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced); extern struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags); extern void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); /* If the packet which invoked an ICMP error contains an SRH return * the true destination address from within the SRH, otherwise use the * destination address in the IP header. */ static inline const struct in6_addr *seg6_get_daddr(struct sk_buff *skb, struct inet6_skb_parm *opt) { struct ipv6_sr_hdr *srh; if (opt->flags & IP6SKB_SEG6) { srh = (struct ipv6_sr_hdr *)(skb->data + opt->srhoff); return &srh->segments[0]; } return NULL; } #endif |
3 1 2 3 6 5 1 6 6 1 5 5 6 6 3 6 6 6 6 5 5 6 6 2 5 3 6 6 3 3 1 3 8 2 6 6 6 7 1 1 2 1 1 1 2 1 1 1 1 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 | // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/ctype.h> #include <linux/kmod.h> #include <linux/sort.h> #include <linux/delay.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/delayacct.h> #include <linux/pid_namespace.h> #include <linux/cgroupstats.h> #include <linux/fs_parser.h> #include <trace/events/cgroup.h> /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls * Expiring in the middle is a performance problem not a correctness one. * 1 sec should be enough. */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ /* Controllers blocked by the commandline in v1 */ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; /* protects cgroup_subsys->release_agent_path */ static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup1_ssid_disabled(int ssid) { return cgroup_no_v1_mask & (1 << ssid); } static bool cgroup1_subsys_absent(struct cgroup_subsys *ss) { /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */ return ss->legacy_cftypes == NULL && ss->dfl_cftypes; } /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task * @tsk: the task to be attached * * Return: %0 on success or a negative errno code on failure */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { struct cgroup_root *root; int retval = 0; cgroup_lock(); cgroup_attach_lock(true); for_each_root(root) { struct cgroup *from_cgrp; spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); spin_unlock_irq(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } cgroup_attach_unlock(true); cgroup_unlock(); return retval; } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** * cgroup_transfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside * * Locking rules between cgroup_post_fork() and the migration path * guarantee that, if a task is forking while being migrated, the new child * is guaranteed to be either visible in the source cgroup after the * parent's migration is complete or put into the target cgroup. No task * can slip out of migration through forking. * * Return: %0 on success or a negative errno code on failure */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; int ret; if (cgroup_on_dfl(to)) return -EINVAL; ret = cgroup_migrate_vet_dst(to); if (ret) return ret; cgroup_lock(); cgroup_attach_lock(true); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &mgctx); spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_err; /* * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { css_task_iter_start(&from->self, 0, &it); do { task = css_task_iter_next(&it); } while (task && (task->flags & PF_EXITING)); if (task) get_task_struct(task); css_task_iter_end(&it); if (task) { ret = cgroup_migrate(task, false, &mgctx); if (!ret) TRACE_CGROUP_PATH(transfer_tasks, to, task, false); put_task_struct(task); } } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(true); cgroup_unlock(); return ret; } /* * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * */ /* which pidlist file are we talking about? */ enum cgroup_filetype { CGROUP_FILE_PROCS, CGROUP_FILE_TASKS, }; /* * A pidlist is a list of pids that virtually represents the contents of one * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, * a pair (one each for procs, tasks) for each pid namespace that's relevant * to the cgroup. */ struct cgroup_pidlist { /* * used to find which pidlist is wanted. doesn't change as long as * this particular list stays in the list. */ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; /* array of xids */ pid_t *list; /* how many elements the above list has */ int length; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* for delayed destruction */ struct delayed_work destroy_dwork; }; /* * Used to destroy all pidlists lingering waiting for destroy timer. None * should be left afterwards. */ void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) { struct cgroup_pidlist *l, *tmp_l; mutex_lock(&cgrp->pidlist_mutex); list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); mutex_unlock(&cgrp->pidlist_mutex); flush_workqueue(cgroup_pidlist_destroy_wq); BUG_ON(!list_empty(&cgrp->pidlists)); } static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, destroy_dwork); struct cgroup_pidlist *tofree = NULL; mutex_lock(&l->owner->pidlist_mutex); /* * Destroy iff we didn't get queued again. The state won't change * as destroy_dwork can only be queued while locked. */ if (!delayed_work_pending(dwork)) { list_del(&l->links); kvfree(l->list); put_pid_ns(l->key.ns); tofree = l; } mutex_unlock(&l->owner->pidlist_mutex); kfree(tofree); } /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ static int pidlist_uniq(pid_t *list, int length) { int src, dest = 1; /* * we presume the 0th element is unique, so i starts at 1. trivial * edge cases first; no work needs to be done for either */ if (length == 0 || length == 1) return length; /* src and dest walk down the list; dest counts unique elements */ for (src = 1; src < length; src++) { /* find next unique element */ while (list[src] == list[src-1]) { src++; if (src == length) goto after; } /* dest always points to where the next unique element goes */ list[dest] = list[src]; dest++; } after: return dest; } /* * The two pid files - task and cgroup.procs - guaranteed that the result * is sorted, which forced this whole pidlist fiasco. As pid order is * different per namespace, each namespace needs differently sorted list, * making it impossible to use, for example, single rbtree of member tasks * sorted by task pointer. As pidlists can be fairly large, allocating one * per open file is dangerous, so cgroup had to implement shared pool of * pidlists keyed by cgroup and namespace. */ static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ struct pid_namespace *ns = task_active_pid_ns(current); lockdep_assert_held(&cgrp->pidlist_mutex); list_for_each_entry(l, &cgrp->pidlists, links) if (l->key.type == type && l->key.ns == ns) return l; return NULL; } /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. */ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); l = cgroup_pidlist_find(cgrp, type); if (l) return l; /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) return l; INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; /* don't need task_nsproxy() if we're looking at ourself */ l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); return l; } /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct cgroup_pidlist **lp) { pid_t *array; int length; int pid, n = 0; /* used for populating the array */ struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the * caller from the case that the additional cgroup users didn't * show up until sometime later on. */ length = cgroup_task_count(cgrp); array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL); if (!array) return -ENOMEM; /* now, populate the array */ css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ if (type == CGROUP_FILE_PROCS) pid = task_tgid_vnr(tsk); else pid = task_pid_vnr(tsk); if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } css_task_iter_end(&it); length = n; /* now sort & strip out duplicates (tgids or recycled thread PIDs) */ sort(array, length, sizeof(pid_t), cmppid, NULL); length = pidlist_uniq(array, length); l = cgroup_pidlist_find_create(cgrp, type); if (!l) { kvfree(array); return -ENOMEM; } /* store array, freeing old if necessary */ kvfree(l->list); l->list = array; l->length = length; *lp = l; return 0; } /* * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid * in the cgroup->l->list array. */ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to * one more than the last pid shown (or 0 on the first call or * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; int *iter, ret; mutex_lock(&cgrp->pidlist_mutex); /* * !NULL @ctx->procs1.pidlist indicates that this isn't the first * start() after open. If the matching pidlist is around, we can use * that. Look for it. Note that @ctx->procs1.pidlist can't be used * directly. It could already have been destroyed. */ if (ctx->procs1.pidlist) ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ if (!ctx->procs1.pidlist) { ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } l = ctx->procs1.pidlist; if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; if (l->list[mid] == pid) { index = mid; break; } else if (l->list[mid] < pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; *pos = *iter; return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, CGROUP_PIDLIST_DESTROY_DELAY); mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done */ p++; if (p >= end) { (*pos)++; return NULL; } else { *pos = *p; return p; } } static int cgroup_pidlist_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", *(int *)v); return 0; } static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct cgroup *cgrp; struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* * Even if we're attaching all tasks in the thread group, we only need * to check permissions on one of them. Check permissions using the * credentials from file open to protect against inherited fd attacks. */ cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid)) ret = -EACCES; put_cred(tcred); if (ret) goto out_finish; ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, true); } static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, false); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. */ ctx = of->priv; if ((ctx->ns->user_ns != &init_user_ns) || !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); strscpy(cgrp->root->release_agent_path, strstrip(buf), sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); return 0; } static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { seq_puts(seq, "0\n"); return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { return notify_on_release(css->cgroup); } static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } static int cgroup_clone_children_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } /* cgroup core interface files for the legacy hierarchies */ struct cftype cgroup1_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup1_procs_write, }, { .name = "cgroup.clone_children", .read_u64 = cgroup_clone_children_read, .write_u64 = cgroup_clone_children_write, }, { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, { .name = "tasks", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup1_tasks_write, }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = "release_agent", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; /* Display information about each subsystem and each hierarchy */ int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* * Grab the subsystems state racily. No need to add avenue to * cgroup_mutex contention. */ for_each_subsys(ss, i) { if (cgroup1_subsys_absent(ss)) continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); } return 0; } /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. * * Build and fill cgroupstats so that taskstats can export it to user * space. * * Return: %0 on success or a negative errno code on failure */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; /* it should be kernfs_node belonging to cgroupfs and is a directory */ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || kernfs_type(kn) != KERNFS_DIR) return -EINVAL; /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) { rcu_read_unlock(); return -ENOENT; } rcu_read_unlock(); css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (READ_ONCE(tsk->__state)) { case TASK_RUNNING: stats->nr_running++; break; case TASK_INTERRUPTIBLE: stats->nr_sleeping++; break; case TASK_UNINTERRUPTIBLE: stats->nr_uninterruptible++; break; case TASK_STOPPED: stats->nr_stopped++; break; default: if (tsk->in_iowait) stats->nr_io_wait++; break; } } css_task_iter_end(&it); cgroup_put(cgrp); return 0; } void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path * relative to the root of cgroup file system) as the argument. * * Most likely, this user command will try to rmdir this cgroup. * * This races with the possibility that some other task will be * attached to this cgroup before it is removed, or that some other * user task will 'mkdir' a child cgroup of this cgroup. That's ok. * The presumed 'rmdir' will fail quietly if this cgroup is no longer * unused, and this cgroup will be reprieved from its death sentence, * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which * means only wait until the task is successfully execve()'d. The * separate release agent task is forked by call_usermodehelper(), * then control in this thread returns here, without waiting for the * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. */ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; /* snoop agent path and exit early if empty */ if (!cgrp->root->release_agent_path[0]) return; /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf || !agentbuf) goto out_free; spin_lock(&release_agent_path_lock); strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); spin_unlock(&release_agent_path_lock); if (!agentbuf[0]) goto out_free; ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0) goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); out_free: kfree(agentbuf); kfree(pathbuf); } /* * cgroup_rename - Only allow simple rename of directories in place. */ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(new_name_str, '\n')) return -EINVAL; if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (kn->parent != new_parent) return -EIO; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref * protection. Break them before grabbing cgroup_mutex. */ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); cgroup_lock(); ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) TRACE_CGROUP_PATH(rename, cgrp); cgroup_unlock(); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); return ret; } static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); if (root->flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_show_option(seq, "release_agent", root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_show_option(seq, "name", root->name); return 0; } enum cgroup1_param { Opt_all, Opt_clone_children, Opt_cpuset_v2_mode, Opt_name, Opt_none, Opt_noprefix, Opt_release_agent, Opt_xattr, Opt_favordynmods, Opt_nofavordynmods, }; const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("all", Opt_all), fsparam_flag ("clone_children", Opt_clone_children), fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), fsparam_string("name", Opt_name), fsparam_flag ("none", Opt_none), fsparam_flag ("noprefix", Opt_noprefix), fsparam_string("release_agent", Opt_release_agent), fsparam_flag ("xattr", Opt_xattr), fsparam_flag ("favordynmods", Opt_favordynmods), fsparam_flag ("nofavordynmods", Opt_nofavordynmods), {} }; int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_subsys *ss; struct fs_parse_result result; int opt, i; opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { int ret; ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name) || cgroup1_subsys_absent(ss)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", param->key); ctx->subsys_mask |= (1 << i); return 0; } return invalfc(fc, "Unknown subsys name '%s'", param->key); } if (opt < 0) return opt; switch (opt) { case Opt_none: /* Explicitly have no subsystems */ ctx->none = true; break; case Opt_all: ctx->all_ss = true; break; case Opt_noprefix: ctx->flags |= CGRP_ROOT_NOPREFIX; break; case Opt_clone_children: ctx->cpuset_clone_children = true; break; case Opt_cpuset_v2_mode: ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; break; case Opt_xattr: ctx->flags |= CGRP_ROOT_XATTR; break; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_nofavordynmods: ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. */ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; case Opt_name: /* blocked by boot param? */ if (cgroup_no_v1_named) return -ENOENT; /* Can't specify an empty name */ if (!param->size) return invalfc(fc, "Empty name"); if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) return invalfc(fc, "Name too long"); /* Must match [\w.-]+ */ for (i = 0; i < param->size; i++) { char c = param->string[i]; if (isalnum(c)) continue; if ((c == '.') || (c == '-') || (c == '_')) continue; return invalfc(fc, "Invalid name"); } /* Specifying two names is forbidden */ if (ctx->name) return invalfc(fc, "name respecified"); ctx->name = param->string; param->string = NULL; break; } return 0; } static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); u16 mask = U16_MAX; u16 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && !cgroup1_subsys_absent(ss)) enabled |= 1 << i; ctx->subsys_mask &= enabled; /* * In absence of 'none', 'name=' and subsystem name options, * let's default to 'all'. */ if (!ctx->subsys_mask && !ctx->none && !ctx->name) ctx->all_ss = true; if (ctx->all_ss) { /* Mutually exclusive option 'all' + subsystem name */ if (ctx->subsys_mask) return invalfc(fc, "subsys name conflicts with all"); /* 'all' => select all the subsystems */ ctx->subsys_mask = enabled; } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ if (!ctx->subsys_mask && !ctx->name) return invalfc(fc, "Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) return invalfc(fc, "noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ if (ctx->subsys_mask && ctx->none) return invalfc(fc, "none used incorrectly"); return 0; } int cgroup1_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); added_mask = ctx->subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ if ((ctx->flags ^ root->flags) || (ctx->name && strcmp(ctx->name, root->name))) { errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } /* remounting is not allowed for populated hierarchies */ if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (ctx->release_agent) { spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: cgroup_unlock(); return ret; } struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; /* * The guts of cgroup1 mount - find or create cgroup_root to use. * Called with cgroup_mutex held; returns 0 on success, -E... on * error and positive - in case when the candidate is busy dying. * On success it stashes a reference to cgroup_root into given * cgroup_fs_context; that reference is *NOT* counting towards the * cgroup_root refcount. */ static int cgroup1_root_to_use(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; int i, ret; /* First find the desired set of subsystems */ ret = check_cgroupfs_options(fc); if (ret) return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the * dying subsystems. We just need to ensure that the ones * unmounted previously finish dying and don't care about new ones * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) return 1; /* restart */ cgroup_put(&ss->root->cgrp); } for_each_root(root) { bool name_match = false; if (root == &cgrp_dfl_root) continue; /* * If we asked for a name then it must match. Also, if * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ if (ctx->name) { if (strcmp(ctx->name, root->name)) continue; name_match = true; } /* * If we asked for subsystems (or explicitly for no * subsystems) then they must match. */ if ((ctx->subsys_mask || ctx->none) && (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; return -EBUSY; } if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); ctx->root = root; return 0; } /* * No such thing, create a new one. name= matching without subsys * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. */ if (!ctx->subsys_mask && !ctx->none) return invalfc(fc, "No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ if (ctx->ns != &init_cgroup_ns) return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return -ENOMEM; ctx->root = root; init_cgroup_root(ctx); ret = cgroup_setup_root(root, ctx->subsys_mask); if (!ret) cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS); else cgroup_free_root(root); return ret; } int cgroup1_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; /* Check if the caller has permission to mount. */ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); ret = cgroup1_root_to_use(fc); if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) ret = 1; /* restart */ cgroup_unlock(); if (!ret) ret = cgroup_do_get_tree(fc); if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { fc_drop_locked(fc); ret = 1; } if (unlikely(ret > 0)) { msleep(10); return restart_syscall(); } return ret; } /** * task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its * hierarchy ID. * @tsk: The target task * @hierarchy_id: The ID of a cgroup1 hierarchy * * On success, the cgroup is returned. On failure, ERR_PTR is returned. * We limit it to cgroup1 only. */ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id) { struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup_root *root; unsigned long flags; rcu_read_lock(); for_each_root(root) { /* cgroup1 only*/ if (root == &cgrp_dfl_root) continue; if (root->hierarchy_id != hierarchy_id) continue; spin_lock_irqsave(&css_set_lock, flags); cgrp = task_cgroup_from_root(tsk, root); if (!cgrp || !cgroup_tryget(cgrp)) cgrp = ERR_PTR(-ENOENT); spin_unlock_irqrestore(&css_set_lock, flags); break; } rcu_read_unlock(); return cgrp; } static int __init cgroup1_wq_init(void) { /* * Used to destroy pidlists and separate to serve as flush domain. * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 0, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } core_initcall(cgroup1_wq_init); static int __init cgroup_no_v1(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; if (!strcmp(token, "all")) { cgroup_no_v1_mask = U16_MAX; continue; } if (!strcmp(token, "named")) { cgroup_no_v1_named = true; continue; } for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; cgroup_no_v1_mask |= 1 << i; break; } } return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); |
3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_rateest.h> #include <net/netfilter/xt_rateest.h> static bool xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateest_match_info *info = par->matchinfo; struct gnet_stats_rate_est64 sample = {0}; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; gen_estimator_read(&info->est1->rate_est, &sample); if (info->flags & XT_RATEEST_MATCH_DELTA) { bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0; pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0; } else { bps1 = sample.bps; pps1 = sample.pps; } if (info->flags & XT_RATEEST_MATCH_ABS) { bps2 = info->bps2; pps2 = info->pps2; } else { gen_estimator_read(&info->est2->rate_est, &sample); if (info->flags & XT_RATEEST_MATCH_DELTA) { bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0; pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0; } else { bps2 = sample.bps; pps2 = sample.pps; } } switch (info->mode) { case XT_RATEEST_MATCH_LT: if (info->flags & XT_RATEEST_MATCH_BPS) ret &= bps1 < bps2; if (info->flags & XT_RATEEST_MATCH_PPS) ret &= pps1 < pps2; break; case XT_RATEEST_MATCH_GT: if (info->flags & XT_RATEEST_MATCH_BPS) ret &= bps1 > bps2; if (info->flags & XT_RATEEST_MATCH_PPS) ret &= pps1 > pps2; break; case XT_RATEEST_MATCH_EQ: if (info->flags & XT_RATEEST_MATCH_BPS) ret &= bps1 == bps2; if (info->flags & XT_RATEEST_MATCH_PPS) ret &= pps1 == pps2; break; } ret ^= info->flags & XT_RATEEST_MATCH_INVERT ? true : false; return ret; } static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) { struct xt_rateest_match_info *info = par->matchinfo; struct xt_rateest *est1, *est2; int ret = -EINVAL; if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS | XT_RATEEST_MATCH_REL)) != 1) goto err1; if (!(info->flags & (XT_RATEEST_MATCH_BPS | XT_RATEEST_MATCH_PPS))) goto err1; switch (info->mode) { case XT_RATEEST_MATCH_EQ: case XT_RATEEST_MATCH_LT: case XT_RATEEST_MATCH_GT: break; default: goto err1; } ret = -ENOENT; est1 = xt_rateest_lookup(par->net, info->name1); if (!est1) goto err1; est2 = NULL; if (info->flags & XT_RATEEST_MATCH_REL) { est2 = xt_rateest_lookup(par->net, info->name2); if (!est2) goto err2; } info->est1 = est1; info->est2 = est2; return 0; err2: xt_rateest_put(par->net, est1); err1: return ret; } static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) { struct xt_rateest_match_info *info = par->matchinfo; xt_rateest_put(par->net, info->est1); if (info->est2) xt_rateest_put(par->net, info->est2); } static struct xt_match xt_rateest_mt_reg __read_mostly = { .name = "rateest", .revision = 0, .family = NFPROTO_UNSPEC, .match = xt_rateest_mt, .checkentry = xt_rateest_mt_checkentry, .destroy = xt_rateest_mt_destroy, .matchsize = sizeof(struct xt_rateest_match_info), .usersize = offsetof(struct xt_rateest_match_info, est1), .me = THIS_MODULE, }; static int __init xt_rateest_mt_init(void) { return xt_register_match(&xt_rateest_mt_reg); } static void __exit xt_rateest_mt_fini(void) { xt_unregister_match(&xt_rateest_mt_reg); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("xtables rate estimator match"); MODULE_ALIAS("ipt_rateest"); MODULE_ALIAS("ip6t_rateest"); module_init(xt_rateest_mt_init); module_exit(xt_rateest_mt_fini); |
463 463 466 126 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/srcu.h> #include <linux/interval_tree.h> struct mmu_notifier_subscriptions; struct mmu_notifier; struct mmu_notifier_range; struct mmu_interval_notifier; /** * enum mmu_notifier_event - reason for the mmu notifier callback * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that * move the range * * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like * madvise() or replacing a page by another one, ...). * * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range * ie using the vma access permission (vm_page_prot) to update the whole range * is enough no need to inspect changes to the CPU page table (mprotect() * syscall) * * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for * pages in the range so to mirror those changes the user must inspect the CPU * page table (from the end callback). * * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same * access flags). User should soft dirty the page in the end callback to make * sure that anyone relying on soft dirtiness catch pages that might be written * through non CPU mappings. * * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal * that the mm refcount is zero and the range is no longer accessible. * * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no * longer have exclusive access to the page. When sent during creation of an * exclusive range the owner will be initialised to the value provided by the * caller of make_device_exclusive_range(), otherwise the owner will be NULL. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, MMU_NOTIFY_CLEAR, MMU_NOTIFY_PROTECTION_VMA, MMU_NOTIFY_PROTECTION_PAGE, MMU_NOTIFY_SOFT_DIRTY, MMU_NOTIFY_RELEASE, MMU_NOTIFY_MIGRATE, MMU_NOTIFY_EXCLUSIVE, }; #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_ops { /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier * methods (the ones invoked outside the mm context) and it * should tear down all secondary mmu mappings and freeze the * secondary mmu. If this method isn't implemented you've to * be sure that nothing could possibly write to the pages * through the secondary mmu by the time the last thread with * tsk->mm == mm exits. * * As side note: the pages freed after ->release returns could * be immediately reallocated by the gart at an alias physical * address with a different cache model, so if ->release isn't * implemented because all _software_ driven memory accesses * through the secondary mmu are terminated by the time the * last thread of this mm quits, you've also to be sure that * speculative _hardware_ operations can't allocate dirty * cachelines in the cpu that could not be snooped and made * coherent with the other read and write operations happening * through the gart alias address, so leading to memory * corruption. */ void (*release)(struct mmu_notifier *subscription, struct mm_struct *mm); /* * clear_flush_young is called after the VM is * test-and-clearing the young/accessed bitflag in the * pte. This way the VM will provide proper aging to the * accesses to the page through the secondary MMUs and not * only to the ones through the Linux pte. * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ int (*clear_flush_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. */ int (*clear_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * test_young is called to check the young/accessed bitflag in * the secondary pte. This is used to know if the page is * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ int (*test_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address); /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_lock and/or the * locks protecting the reverse maps are held. If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). * * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to * invalidate_range_begin/end for the whole duration of the * invalidate_range_begin/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. * * invalidate_range_end() is called when all pages in the * range have been unmapped and the pages have been freed by * the VM. * * The VM will remove the page table entries and potentially * the page between invalidate_range_start() and * invalidate_range_end(). If the page must not be freed * because of pending I/O or other circumstances then the * invalidate_range_start() callback (or the initial mapping * by the driver) must make sure that the refcount is kept * elevated. * * If the driver increases the refcount when the pages are * initially mapped into an address space then either * invalidate_range_start() or invalidate_range_end() may * decrease the refcount. If the refcount is decreased on * invalidate_range_start() then the VM can free pages as page * table entries are removed. If the refcount is only * dropped on invalidate_range_end() then the driver itself * will drop the last refcount but it must take care to flush * any secondary tlb before doing the final free on the * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN if sleeping would be required. * 0 should be returned otherwise. Please note that notifiers that can * fail invalidate_range_start are not allowed to implement * invalidate_range_end, as there is no mechanism for informing the * notifier that its start failed. */ int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); void (*invalidate_range_end)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); /* * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB * which shares page-tables with the CPU. The * invalidate_range_start()/end() callbacks should not be implemented as * invalidate_secondary_tlbs() already catches the points in time when * an external TLB needs to be flushed. * * This requires arch_invalidate_secondary_tlbs() to be called while * holding the ptl spin-lock and therefore this callback is not allowed * to sleep. * * This is called by architecture code whenever invalidating a TLB * entry. It is assumed that any secondary TLB has the same rules for * when invalidations are required. If this is not the case architecture * code will need to call this explicitly when required for secondary * TLB invalidation. */ void (*arch_invalidate_secondary_tlbs)( struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * These callbacks are used with the get/put interface to manage the * lifetime of the mmu_notifier memory. alloc_notifier() returns a new * notifier for use with the mm. * * free_notifier() is only called after the mmu_notifier has been * fully put, calls to any ops callback are prevented and no ops * callbacks are currently running. It is called from a SRCU callback * and cannot sleep. */ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); void (*free_notifier)(struct mmu_notifier *subscription); }; /* * The notifier chains are protected by mmap_lock and/or the reverse map * semaphores. Notifier chains are only changed when all reverse maps and * the mmap_lock locks are taken. * * Therefore notifier chains can only be traversed when either * * 1. mmap_lock is held. * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). * 3. No other concurrent thread can access the list (release) */ struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; struct mm_struct *mm; struct rcu_head rcu; unsigned int users; }; /** * struct mmu_interval_notifier_ops * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. */ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); }; struct mmu_interval_notifier { struct interval_tree_node interval_tree; const struct mmu_interval_notifier_ops *ops; struct mm_struct *mm; struct hlist_node deferred_item; unsigned long invalidate_seq; }; #ifdef CONFIG_MMU_NOTIFIER #ifdef CONFIG_LOCKDEP extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; enum mmu_notifier_event event; void *owner; }; static inline int mm_has_notifiers(struct mm_struct *mm) { return unlikely(mm->notifier_subscriptions); } struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mm_struct *mm); static inline struct mmu_notifier * mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { struct mmu_notifier *ret; mmap_write_lock(mm); ret = mmu_notifier_get_locked(ops, mm); mmap_write_unlock(mm); return ret; } void mmu_notifier_put(struct mmu_notifier *subscription); void mmu_notifier_synchronize(void); extern int mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *subscription, struct mm_struct *mm); unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub); int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); int mmu_interval_notifier_insert_locked( struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence * @interval_sub - The subscription passed to invalidate * @cur_seq - The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call * mmu_interval_read_retry(). It updates the sequence number for later use by * mmu_interval_read_retry(). The provided cur_seq will always be odd. * * If the caller does not call mmu_interval_read_begin() or * mmu_interval_read_retry() then this call is not required. */ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, unsigned long cur_seq) { WRITE_ONCE(interval_sub->invalidate_seq, cur_seq); } /** * mmu_interval_read_retry - End a read side critical section against a VA range * interval_sub: The subscription * seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). * * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. * * Returns true if an invalidation collided with this critical section, and * the caller should retry. */ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { return interval_sub->invalidate_seq != seq; } /** * mmu_interval_check_retry - Test if a collision has occurred * interval_sub: The subscription * seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() * and mmu_interval_read_retry(). A return of true indicates an invalidation * has collided with this critical region and a future * mmu_interval_read_retry() will return true. * * False is not reliable and only suggests a collision may not have * occurred. It can be called many times and does not have to hold the user * provided lock. * * This call can be used as part of loops and other expensive operations to * expedite a retry. */ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ return READ_ONCE(interval_sub->invalidate_seq) != seq; } extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_release(mm); } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); return 0; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { might_sleep(); lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); } /* * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it * can return an error if a notifier can't proceed without blocking, in which * case you're not allowed to modify PTEs in the specified range. * * This is mainly intended for OOM handling. */ static inline int __must_check mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { int ret = 0; lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; ret = __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); return ret; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { if (mmu_notifier_range_blockable(range)) might_sleep(); if (mm_has_notifiers(range->mm)) __mmu_notifier_invalidate_range_end(range); } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { mm->notifier_subscriptions = NULL; } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_subscriptions_destroy(mm); } static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, struct mm_struct *mm, unsigned long start, unsigned long end) { range->event = event; range->mm = mm; range->start = start; range->end = end; range->flags = flags; } static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, struct mm_struct *mm, unsigned long start, unsigned long end, void *owner) { mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PAGE_SIZE); \ __young; \ }) #define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PMD_SIZE); \ __young; \ }) #define ptep_clear_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PAGE_SIZE); \ __young; \ }) #define pmdp_clear_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PMD_SIZE); \ __young; \ }) #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { unsigned long start; unsigned long end; }; static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { range->start = start; range->end = end; } #define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) #define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return true; } static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; } static inline void mmu_notifier_release(struct mm_struct *mm) { } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { return 0; } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { return 0; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { } #define mmu_notifier_range_update_to_read_only(r) false #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush static inline void mmu_notifier_synchronize(void) { } #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */ |
1 1 1 1 1 1 1 8 8 8 1 7 7 7 1 10 9 10 10 10 1 9 2 8 8 10 8 1 4 1 1 1 3 2 2 2 2 2 1 1 2 2 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 3 2 1 5 2 1 1 1 1 1 1 3 2 1 1 1 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 | // SPDX-License-Identifier: GPL-2.0-only /* * HWSIM IEEE 802.15.4 interface * * (C) 2018 Mojatau, Alexander Aring <aring@mojatau.com> * Copyright 2007-2012 Siemens AG * * Based on fakelb, original Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/module.h> #include <linux/timer.h> #include <linux/platform_device.h> #include <linux/rtnetlink.h> #include <linux/netdevice.h> #include <linux/device.h> #include <linux/spinlock.h> #include <net/ieee802154_netdev.h> #include <net/mac802154.h> #include <net/cfg802154.h> #include <net/genetlink.h> #include "mac802154_hwsim.h" MODULE_DESCRIPTION("Software simulator of IEEE 802.15.4 radio(s) for mac802154"); MODULE_LICENSE("GPL"); static LIST_HEAD(hwsim_phys); static DEFINE_MUTEX(hwsim_phys_lock); static struct platform_device *mac802154hwsim_dev; /* MAC802154_HWSIM netlink family */ static struct genl_family hwsim_genl_family; static int hwsim_radio_idx; enum hwsim_multicast_groups { HWSIM_MCGRP_CONFIG, }; static const struct genl_multicast_group hwsim_mcgrps[] = { [HWSIM_MCGRP_CONFIG] = { .name = "config", }, }; struct hwsim_pib { u8 page; u8 channel; struct ieee802154_hw_addr_filt filt; enum ieee802154_filtering_level filt_level; struct rcu_head rcu; }; struct hwsim_edge_info { u8 lqi; struct rcu_head rcu; }; struct hwsim_edge { struct hwsim_phy *endpoint; struct hwsim_edge_info __rcu *info; struct list_head list; struct rcu_head rcu; }; struct hwsim_phy { struct ieee802154_hw *hw; u32 idx; struct hwsim_pib __rcu *pib; bool suspended; struct list_head edges; struct list_head list; }; static int hwsim_add_one(struct genl_info *info, struct device *dev, bool init); static void hwsim_del(struct hwsim_phy *phy); static int hwsim_hw_ed(struct ieee802154_hw *hw, u8 *level) { *level = 0xbe; return 0; } static int hwsim_update_pib(struct ieee802154_hw *hw, u8 page, u8 channel, struct ieee802154_hw_addr_filt *filt, enum ieee802154_filtering_level filt_level) { struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib, *pib_old; pib = kzalloc(sizeof(*pib), GFP_ATOMIC); if (!pib) return -ENOMEM; pib_old = rtnl_dereference(phy->pib); pib->page = page; pib->channel = channel; pib->filt.short_addr = filt->short_addr; pib->filt.pan_id = filt->pan_id; pib->filt.ieee_addr = filt->ieee_addr; pib->filt.pan_coord = filt->pan_coord; pib->filt_level = filt_level; rcu_assign_pointer(phy->pib, pib); kfree_rcu(pib_old, rcu); return 0; } static int hwsim_hw_channel(struct ieee802154_hw *hw, u8 page, u8 channel) { struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; int ret; rcu_read_lock(); pib = rcu_dereference(phy->pib); ret = hwsim_update_pib(hw, page, channel, &pib->filt, pib->filt_level); rcu_read_unlock(); return ret; } static int hwsim_hw_addr_filt(struct ieee802154_hw *hw, struct ieee802154_hw_addr_filt *filt, unsigned long changed) { struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; int ret; rcu_read_lock(); pib = rcu_dereference(phy->pib); ret = hwsim_update_pib(hw, pib->page, pib->channel, filt, pib->filt_level); rcu_read_unlock(); return ret; } static void hwsim_hw_receive(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi) { struct ieee802154_hdr hdr; struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; rcu_read_lock(); pib = rcu_dereference(phy->pib); if (!pskb_may_pull(skb, 3)) { dev_dbg(hw->parent, "invalid frame\n"); goto drop; } memcpy(&hdr, skb->data, 3); /* Level 4 filtering: Frame fields validity */ if (pib->filt_level == IEEE802154_FILTERING_4_FRAME_FIELDS) { /* a) Drop reserved frame types */ switch (mac_cb(skb)->type) { case IEEE802154_FC_TYPE_BEACON: case IEEE802154_FC_TYPE_DATA: case IEEE802154_FC_TYPE_ACK: case IEEE802154_FC_TYPE_MAC_CMD: break; default: dev_dbg(hw->parent, "unrecognized frame type 0x%x\n", mac_cb(skb)->type); goto drop; } /* b) Drop reserved frame versions */ switch (hdr.fc.version) { case IEEE802154_2003_STD: case IEEE802154_2006_STD: case IEEE802154_STD: break; default: dev_dbg(hw->parent, "unrecognized frame version 0x%x\n", hdr.fc.version); goto drop; } /* c) PAN ID constraints */ if ((mac_cb(skb)->dest.mode == IEEE802154_ADDR_LONG || mac_cb(skb)->dest.mode == IEEE802154_ADDR_SHORT) && mac_cb(skb)->dest.pan_id != pib->filt.pan_id && mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST)) { dev_dbg(hw->parent, "unrecognized PAN ID %04x\n", le16_to_cpu(mac_cb(skb)->dest.pan_id)); goto drop; } /* d1) Short address constraints */ if (mac_cb(skb)->dest.mode == IEEE802154_ADDR_SHORT && mac_cb(skb)->dest.short_addr != pib->filt.short_addr && mac_cb(skb)->dest.short_addr != cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { dev_dbg(hw->parent, "unrecognized short address %04x\n", le16_to_cpu(mac_cb(skb)->dest.short_addr)); goto drop; } /* d2) Extended address constraints */ if (mac_cb(skb)->dest.mode == IEEE802154_ADDR_LONG && mac_cb(skb)->dest.extended_addr != pib->filt.ieee_addr) { dev_dbg(hw->parent, "unrecognized long address 0x%016llx\n", mac_cb(skb)->dest.extended_addr); goto drop; } /* d4) Specific PAN coordinator case (no parent) */ if ((mac_cb(skb)->type == IEEE802154_FC_TYPE_DATA || mac_cb(skb)->type == IEEE802154_FC_TYPE_MAC_CMD) && mac_cb(skb)->dest.mode == IEEE802154_ADDR_NONE) { dev_dbg(hw->parent, "relaying is not supported\n"); goto drop; } /* e) Beacon frames follow specific PAN ID rules */ if (mac_cb(skb)->type == IEEE802154_FC_TYPE_BEACON && pib->filt.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST) && mac_cb(skb)->dest.pan_id != pib->filt.pan_id) { dev_dbg(hw->parent, "invalid beacon PAN ID %04x\n", le16_to_cpu(mac_cb(skb)->dest.pan_id)); goto drop; } } rcu_read_unlock(); ieee802154_rx_irqsafe(hw, skb, lqi); return; drop: rcu_read_unlock(); kfree_skb(skb); } static int hwsim_hw_xmit(struct ieee802154_hw *hw, struct sk_buff *skb) { struct hwsim_phy *current_phy = hw->priv; struct hwsim_pib *current_pib, *endpoint_pib; struct hwsim_edge_info *einfo; struct hwsim_edge *e; WARN_ON(current_phy->suspended); rcu_read_lock(); current_pib = rcu_dereference(current_phy->pib); list_for_each_entry_rcu(e, ¤t_phy->edges, list) { /* Can be changed later in rx_irqsafe, but this is only a * performance tweak. Received radio should drop the frame * in mac802154 stack anyway... so we don't need to be * 100% of locking here to check on suspended */ if (e->endpoint->suspended) continue; endpoint_pib = rcu_dereference(e->endpoint->pib); if (current_pib->page == endpoint_pib->page && current_pib->channel == endpoint_pib->channel) { struct sk_buff *newskb = pskb_copy(skb, GFP_ATOMIC); einfo = rcu_dereference(e->info); if (newskb) hwsim_hw_receive(e->endpoint->hw, newskb, einfo->lqi); } } rcu_read_unlock(); ieee802154_xmit_complete(hw, skb, false); return 0; } static int hwsim_hw_start(struct ieee802154_hw *hw) { struct hwsim_phy *phy = hw->priv; phy->suspended = false; return 0; } static void hwsim_hw_stop(struct ieee802154_hw *hw) { struct hwsim_phy *phy = hw->priv; phy->suspended = true; } static int hwsim_set_promiscuous_mode(struct ieee802154_hw *hw, const bool on) { enum ieee802154_filtering_level filt_level; struct hwsim_phy *phy = hw->priv; struct hwsim_pib *pib; int ret; if (on) filt_level = IEEE802154_FILTERING_NONE; else filt_level = IEEE802154_FILTERING_4_FRAME_FIELDS; rcu_read_lock(); pib = rcu_dereference(phy->pib); ret = hwsim_update_pib(hw, pib->page, pib->channel, &pib->filt, filt_level); rcu_read_unlock(); return ret; } static const struct ieee802154_ops hwsim_ops = { .owner = THIS_MODULE, .xmit_async = hwsim_hw_xmit, .ed = hwsim_hw_ed, .set_channel = hwsim_hw_channel, .start = hwsim_hw_start, .stop = hwsim_hw_stop, .set_promiscuous_mode = hwsim_set_promiscuous_mode, .set_hw_addr_filt = hwsim_hw_addr_filt, }; static int hwsim_new_radio_nl(struct sk_buff *msg, struct genl_info *info) { return hwsim_add_one(info, &mac802154hwsim_dev->dev, false); } static int hwsim_del_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct hwsim_phy *phy, *tmp; s64 idx = -1; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]) return -EINVAL; idx = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); mutex_lock(&hwsim_phys_lock); list_for_each_entry_safe(phy, tmp, &hwsim_phys, list) { if (idx == phy->idx) { hwsim_del(phy); mutex_unlock(&hwsim_phys_lock); return 0; } } mutex_unlock(&hwsim_phys_lock); return -ENODEV; } static int append_radio_msg(struct sk_buff *skb, struct hwsim_phy *phy) { struct nlattr *nl_edges, *nl_edge; struct hwsim_edge_info *einfo; struct hwsim_edge *e; int ret; ret = nla_put_u32(skb, MAC802154_HWSIM_ATTR_RADIO_ID, phy->idx); if (ret < 0) return ret; rcu_read_lock(); if (list_empty(&phy->edges)) { rcu_read_unlock(); return 0; } nl_edges = nla_nest_start_noflag(skb, MAC802154_HWSIM_ATTR_RADIO_EDGES); if (!nl_edges) { rcu_read_unlock(); return -ENOBUFS; } list_for_each_entry_rcu(e, &phy->edges, list) { nl_edge = nla_nest_start_noflag(skb, MAC802154_HWSIM_ATTR_RADIO_EDGE); if (!nl_edge) { rcu_read_unlock(); nla_nest_cancel(skb, nl_edges); return -ENOBUFS; } ret = nla_put_u32(skb, MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID, e->endpoint->idx); if (ret < 0) { rcu_read_unlock(); nla_nest_cancel(skb, nl_edge); nla_nest_cancel(skb, nl_edges); return ret; } einfo = rcu_dereference(e->info); ret = nla_put_u8(skb, MAC802154_HWSIM_EDGE_ATTR_LQI, einfo->lqi); if (ret < 0) { rcu_read_unlock(); nla_nest_cancel(skb, nl_edge); nla_nest_cancel(skb, nl_edges); return ret; } nla_nest_end(skb, nl_edge); } rcu_read_unlock(); nla_nest_end(skb, nl_edges); return 0; } static int hwsim_get_radio(struct sk_buff *skb, struct hwsim_phy *phy, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; int res; hdr = genlmsg_put(skb, portid, seq, &hwsim_genl_family, flags, MAC802154_HWSIM_CMD_GET_RADIO); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); res = append_radio_msg(skb, phy); if (res < 0) goto out_err; genlmsg_end(skb, hdr); return 0; out_err: genlmsg_cancel(skb, hdr); return res; } static int hwsim_get_radio_nl(struct sk_buff *msg, struct genl_info *info) { struct hwsim_phy *phy; struct sk_buff *skb; int idx, res = -ENODEV; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]) return -EINVAL; idx = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); mutex_lock(&hwsim_phys_lock); list_for_each_entry(phy, &hwsim_phys, list) { if (phy->idx != idx) continue; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) { res = -ENOMEM; goto out_err; } res = hwsim_get_radio(skb, phy, info->snd_portid, info->snd_seq, NULL, 0); if (res < 0) { nlmsg_free(skb); goto out_err; } res = genlmsg_reply(skb, info); break; } out_err: mutex_unlock(&hwsim_phys_lock); return res; } static int hwsim_dump_radio_nl(struct sk_buff *skb, struct netlink_callback *cb) { int idx = cb->args[0]; struct hwsim_phy *phy; int res; mutex_lock(&hwsim_phys_lock); if (idx == hwsim_radio_idx) goto done; list_for_each_entry(phy, &hwsim_phys, list) { if (phy->idx < idx) continue; res = hwsim_get_radio(skb, phy, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (res < 0) break; idx = phy->idx + 1; } cb->args[0] = idx; done: mutex_unlock(&hwsim_phys_lock); return skb->len; } /* caller need to held hwsim_phys_lock */ static struct hwsim_phy *hwsim_get_radio_by_id(uint32_t idx) { struct hwsim_phy *phy; list_for_each_entry(phy, &hwsim_phys, list) { if (phy->idx == idx) return phy; } return NULL; } static const struct nla_policy hwsim_edge_policy[MAC802154_HWSIM_EDGE_ATTR_MAX + 1] = { [MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID] = { .type = NLA_U32 }, [MAC802154_HWSIM_EDGE_ATTR_LQI] = { .type = NLA_U8 }, }; static struct hwsim_edge *hwsim_alloc_edge(struct hwsim_phy *endpoint, u8 lqi) { struct hwsim_edge_info *einfo; struct hwsim_edge *e; e = kzalloc(sizeof(*e), GFP_KERNEL); if (!e) return NULL; einfo = kzalloc(sizeof(*einfo), GFP_KERNEL); if (!einfo) { kfree(e); return NULL; } einfo->lqi = 0xff; rcu_assign_pointer(e->info, einfo); e->endpoint = endpoint; return e; } static void hwsim_free_edge(struct hwsim_edge *e) { struct hwsim_edge_info *einfo; rcu_read_lock(); einfo = rcu_dereference(e->info); rcu_read_unlock(); kfree_rcu(einfo, rcu); kfree_rcu(e, rcu); } static int hwsim_new_edge_nl(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; struct hwsim_phy *phy_v0, *phy_v1; struct hwsim_edge *e; u32 v0, v1; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL)) return -EINVAL; if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]) return -EINVAL; v0 = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); v1 = nla_get_u32(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]); if (v0 == v1) return -EINVAL; mutex_lock(&hwsim_phys_lock); phy_v0 = hwsim_get_radio_by_id(v0); if (!phy_v0) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } phy_v1 = hwsim_get_radio_by_id(v1); if (!phy_v1) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } rcu_read_lock(); list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { mutex_unlock(&hwsim_phys_lock); rcu_read_unlock(); return -EEXIST; } } rcu_read_unlock(); e = hwsim_alloc_edge(phy_v1, 0xff); if (!e) { mutex_unlock(&hwsim_phys_lock); return -ENOMEM; } list_add_rcu(&e->list, &phy_v0->edges); /* wait until changes are done under hwsim_phys_lock lock * should prevent of calling this function twice while * edges list has not the changes yet. */ synchronize_rcu(); mutex_unlock(&hwsim_phys_lock); return 0; } static int hwsim_del_edge_nl(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; struct hwsim_phy *phy_v0; struct hwsim_edge *e; u32 v0, v1; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL)) return -EINVAL; if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]) return -EINVAL; v0 = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); v1 = nla_get_u32(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]); mutex_lock(&hwsim_phys_lock); phy_v0 = hwsim_get_radio_by_id(v0); if (!phy_v0) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } rcu_read_lock(); list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { rcu_read_unlock(); list_del_rcu(&e->list); hwsim_free_edge(e); /* same again - wait until list changes are done */ synchronize_rcu(); mutex_unlock(&hwsim_phys_lock); return 0; } } rcu_read_unlock(); mutex_unlock(&hwsim_phys_lock); return -ENOENT; } static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) { struct nlattr *edge_attrs[MAC802154_HWSIM_EDGE_ATTR_MAX + 1]; struct hwsim_edge_info *einfo, *einfo_old; struct hwsim_phy *phy_v0; struct hwsim_edge *e; u32 v0, v1; u8 lqi; if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL)) return -EINVAL; if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID] || !edge_attrs[MAC802154_HWSIM_EDGE_ATTR_LQI]) return -EINVAL; v0 = nla_get_u32(info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID]); v1 = nla_get_u32(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID]); lqi = nla_get_u8(edge_attrs[MAC802154_HWSIM_EDGE_ATTR_LQI]); mutex_lock(&hwsim_phys_lock); phy_v0 = hwsim_get_radio_by_id(v0); if (!phy_v0) { mutex_unlock(&hwsim_phys_lock); return -ENOENT; } einfo = kzalloc(sizeof(*einfo), GFP_KERNEL); if (!einfo) { mutex_unlock(&hwsim_phys_lock); return -ENOMEM; } rcu_read_lock(); list_for_each_entry_rcu(e, &phy_v0->edges, list) { if (e->endpoint->idx == v1) { einfo->lqi = lqi; einfo_old = rcu_replace_pointer(e->info, einfo, lockdep_is_held(&hwsim_phys_lock)); rcu_read_unlock(); kfree_rcu(einfo_old, rcu); mutex_unlock(&hwsim_phys_lock); return 0; } } rcu_read_unlock(); kfree(einfo); mutex_unlock(&hwsim_phys_lock); return -ENOENT; } /* MAC802154_HWSIM netlink policy */ static const struct nla_policy hwsim_genl_policy[MAC802154_HWSIM_ATTR_MAX + 1] = { [MAC802154_HWSIM_ATTR_RADIO_ID] = { .type = NLA_U32 }, [MAC802154_HWSIM_ATTR_RADIO_EDGE] = { .type = NLA_NESTED }, [MAC802154_HWSIM_ATTR_RADIO_EDGES] = { .type = NLA_NESTED }, }; /* Generic Netlink operations array */ static const struct genl_small_ops hwsim_nl_ops[] = { { .cmd = MAC802154_HWSIM_CMD_NEW_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_new_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_DEL_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_del_radio_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_GET_RADIO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_get_radio_nl, .dumpit = hwsim_dump_radio_nl, }, { .cmd = MAC802154_HWSIM_CMD_NEW_EDGE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_new_edge_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_DEL_EDGE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_del_edge_nl, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MAC802154_HWSIM_CMD_SET_EDGE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = hwsim_set_edge_lqi, .flags = GENL_UNS_ADMIN_PERM, }, }; static struct genl_family hwsim_genl_family __ro_after_init = { .name = "MAC802154_HWSIM", .version = 1, .maxattr = MAC802154_HWSIM_ATTR_MAX, .policy = hwsim_genl_policy, .module = THIS_MODULE, .small_ops = hwsim_nl_ops, .n_small_ops = ARRAY_SIZE(hwsim_nl_ops), .resv_start_op = MAC802154_HWSIM_CMD_NEW_EDGE + 1, .mcgrps = hwsim_mcgrps, .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps), }; static void hwsim_mcast_config_msg(struct sk_buff *mcast_skb, struct genl_info *info) { if (info) genl_notify(&hwsim_genl_family, mcast_skb, info, HWSIM_MCGRP_CONFIG, GFP_KERNEL); else genlmsg_multicast(&hwsim_genl_family, mcast_skb, 0, HWSIM_MCGRP_CONFIG, GFP_KERNEL); } static void hwsim_mcast_new_radio(struct genl_info *info, struct hwsim_phy *phy) { struct sk_buff *mcast_skb; void *data; mcast_skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!mcast_skb) return; data = genlmsg_put(mcast_skb, 0, 0, &hwsim_genl_family, 0, MAC802154_HWSIM_CMD_NEW_RADIO); if (!data) goto out_err; if (append_radio_msg(mcast_skb, phy) < 0) goto out_err; genlmsg_end(mcast_skb, data); hwsim_mcast_config_msg(mcast_skb, info); return; out_err: genlmsg_cancel(mcast_skb, data); nlmsg_free(mcast_skb); } static void hwsim_edge_unsubscribe_me(struct hwsim_phy *phy) { struct hwsim_phy *tmp; struct hwsim_edge *e; rcu_read_lock(); /* going to all phy edges and remove phy from it */ list_for_each_entry(tmp, &hwsim_phys, list) { list_for_each_entry_rcu(e, &tmp->edges, list) { if (e->endpoint->idx == phy->idx) { list_del_rcu(&e->list); hwsim_free_edge(e); } } } rcu_read_unlock(); synchronize_rcu(); } static int hwsim_subscribe_all_others(struct hwsim_phy *phy) { struct hwsim_phy *sub; struct hwsim_edge *e; list_for_each_entry(sub, &hwsim_phys, list) { e = hwsim_alloc_edge(sub, 0xff); if (!e) goto me_fail; list_add_rcu(&e->list, &phy->edges); } list_for_each_entry(sub, &hwsim_phys, list) { e = hwsim_alloc_edge(phy, 0xff); if (!e) goto sub_fail; list_add_rcu(&e->list, &sub->edges); } return 0; sub_fail: hwsim_edge_unsubscribe_me(phy); me_fail: rcu_read_lock(); list_for_each_entry_rcu(e, &phy->edges, list) { list_del_rcu(&e->list); hwsim_free_edge(e); } rcu_read_unlock(); return -ENOMEM; } static int hwsim_add_one(struct genl_info *info, struct device *dev, bool init) { struct ieee802154_hw *hw; struct hwsim_phy *phy; struct hwsim_pib *pib; int idx; int err; idx = hwsim_radio_idx++; hw = ieee802154_alloc_hw(sizeof(*phy), &hwsim_ops); if (!hw) return -ENOMEM; phy = hw->priv; phy->hw = hw; /* 868 MHz BPSK 802.15.4-2003 */ hw->phy->supported.channels[0] |= 1; /* 915 MHz BPSK 802.15.4-2003 */ hw->phy->supported.channels[0] |= 0x7fe; /* 2.4 GHz O-QPSK 802.15.4-2003 */ hw->phy->supported.channels[0] |= 0x7FFF800; /* 868 MHz ASK 802.15.4-2006 */ hw->phy->supported.channels[1] |= 1; /* 915 MHz ASK 802.15.4-2006 */ hw->phy->supported.channels[1] |= 0x7fe; /* 868 MHz O-QPSK 802.15.4-2006 */ hw->phy->supported.channels[2] |= 1; /* 915 MHz O-QPSK 802.15.4-2006 */ hw->phy->supported.channels[2] |= 0x7fe; /* 2.4 GHz CSS 802.15.4a-2007 */ hw->phy->supported.channels[3] |= 0x3fff; /* UWB Sub-gigahertz 802.15.4a-2007 */ hw->phy->supported.channels[4] |= 1; /* UWB Low band 802.15.4a-2007 */ hw->phy->supported.channels[4] |= 0x1e; /* UWB High band 802.15.4a-2007 */ hw->phy->supported.channels[4] |= 0xffe0; /* 750 MHz O-QPSK 802.15.4c-2009 */ hw->phy->supported.channels[5] |= 0xf; /* 750 MHz MPSK 802.15.4c-2009 */ hw->phy->supported.channels[5] |= 0xf0; /* 950 MHz BPSK 802.15.4d-2009 */ hw->phy->supported.channels[6] |= 0x3ff; /* 950 MHz GFSK 802.15.4d-2009 */ hw->phy->supported.channels[6] |= 0x3ffc00; ieee802154_random_extended_addr(&hw->phy->perm_extended_addr); /* hwsim phy channel 13 as default */ hw->phy->current_channel = 13; pib = kzalloc(sizeof(*pib), GFP_KERNEL); if (!pib) { err = -ENOMEM; goto err_pib; } pib->channel = 13; pib->filt.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); pib->filt.pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); rcu_assign_pointer(phy->pib, pib); phy->idx = idx; INIT_LIST_HEAD(&phy->edges); hw->flags = IEEE802154_HW_PROMISCUOUS; hw->parent = dev; err = ieee802154_register_hw(hw); if (err) goto err_reg; mutex_lock(&hwsim_phys_lock); if (init) { err = hwsim_subscribe_all_others(phy); if (err < 0) { mutex_unlock(&hwsim_phys_lock); goto err_subscribe; } } list_add_tail(&phy->list, &hwsim_phys); mutex_unlock(&hwsim_phys_lock); hwsim_mcast_new_radio(info, phy); return idx; err_subscribe: ieee802154_unregister_hw(phy->hw); err_reg: kfree(pib); err_pib: ieee802154_free_hw(phy->hw); return err; } static void hwsim_del(struct hwsim_phy *phy) { struct hwsim_pib *pib; struct hwsim_edge *e; hwsim_edge_unsubscribe_me(phy); list_del(&phy->list); rcu_read_lock(); list_for_each_entry_rcu(e, &phy->edges, list) { list_del_rcu(&e->list); hwsim_free_edge(e); } pib = rcu_dereference(phy->pib); rcu_read_unlock(); kfree_rcu(pib, rcu); ieee802154_unregister_hw(phy->hw); ieee802154_free_hw(phy->hw); } static int hwsim_probe(struct platform_device *pdev) { struct hwsim_phy *phy, *tmp; int err, i; for (i = 0; i < 2; i++) { err = hwsim_add_one(NULL, &pdev->dev, true); if (err < 0) goto err_slave; } dev_info(&pdev->dev, "Added 2 mac802154 hwsim hardware radios\n"); return 0; err_slave: mutex_lock(&hwsim_phys_lock); list_for_each_entry_safe(phy, tmp, &hwsim_phys, list) hwsim_del(phy); mutex_unlock(&hwsim_phys_lock); return err; } static void hwsim_remove(struct platform_device *pdev) { struct hwsim_phy *phy, *tmp; mutex_lock(&hwsim_phys_lock); list_for_each_entry_safe(phy, tmp, &hwsim_phys, list) hwsim_del(phy); mutex_unlock(&hwsim_phys_lock); } static struct platform_driver mac802154hwsim_driver = { .probe = hwsim_probe, .remove = hwsim_remove, .driver = { .name = "mac802154_hwsim", }, }; static __init int hwsim_init_module(void) { int rc; rc = genl_register_family(&hwsim_genl_family); if (rc) return rc; mac802154hwsim_dev = platform_device_register_simple("mac802154_hwsim", -1, NULL, 0); if (IS_ERR(mac802154hwsim_dev)) { rc = PTR_ERR(mac802154hwsim_dev); goto platform_dev; } rc = platform_driver_register(&mac802154hwsim_driver); if (rc < 0) goto platform_drv; return 0; platform_drv: platform_device_unregister(mac802154hwsim_dev); platform_dev: genl_unregister_family(&hwsim_genl_family); return rc; } static __exit void hwsim_remove_module(void) { genl_unregister_family(&hwsim_genl_family); platform_driver_unregister(&mac802154hwsim_driver); platform_device_unregister(mac802154hwsim_dev); } module_init(hwsim_init_module); module_exit(hwsim_remove_module); |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 6 1 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 | /* * Cryptographic API. * * Anubis Algorithm * * The Anubis algorithm was developed by Paulo S. L. M. Barreto and * Vincent Rijmen. * * See * * P.S.L.M. Barreto, V. Rijmen, * ``The Anubis block cipher,'' * NESSIE submission, 2000. * * This software implements the "tweaked" version of Anubis. * Only the S-box and (consequently) the rounds constants have been * changed. * * The original authors have disclaimed all copyright interest in this * code and thus put it in the public domain. The subsequent authors * have put this under the GNU General Public License. * * By Aaron Grothe ajgrothe@yahoo.com, October 28, 2004 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * */ #include <crypto/algapi.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <asm/byteorder.h> #include <linux/types.h> #define ANUBIS_MIN_KEY_SIZE 16 #define ANUBIS_MAX_KEY_SIZE 40 #define ANUBIS_BLOCK_SIZE 16 #define ANUBIS_MAX_N 10 #define ANUBIS_MAX_ROUNDS (8 + ANUBIS_MAX_N) struct anubis_ctx { int key_len; // in bits int R; u32 E[ANUBIS_MAX_ROUNDS + 1][4]; u32 D[ANUBIS_MAX_ROUNDS + 1][4]; }; static const u32 T0[256] = { 0xba69d2bbU, 0x54a84de5U, 0x2f5ebce2U, 0x74e8cd25U, 0x53a651f7U, 0xd3bb6bd0U, 0xd2b96fd6U, 0x4d9a29b3U, 0x50a05dfdU, 0xac458acfU, 0x8d070e09U, 0xbf63c6a5U, 0x70e0dd3dU, 0x52a455f1U, 0x9a29527bU, 0x4c982db5U, 0xeac98f46U, 0xd5b773c4U, 0x97336655U, 0xd1bf63dcU, 0x3366ccaaU, 0x51a259fbU, 0x5bb671c7U, 0xa651a2f3U, 0xdea15ffeU, 0x48903dadU, 0xa84d9ad7U, 0x992f5e71U, 0xdbab4be0U, 0x3264c8acU, 0xb773e695U, 0xfce5d732U, 0xe3dbab70U, 0x9e214263U, 0x913f7e41U, 0x9b2b567dU, 0xe2d9af76U, 0xbb6bd6bdU, 0x4182199bU, 0x6edca579U, 0xa557aef9U, 0xcb8b0b80U, 0x6bd6b167U, 0x95376e59U, 0xa15fbee1U, 0xf3fbeb10U, 0xb17ffe81U, 0x0204080cU, 0xcc851792U, 0xc49537a2U, 0x1d3a744eU, 0x14285078U, 0xc39b2bb0U, 0x63c69157U, 0xdaa94fe6U, 0x5dba69d3U, 0x5fbe61dfU, 0xdca557f2U, 0x7dfae913U, 0xcd871394U, 0x7ffee11fU, 0x5ab475c1U, 0x6cd8ad75U, 0x5cb86dd5U, 0xf7f3fb08U, 0x264c98d4U, 0xffe3db38U, 0xedc79354U, 0xe8cd874aU, 0x9d274e69U, 0x6fdea17fU, 0x8e010203U, 0x19326456U, 0xa05dbae7U, 0xf0fde71aU, 0x890f1e11U, 0x0f1e3c22U, 0x070e1c12U, 0xaf4386c5U, 0xfbebcb20U, 0x08102030U, 0x152a547eU, 0x0d1a342eU, 0x04081018U, 0x01020406U, 0x64c88d45U, 0xdfa35bf8U, 0x76ecc529U, 0x79f2f90bU, 0xdda753f4U, 0x3d7af48eU, 0x162c5874U, 0x3f7efc82U, 0x376edcb2U, 0x6ddaa973U, 0x3870e090U, 0xb96fdeb1U, 0x73e6d137U, 0xe9cf834cU, 0x356ad4beU, 0x55aa49e3U, 0x71e2d93bU, 0x7bf6f107U, 0x8c050a0fU, 0x72e4d531U, 0x880d1a17U, 0xf6f1ff0eU, 0x2a54a8fcU, 0x3e7cf884U, 0x5ebc65d9U, 0x274e9cd2U, 0x468c0589U, 0x0c183028U, 0x65ca8943U, 0x68d0bd6dU, 0x61c2995bU, 0x03060c0aU, 0xc19f23bcU, 0x57ae41efU, 0xd6b17fceU, 0xd9af43ecU, 0x58b07dcdU, 0xd8ad47eaU, 0x66cc8549U, 0xd7b37bc8U, 0x3a74e89cU, 0xc88d078aU, 0x3c78f088U, 0xfae9cf26U, 0x96316253U, 0xa753a6f5U, 0x982d5a77U, 0xecc59752U, 0xb86ddab7U, 0xc7933ba8U, 0xae4182c3U, 0x69d2b96bU, 0x4b9631a7U, 0xab4b96ddU, 0xa94f9ed1U, 0x67ce814fU, 0x0a14283cU, 0x478e018fU, 0xf2f9ef16U, 0xb577ee99U, 0x224488ccU, 0xe5d7b364U, 0xeec19f5eU, 0xbe61c2a3U, 0x2b56acfaU, 0x811f3e21U, 0x1224486cU, 0x831b362dU, 0x1b366c5aU, 0x0e1c3824U, 0x23468ccaU, 0xf5f7f304U, 0x458a0983U, 0x214284c6U, 0xce811f9eU, 0x499239abU, 0x2c58b0e8U, 0xf9efc32cU, 0xe6d1bf6eU, 0xb671e293U, 0x2850a0f0U, 0x172e5c72U, 0x8219322bU, 0x1a34685cU, 0x8b0b161dU, 0xfee1df3eU, 0x8a09121bU, 0x09122436U, 0xc98f038cU, 0x87132635U, 0x4e9c25b9U, 0xe1dfa37cU, 0x2e5cb8e4U, 0xe4d5b762U, 0xe0dda77aU, 0xebcb8b40U, 0x903d7a47U, 0xa455aaffU, 0x1e3c7844U, 0x85172e39U, 0x60c09d5dU, 0x00000000U, 0x254a94deU, 0xf4f5f702U, 0xf1ffe31cU, 0x94356a5fU, 0x0b162c3aU, 0xe7d3bb68U, 0x75eac923U, 0xefc39b58U, 0x3468d0b8U, 0x3162c4a6U, 0xd4b577c2U, 0xd0bd67daU, 0x86112233U, 0x7efce519U, 0xad478ec9U, 0xfde7d334U, 0x2952a4f6U, 0x3060c0a0U, 0x3b76ec9aU, 0x9f234665U, 0xf8edc72aU, 0xc6913faeU, 0x13264c6aU, 0x060c1814U, 0x050a141eU, 0xc59733a4U, 0x11224466U, 0x77eec12fU, 0x7cf8ed15U, 0x7af4f501U, 0x78f0fd0dU, 0x366cd8b4U, 0x1c387048U, 0x3972e496U, 0x59b279cbU, 0x18306050U, 0x56ac45e9U, 0xb37bf68dU, 0xb07dfa87U, 0x244890d8U, 0x204080c0U, 0xb279f28bU, 0x9239724bU, 0xa35bb6edU, 0xc09d27baU, 0x44880d85U, 0x62c49551U, 0x10204060U, 0xb475ea9fU, 0x84152a3fU, 0x43861197U, 0x933b764dU, 0xc2992fb6U, 0x4a9435a1U, 0xbd67cea9U, 0x8f030605U, 0x2d5ab4eeU, 0xbc65caafU, 0x9c254a6fU, 0x6ad4b561U, 0x40801d9dU, 0xcf831b98U, 0xa259b2ebU, 0x801d3a27U, 0x4f9e21bfU, 0x1f3e7c42U, 0xca890f86U, 0xaa4992dbU, 0x42841591U, }; static const u32 T1[256] = { 0x69babbd2U, 0xa854e54dU, 0x5e2fe2bcU, 0xe87425cdU, 0xa653f751U, 0xbbd3d06bU, 0xb9d2d66fU, 0x9a4db329U, 0xa050fd5dU, 0x45accf8aU, 0x078d090eU, 0x63bfa5c6U, 0xe0703dddU, 0xa452f155U, 0x299a7b52U, 0x984cb52dU, 0xc9ea468fU, 0xb7d5c473U, 0x33975566U, 0xbfd1dc63U, 0x6633aaccU, 0xa251fb59U, 0xb65bc771U, 0x51a6f3a2U, 0xa1defe5fU, 0x9048ad3dU, 0x4da8d79aU, 0x2f99715eU, 0xabdbe04bU, 0x6432acc8U, 0x73b795e6U, 0xe5fc32d7U, 0xdbe370abU, 0x219e6342U, 0x3f91417eU, 0x2b9b7d56U, 0xd9e276afU, 0x6bbbbdd6U, 0x82419b19U, 0xdc6e79a5U, 0x57a5f9aeU, 0x8bcb800bU, 0xd66b67b1U, 0x3795596eU, 0x5fa1e1beU, 0xfbf310ebU, 0x7fb181feU, 0x04020c08U, 0x85cc9217U, 0x95c4a237U, 0x3a1d4e74U, 0x28147850U, 0x9bc3b02bU, 0xc6635791U, 0xa9dae64fU, 0xba5dd369U, 0xbe5fdf61U, 0xa5dcf257U, 0xfa7d13e9U, 0x87cd9413U, 0xfe7f1fe1U, 0xb45ac175U, 0xd86c75adU, 0xb85cd56dU, 0xf3f708fbU, 0x4c26d498U, 0xe3ff38dbU, 0xc7ed5493U, 0xcde84a87U, 0x279d694eU, 0xde6f7fa1U, 0x018e0302U, 0x32195664U, 0x5da0e7baU, 0xfdf01ae7U, 0x0f89111eU, 0x1e0f223cU, 0x0e07121cU, 0x43afc586U, 0xebfb20cbU, 0x10083020U, 0x2a157e54U, 0x1a0d2e34U, 0x08041810U, 0x02010604U, 0xc864458dU, 0xa3dff85bU, 0xec7629c5U, 0xf2790bf9U, 0xa7ddf453U, 0x7a3d8ef4U, 0x2c167458U, 0x7e3f82fcU, 0x6e37b2dcU, 0xda6d73a9U, 0x703890e0U, 0x6fb9b1deU, 0xe67337d1U, 0xcfe94c83U, 0x6a35bed4U, 0xaa55e349U, 0xe2713bd9U, 0xf67b07f1U, 0x058c0f0aU, 0xe47231d5U, 0x0d88171aU, 0xf1f60effU, 0x542afca8U, 0x7c3e84f8U, 0xbc5ed965U, 0x4e27d29cU, 0x8c468905U, 0x180c2830U, 0xca654389U, 0xd0686dbdU, 0xc2615b99U, 0x06030a0cU, 0x9fc1bc23U, 0xae57ef41U, 0xb1d6ce7fU, 0xafd9ec43U, 0xb058cd7dU, 0xadd8ea47U, 0xcc664985U, 0xb3d7c87bU, 0x743a9ce8U, 0x8dc88a07U, 0x783c88f0U, 0xe9fa26cfU, 0x31965362U, 0x53a7f5a6U, 0x2d98775aU, 0xc5ec5297U, 0x6db8b7daU, 0x93c7a83bU, 0x41aec382U, 0xd2696bb9U, 0x964ba731U, 0x4babdd96U, 0x4fa9d19eU, 0xce674f81U, 0x140a3c28U, 0x8e478f01U, 0xf9f216efU, 0x77b599eeU, 0x4422cc88U, 0xd7e564b3U, 0xc1ee5e9fU, 0x61bea3c2U, 0x562bfaacU, 0x1f81213eU, 0x24126c48U, 0x1b832d36U, 0x361b5a6cU, 0x1c0e2438U, 0x4623ca8cU, 0xf7f504f3U, 0x8a458309U, 0x4221c684U, 0x81ce9e1fU, 0x9249ab39U, 0x582ce8b0U, 0xeff92cc3U, 0xd1e66ebfU, 0x71b693e2U, 0x5028f0a0U, 0x2e17725cU, 0x19822b32U, 0x341a5c68U, 0x0b8b1d16U, 0xe1fe3edfU, 0x098a1b12U, 0x12093624U, 0x8fc98c03U, 0x13873526U, 0x9c4eb925U, 0xdfe17ca3U, 0x5c2ee4b8U, 0xd5e462b7U, 0xdde07aa7U, 0xcbeb408bU, 0x3d90477aU, 0x55a4ffaaU, 0x3c1e4478U, 0x1785392eU, 0xc0605d9dU, 0x00000000U, 0x4a25de94U, 0xf5f402f7U, 0xfff11ce3U, 0x35945f6aU, 0x160b3a2cU, 0xd3e768bbU, 0xea7523c9U, 0xc3ef589bU, 0x6834b8d0U, 0x6231a6c4U, 0xb5d4c277U, 0xbdd0da67U, 0x11863322U, 0xfc7e19e5U, 0x47adc98eU, 0xe7fd34d3U, 0x5229f6a4U, 0x6030a0c0U, 0x763b9aecU, 0x239f6546U, 0xedf82ac7U, 0x91c6ae3fU, 0x26136a4cU, 0x0c061418U, 0x0a051e14U, 0x97c5a433U, 0x22116644U, 0xee772fc1U, 0xf87c15edU, 0xf47a01f5U, 0xf0780dfdU, 0x6c36b4d8U, 0x381c4870U, 0x723996e4U, 0xb259cb79U, 0x30185060U, 0xac56e945U, 0x7bb38df6U, 0x7db087faU, 0x4824d890U, 0x4020c080U, 0x79b28bf2U, 0x39924b72U, 0x5ba3edb6U, 0x9dc0ba27U, 0x8844850dU, 0xc4625195U, 0x20106040U, 0x75b49feaU, 0x15843f2aU, 0x86439711U, 0x3b934d76U, 0x99c2b62fU, 0x944aa135U, 0x67bda9ceU, 0x038f0506U, 0x5a2deeb4U, 0x65bcafcaU, 0x259c6f4aU, 0xd46a61b5U, 0x80409d1dU, 0x83cf981bU, 0x59a2ebb2U, 0x1d80273aU, 0x9e4fbf21U, 0x3e1f427cU, 0x89ca860fU, 0x49aadb92U, 0x84429115U, }; static const u32 T2[256] = { 0xd2bbba69U, 0x4de554a8U, 0xbce22f5eU, 0xcd2574e8U, 0x51f753a6U, 0x6bd0d3bbU, 0x6fd6d2b9U, 0x29b34d9aU, 0x5dfd50a0U, 0x8acfac45U, 0x0e098d07U, 0xc6a5bf63U, 0xdd3d70e0U, 0x55f152a4U, 0x527b9a29U, 0x2db54c98U, 0x8f46eac9U, 0x73c4d5b7U, 0x66559733U, 0x63dcd1bfU, 0xccaa3366U, 0x59fb51a2U, 0x71c75bb6U, 0xa2f3a651U, 0x5ffedea1U, 0x3dad4890U, 0x9ad7a84dU, 0x5e71992fU, 0x4be0dbabU, 0xc8ac3264U, 0xe695b773U, 0xd732fce5U, 0xab70e3dbU, 0x42639e21U, 0x7e41913fU, 0x567d9b2bU, 0xaf76e2d9U, 0xd6bdbb6bU, 0x199b4182U, 0xa5796edcU, 0xaef9a557U, 0x0b80cb8bU, 0xb1676bd6U, 0x6e599537U, 0xbee1a15fU, 0xeb10f3fbU, 0xfe81b17fU, 0x080c0204U, 0x1792cc85U, 0x37a2c495U, 0x744e1d3aU, 0x50781428U, 0x2bb0c39bU, 0x915763c6U, 0x4fe6daa9U, 0x69d35dbaU, 0x61df5fbeU, 0x57f2dca5U, 0xe9137dfaU, 0x1394cd87U, 0xe11f7ffeU, 0x75c15ab4U, 0xad756cd8U, 0x6dd55cb8U, 0xfb08f7f3U, 0x98d4264cU, 0xdb38ffe3U, 0x9354edc7U, 0x874ae8cdU, 0x4e699d27U, 0xa17f6fdeU, 0x02038e01U, 0x64561932U, 0xbae7a05dU, 0xe71af0fdU, 0x1e11890fU, 0x3c220f1eU, 0x1c12070eU, 0x86c5af43U, 0xcb20fbebU, 0x20300810U, 0x547e152aU, 0x342e0d1aU, 0x10180408U, 0x04060102U, 0x8d4564c8U, 0x5bf8dfa3U, 0xc52976ecU, 0xf90b79f2U, 0x53f4dda7U, 0xf48e3d7aU, 0x5874162cU, 0xfc823f7eU, 0xdcb2376eU, 0xa9736ddaU, 0xe0903870U, 0xdeb1b96fU, 0xd13773e6U, 0x834ce9cfU, 0xd4be356aU, 0x49e355aaU, 0xd93b71e2U, 0xf1077bf6U, 0x0a0f8c05U, 0xd53172e4U, 0x1a17880dU, 0xff0ef6f1U, 0xa8fc2a54U, 0xf8843e7cU, 0x65d95ebcU, 0x9cd2274eU, 0x0589468cU, 0x30280c18U, 0x894365caU, 0xbd6d68d0U, 0x995b61c2U, 0x0c0a0306U, 0x23bcc19fU, 0x41ef57aeU, 0x7fced6b1U, 0x43ecd9afU, 0x7dcd58b0U, 0x47ead8adU, 0x854966ccU, 0x7bc8d7b3U, 0xe89c3a74U, 0x078ac88dU, 0xf0883c78U, 0xcf26fae9U, 0x62539631U, 0xa6f5a753U, 0x5a77982dU, 0x9752ecc5U, 0xdab7b86dU, 0x3ba8c793U, 0x82c3ae41U, 0xb96b69d2U, 0x31a74b96U, 0x96ddab4bU, 0x9ed1a94fU, 0x814f67ceU, 0x283c0a14U, 0x018f478eU, 0xef16f2f9U, 0xee99b577U, 0x88cc2244U, 0xb364e5d7U, 0x9f5eeec1U, 0xc2a3be61U, 0xacfa2b56U, 0x3e21811fU, 0x486c1224U, 0x362d831bU, 0x6c5a1b36U, 0x38240e1cU, 0x8cca2346U, 0xf304f5f7U, 0x0983458aU, 0x84c62142U, 0x1f9ece81U, 0x39ab4992U, 0xb0e82c58U, 0xc32cf9efU, 0xbf6ee6d1U, 0xe293b671U, 0xa0f02850U, 0x5c72172eU, 0x322b8219U, 0x685c1a34U, 0x161d8b0bU, 0xdf3efee1U, 0x121b8a09U, 0x24360912U, 0x038cc98fU, 0x26358713U, 0x25b94e9cU, 0xa37ce1dfU, 0xb8e42e5cU, 0xb762e4d5U, 0xa77ae0ddU, 0x8b40ebcbU, 0x7a47903dU, 0xaaffa455U, 0x78441e3cU, 0x2e398517U, 0x9d5d60c0U, 0x00000000U, 0x94de254aU, 0xf702f4f5U, 0xe31cf1ffU, 0x6a5f9435U, 0x2c3a0b16U, 0xbb68e7d3U, 0xc92375eaU, 0x9b58efc3U, 0xd0b83468U, 0xc4a63162U, 0x77c2d4b5U, 0x67dad0bdU, 0x22338611U, 0xe5197efcU, 0x8ec9ad47U, 0xd334fde7U, 0xa4f62952U, 0xc0a03060U, 0xec9a3b76U, 0x46659f23U, 0xc72af8edU, 0x3faec691U, 0x4c6a1326U, 0x1814060cU, 0x141e050aU, 0x33a4c597U, 0x44661122U, 0xc12f77eeU, 0xed157cf8U, 0xf5017af4U, 0xfd0d78f0U, 0xd8b4366cU, 0x70481c38U, 0xe4963972U, 0x79cb59b2U, 0x60501830U, 0x45e956acU, 0xf68db37bU, 0xfa87b07dU, 0x90d82448U, 0x80c02040U, 0xf28bb279U, 0x724b9239U, 0xb6eda35bU, 0x27bac09dU, 0x0d854488U, 0x955162c4U, 0x40601020U, 0xea9fb475U, 0x2a3f8415U, 0x11974386U, 0x764d933bU, 0x2fb6c299U, 0x35a14a94U, 0xcea9bd67U, 0x06058f03U, 0xb4ee2d5aU, 0xcaafbc65U, 0x4a6f9c25U, 0xb5616ad4U, 0x1d9d4080U, 0x1b98cf83U, 0xb2eba259U, 0x3a27801dU, 0x21bf4f9eU, 0x7c421f3eU, 0x0f86ca89U, 0x92dbaa49U, 0x15914284U, }; static const u32 T3[256] = { 0xbbd269baU, 0xe54da854U, 0xe2bc5e2fU, 0x25cde874U, 0xf751a653U, 0xd06bbbd3U, 0xd66fb9d2U, 0xb3299a4dU, 0xfd5da050U, 0xcf8a45acU, 0x090e078dU, 0xa5c663bfU, 0x3ddde070U, 0xf155a452U, 0x7b52299aU, 0xb52d984cU, 0x468fc9eaU, 0xc473b7d5U, 0x55663397U, 0xdc63bfd1U, 0xaacc6633U, 0xfb59a251U, 0xc771b65bU, 0xf3a251a6U, 0xfe5fa1deU, 0xad3d9048U, 0xd79a4da8U, 0x715e2f99U, 0xe04babdbU, 0xacc86432U, 0x95e673b7U, 0x32d7e5fcU, 0x70abdbe3U, 0x6342219eU, 0x417e3f91U, 0x7d562b9bU, 0x76afd9e2U, 0xbdd66bbbU, 0x9b198241U, 0x79a5dc6eU, 0xf9ae57a5U, 0x800b8bcbU, 0x67b1d66bU, 0x596e3795U, 0xe1be5fa1U, 0x10ebfbf3U, 0x81fe7fb1U, 0x0c080402U, 0x921785ccU, 0xa23795c4U, 0x4e743a1dU, 0x78502814U, 0xb02b9bc3U, 0x5791c663U, 0xe64fa9daU, 0xd369ba5dU, 0xdf61be5fU, 0xf257a5dcU, 0x13e9fa7dU, 0x941387cdU, 0x1fe1fe7fU, 0xc175b45aU, 0x75add86cU, 0xd56db85cU, 0x08fbf3f7U, 0xd4984c26U, 0x38dbe3ffU, 0x5493c7edU, 0x4a87cde8U, 0x694e279dU, 0x7fa1de6fU, 0x0302018eU, 0x56643219U, 0xe7ba5da0U, 0x1ae7fdf0U, 0x111e0f89U, 0x223c1e0fU, 0x121c0e07U, 0xc58643afU, 0x20cbebfbU, 0x30201008U, 0x7e542a15U, 0x2e341a0dU, 0x18100804U, 0x06040201U, 0x458dc864U, 0xf85ba3dfU, 0x29c5ec76U, 0x0bf9f279U, 0xf453a7ddU, 0x8ef47a3dU, 0x74582c16U, 0x82fc7e3fU, 0xb2dc6e37U, 0x73a9da6dU, 0x90e07038U, 0xb1de6fb9U, 0x37d1e673U, 0x4c83cfe9U, 0xbed46a35U, 0xe349aa55U, 0x3bd9e271U, 0x07f1f67bU, 0x0f0a058cU, 0x31d5e472U, 0x171a0d88U, 0x0efff1f6U, 0xfca8542aU, 0x84f87c3eU, 0xd965bc5eU, 0xd29c4e27U, 0x89058c46U, 0x2830180cU, 0x4389ca65U, 0x6dbdd068U, 0x5b99c261U, 0x0a0c0603U, 0xbc239fc1U, 0xef41ae57U, 0xce7fb1d6U, 0xec43afd9U, 0xcd7db058U, 0xea47add8U, 0x4985cc66U, 0xc87bb3d7U, 0x9ce8743aU, 0x8a078dc8U, 0x88f0783cU, 0x26cfe9faU, 0x53623196U, 0xf5a653a7U, 0x775a2d98U, 0x5297c5ecU, 0xb7da6db8U, 0xa83b93c7U, 0xc38241aeU, 0x6bb9d269U, 0xa731964bU, 0xdd964babU, 0xd19e4fa9U, 0x4f81ce67U, 0x3c28140aU, 0x8f018e47U, 0x16eff9f2U, 0x99ee77b5U, 0xcc884422U, 0x64b3d7e5U, 0x5e9fc1eeU, 0xa3c261beU, 0xfaac562bU, 0x213e1f81U, 0x6c482412U, 0x2d361b83U, 0x5a6c361bU, 0x24381c0eU, 0xca8c4623U, 0x04f3f7f5U, 0x83098a45U, 0xc6844221U, 0x9e1f81ceU, 0xab399249U, 0xe8b0582cU, 0x2cc3eff9U, 0x6ebfd1e6U, 0x93e271b6U, 0xf0a05028U, 0x725c2e17U, 0x2b321982U, 0x5c68341aU, 0x1d160b8bU, 0x3edfe1feU, 0x1b12098aU, 0x36241209U, 0x8c038fc9U, 0x35261387U, 0xb9259c4eU, 0x7ca3dfe1U, 0xe4b85c2eU, 0x62b7d5e4U, 0x7aa7dde0U, 0x408bcbebU, 0x477a3d90U, 0xffaa55a4U, 0x44783c1eU, 0x392e1785U, 0x5d9dc060U, 0x00000000U, 0xde944a25U, 0x02f7f5f4U, 0x1ce3fff1U, 0x5f6a3594U, 0x3a2c160bU, 0x68bbd3e7U, 0x23c9ea75U, 0x589bc3efU, 0xb8d06834U, 0xa6c46231U, 0xc277b5d4U, 0xda67bdd0U, 0x33221186U, 0x19e5fc7eU, 0xc98e47adU, 0x34d3e7fdU, 0xf6a45229U, 0xa0c06030U, 0x9aec763bU, 0x6546239fU, 0x2ac7edf8U, 0xae3f91c6U, 0x6a4c2613U, 0x14180c06U, 0x1e140a05U, 0xa43397c5U, 0x66442211U, 0x2fc1ee77U, 0x15edf87cU, 0x01f5f47aU, 0x0dfdf078U, 0xb4d86c36U, 0x4870381cU, 0x96e47239U, 0xcb79b259U, 0x50603018U, 0xe945ac56U, 0x8df67bb3U, 0x87fa7db0U, 0xd8904824U, 0xc0804020U, 0x8bf279b2U, 0x4b723992U, 0xedb65ba3U, 0xba279dc0U, 0x850d8844U, 0x5195c462U, 0x60402010U, 0x9fea75b4U, 0x3f2a1584U, 0x97118643U, 0x4d763b93U, 0xb62f99c2U, 0xa135944aU, 0xa9ce67bdU, 0x0506038fU, 0xeeb45a2dU, 0xafca65bcU, 0x6f4a259cU, 0x61b5d46aU, 0x9d1d8040U, 0x981b83cfU, 0xebb259a2U, 0x273a1d80U, 0xbf219e4fU, 0x427c3e1fU, 0x860f89caU, 0xdb9249aaU, 0x91158442U, }; static const u32 T4[256] = { 0xbabababaU, 0x54545454U, 0x2f2f2f2fU, 0x74747474U, 0x53535353U, 0xd3d3d3d3U, 0xd2d2d2d2U, 0x4d4d4d4dU, 0x50505050U, 0xacacacacU, 0x8d8d8d8dU, 0xbfbfbfbfU, 0x70707070U, 0x52525252U, 0x9a9a9a9aU, 0x4c4c4c4cU, 0xeaeaeaeaU, 0xd5d5d5d5U, 0x97979797U, 0xd1d1d1d1U, 0x33333333U, 0x51515151U, 0x5b5b5b5bU, 0xa6a6a6a6U, 0xdedededeU, 0x48484848U, 0xa8a8a8a8U, 0x99999999U, 0xdbdbdbdbU, 0x32323232U, 0xb7b7b7b7U, 0xfcfcfcfcU, 0xe3e3e3e3U, 0x9e9e9e9eU, 0x91919191U, 0x9b9b9b9bU, 0xe2e2e2e2U, 0xbbbbbbbbU, 0x41414141U, 0x6e6e6e6eU, 0xa5a5a5a5U, 0xcbcbcbcbU, 0x6b6b6b6bU, 0x95959595U, 0xa1a1a1a1U, 0xf3f3f3f3U, 0xb1b1b1b1U, 0x02020202U, 0xccccccccU, 0xc4c4c4c4U, 0x1d1d1d1dU, 0x14141414U, 0xc3c3c3c3U, 0x63636363U, 0xdadadadaU, 0x5d5d5d5dU, 0x5f5f5f5fU, 0xdcdcdcdcU, 0x7d7d7d7dU, 0xcdcdcdcdU, 0x7f7f7f7fU, 0x5a5a5a5aU, 0x6c6c6c6cU, 0x5c5c5c5cU, 0xf7f7f7f7U, 0x26262626U, 0xffffffffU, 0xededededU, 0xe8e8e8e8U, 0x9d9d9d9dU, 0x6f6f6f6fU, 0x8e8e8e8eU, 0x19191919U, 0xa0a0a0a0U, 0xf0f0f0f0U, 0x89898989U, 0x0f0f0f0fU, 0x07070707U, 0xafafafafU, 0xfbfbfbfbU, 0x08080808U, 0x15151515U, 0x0d0d0d0dU, 0x04040404U, 0x01010101U, 0x64646464U, 0xdfdfdfdfU, 0x76767676U, 0x79797979U, 0xddddddddU, 0x3d3d3d3dU, 0x16161616U, 0x3f3f3f3fU, 0x37373737U, 0x6d6d6d6dU, 0x38383838U, 0xb9b9b9b9U, 0x73737373U, 0xe9e9e9e9U, 0x35353535U, 0x55555555U, 0x71717171U, 0x7b7b7b7bU, 0x8c8c8c8cU, 0x72727272U, 0x88888888U, 0xf6f6f6f6U, 0x2a2a2a2aU, 0x3e3e3e3eU, 0x5e5e5e5eU, 0x27272727U, 0x46464646U, 0x0c0c0c0cU, 0x65656565U, 0x68686868U, 0x61616161U, 0x03030303U, 0xc1c1c1c1U, 0x57575757U, 0xd6d6d6d6U, 0xd9d9d9d9U, 0x58585858U, 0xd8d8d8d8U, 0x66666666U, 0xd7d7d7d7U, 0x3a3a3a3aU, 0xc8c8c8c8U, 0x3c3c3c3cU, 0xfafafafaU, 0x96969696U, 0xa7a7a7a7U, 0x98989898U, 0xececececU, 0xb8b8b8b8U, 0xc7c7c7c7U, 0xaeaeaeaeU, 0x69696969U, 0x4b4b4b4bU, 0xababababU, 0xa9a9a9a9U, 0x67676767U, 0x0a0a0a0aU, 0x47474747U, 0xf2f2f2f2U, 0xb5b5b5b5U, 0x22222222U, 0xe5e5e5e5U, 0xeeeeeeeeU, 0xbebebebeU, 0x2b2b2b2bU, 0x81818181U, 0x12121212U, 0x83838383U, 0x1b1b1b1bU, 0x0e0e0e0eU, 0x23232323U, 0xf5f5f5f5U, 0x45454545U, 0x21212121U, 0xcecececeU, 0x49494949U, 0x2c2c2c2cU, 0xf9f9f9f9U, 0xe6e6e6e6U, 0xb6b6b6b6U, 0x28282828U, 0x17171717U, 0x82828282U, 0x1a1a1a1aU, 0x8b8b8b8bU, 0xfefefefeU, 0x8a8a8a8aU, 0x09090909U, 0xc9c9c9c9U, 0x87878787U, 0x4e4e4e4eU, 0xe1e1e1e1U, 0x2e2e2e2eU, 0xe4e4e4e4U, 0xe0e0e0e0U, 0xebebebebU, 0x90909090U, 0xa4a4a4a4U, 0x1e1e1e1eU, 0x85858585U, 0x60606060U, 0x00000000U, 0x25252525U, 0xf4f4f4f4U, 0xf1f1f1f1U, 0x94949494U, 0x0b0b0b0bU, 0xe7e7e7e7U, 0x75757575U, 0xefefefefU, 0x34343434U, 0x31313131U, 0xd4d4d4d4U, 0xd0d0d0d0U, 0x86868686U, 0x7e7e7e7eU, 0xadadadadU, 0xfdfdfdfdU, 0x29292929U, 0x30303030U, 0x3b3b3b3bU, 0x9f9f9f9fU, 0xf8f8f8f8U, 0xc6c6c6c6U, 0x13131313U, 0x06060606U, 0x05050505U, 0xc5c5c5c5U, 0x11111111U, 0x77777777U, 0x7c7c7c7cU, 0x7a7a7a7aU, 0x78787878U, 0x36363636U, 0x1c1c1c1cU, 0x39393939U, 0x59595959U, 0x18181818U, 0x56565656U, 0xb3b3b3b3U, 0xb0b0b0b0U, 0x24242424U, 0x20202020U, 0xb2b2b2b2U, 0x92929292U, 0xa3a3a3a3U, 0xc0c0c0c0U, 0x44444444U, 0x62626262U, 0x10101010U, 0xb4b4b4b4U, 0x84848484U, 0x43434343U, 0x93939393U, 0xc2c2c2c2U, 0x4a4a4a4aU, 0xbdbdbdbdU, 0x8f8f8f8fU, 0x2d2d2d2dU, 0xbcbcbcbcU, 0x9c9c9c9cU, 0x6a6a6a6aU, 0x40404040U, 0xcfcfcfcfU, 0xa2a2a2a2U, 0x80808080U, 0x4f4f4f4fU, 0x1f1f1f1fU, 0xcacacacaU, 0xaaaaaaaaU, 0x42424242U, }; static const u32 T5[256] = { 0x00000000U, 0x01020608U, 0x02040c10U, 0x03060a18U, 0x04081820U, 0x050a1e28U, 0x060c1430U, 0x070e1238U, 0x08103040U, 0x09123648U, 0x0a143c50U, 0x0b163a58U, 0x0c182860U, 0x0d1a2e68U, 0x0e1c2470U, 0x0f1e2278U, 0x10206080U, 0x11226688U, 0x12246c90U, 0x13266a98U, 0x142878a0U, 0x152a7ea8U, 0x162c74b0U, 0x172e72b8U, 0x183050c0U, 0x193256c8U, 0x1a345cd0U, 0x1b365ad8U, 0x1c3848e0U, 0x1d3a4ee8U, 0x1e3c44f0U, 0x1f3e42f8U, 0x2040c01dU, 0x2142c615U, 0x2244cc0dU, 0x2346ca05U, 0x2448d83dU, 0x254ade35U, 0x264cd42dU, 0x274ed225U, 0x2850f05dU, 0x2952f655U, 0x2a54fc4dU, 0x2b56fa45U, 0x2c58e87dU, 0x2d5aee75U, 0x2e5ce46dU, 0x2f5ee265U, 0x3060a09dU, 0x3162a695U, 0x3264ac8dU, 0x3366aa85U, 0x3468b8bdU, 0x356abeb5U, 0x366cb4adU, 0x376eb2a5U, 0x387090ddU, 0x397296d5U, 0x3a749ccdU, 0x3b769ac5U, 0x3c7888fdU, 0x3d7a8ef5U, 0x3e7c84edU, 0x3f7e82e5U, 0x40809d3aU, 0x41829b32U, 0x4284912aU, 0x43869722U, 0x4488851aU, 0x458a8312U, 0x468c890aU, 0x478e8f02U, 0x4890ad7aU, 0x4992ab72U, 0x4a94a16aU, 0x4b96a762U, 0x4c98b55aU, 0x4d9ab352U, 0x4e9cb94aU, 0x4f9ebf42U, 0x50a0fdbaU, 0x51a2fbb2U, 0x52a4f1aaU, 0x53a6f7a2U, 0x54a8e59aU, 0x55aae392U, 0x56ace98aU, 0x57aeef82U, 0x58b0cdfaU, 0x59b2cbf2U, 0x5ab4c1eaU, 0x5bb6c7e2U, 0x5cb8d5daU, 0x5dbad3d2U, 0x5ebcd9caU, 0x5fbedfc2U, 0x60c05d27U, 0x61c25b2fU, 0x62c45137U, 0x63c6573fU, 0x64c84507U, 0x65ca430fU, 0x66cc4917U, 0x67ce4f1fU, 0x68d06d67U, 0x69d26b6fU, 0x6ad46177U, 0x6bd6677fU, 0x6cd87547U, 0x6dda734fU, 0x6edc7957U, 0x6fde7f5fU, 0x70e03da7U, 0x71e23bafU, 0x72e431b7U, 0x73e637bfU, 0x74e82587U, 0x75ea238fU, 0x76ec2997U, 0x77ee2f9fU, 0x78f00de7U, 0x79f20befU, 0x7af401f7U, 0x7bf607ffU, 0x7cf815c7U, 0x7dfa13cfU, 0x7efc19d7U, 0x7ffe1fdfU, 0x801d2774U, 0x811f217cU, 0x82192b64U, 0x831b2d6cU, 0x84153f54U, 0x8517395cU, 0x86113344U, 0x8713354cU, 0x880d1734U, 0x890f113cU, 0x8a091b24U, 0x8b0b1d2cU, 0x8c050f14U, 0x8d07091cU, 0x8e010304U, 0x8f03050cU, 0x903d47f4U, 0x913f41fcU, 0x92394be4U, 0x933b4decU, 0x94355fd4U, 0x953759dcU, 0x963153c4U, 0x973355ccU, 0x982d77b4U, 0x992f71bcU, 0x9a297ba4U, 0x9b2b7dacU, 0x9c256f94U, 0x9d27699cU, 0x9e216384U, 0x9f23658cU, 0xa05de769U, 0xa15fe161U, 0xa259eb79U, 0xa35bed71U, 0xa455ff49U, 0xa557f941U, 0xa651f359U, 0xa753f551U, 0xa84dd729U, 0xa94fd121U, 0xaa49db39U, 0xab4bdd31U, 0xac45cf09U, 0xad47c901U, 0xae41c319U, 0xaf43c511U, 0xb07d87e9U, 0xb17f81e1U, 0xb2798bf9U, 0xb37b8df1U, 0xb4759fc9U, 0xb57799c1U, 0xb67193d9U, 0xb77395d1U, 0xb86db7a9U, 0xb96fb1a1U, 0xba69bbb9U, 0xbb6bbdb1U, 0xbc65af89U, 0xbd67a981U, 0xbe61a399U, 0xbf63a591U, 0xc09dba4eU, 0xc19fbc46U, 0xc299b65eU, 0xc39bb056U, 0xc495a26eU, 0xc597a466U, 0xc691ae7eU, 0xc793a876U, 0xc88d8a0eU, 0xc98f8c06U, 0xca89861eU, 0xcb8b8016U, 0xcc85922eU, 0xcd879426U, 0xce819e3eU, 0xcf839836U, 0xd0bddaceU, 0xd1bfdcc6U, 0xd2b9d6deU, 0xd3bbd0d6U, 0xd4b5c2eeU, 0xd5b7c4e6U, 0xd6b1cefeU, 0xd7b3c8f6U, 0xd8adea8eU, 0xd9afec86U, 0xdaa9e69eU, 0xdbabe096U, 0xdca5f2aeU, 0xdda7f4a6U, 0xdea1febeU, 0xdfa3f8b6U, 0xe0dd7a53U, 0xe1df7c5bU, 0xe2d97643U, 0xe3db704bU, 0xe4d56273U, 0xe5d7647bU, 0xe6d16e63U, 0xe7d3686bU, 0xe8cd4a13U, 0xe9cf4c1bU, 0xeac94603U, 0xebcb400bU, 0xecc55233U, 0xedc7543bU, 0xeec15e23U, 0xefc3582bU, 0xf0fd1ad3U, 0xf1ff1cdbU, 0xf2f916c3U, 0xf3fb10cbU, 0xf4f502f3U, 0xf5f704fbU, 0xf6f10ee3U, 0xf7f308ebU, 0xf8ed2a93U, 0xf9ef2c9bU, 0xfae92683U, 0xfbeb208bU, 0xfce532b3U, 0xfde734bbU, 0xfee13ea3U, 0xffe338abU, }; static const u32 rc[] = { 0xba542f74U, 0x53d3d24dU, 0x50ac8dbfU, 0x70529a4cU, 0xead597d1U, 0x33515ba6U, 0xde48a899U, 0xdb32b7fcU, 0xe39e919bU, 0xe2bb416eU, 0xa5cb6b95U, 0xa1f3b102U, 0xccc41d14U, 0xc363da5dU, 0x5fdc7dcdU, 0x7f5a6c5cU, 0xf726ffedU, 0xe89d6f8eU, 0x19a0f089U, }; static int anubis_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct anubis_ctx *ctx = crypto_tfm_ctx(tfm); const __be32 *key = (const __be32 *)in_key; int N, R, i, r; u32 kappa[ANUBIS_MAX_N]; u32 inter[ANUBIS_MAX_N]; switch (key_len) { case 16: case 20: case 24: case 28: case 32: case 36: case 40: break; default: return -EINVAL; } ctx->key_len = key_len * 8; N = ctx->key_len >> 5; ctx->R = R = 8 + N; /* * map cipher key to initial key state (mu): */ for (i = 0; i < N; i++) kappa[i] = be32_to_cpu(key[i]); /* * generate R + 1 round keys: */ for (r = 0; r <= R; r++) { u32 K0, K1, K2, K3; /* * generate r-th round key K^r: */ K0 = T4[(kappa[N - 1] >> 24) ]; K1 = T4[(kappa[N - 1] >> 16) & 0xff]; K2 = T4[(kappa[N - 1] >> 8) & 0xff]; K3 = T4[(kappa[N - 1] ) & 0xff]; for (i = N - 2; i >= 0; i--) { K0 = T4[(kappa[i] >> 24) ] ^ (T5[(K0 >> 24) ] & 0xff000000U) ^ (T5[(K0 >> 16) & 0xff] & 0x00ff0000U) ^ (T5[(K0 >> 8) & 0xff] & 0x0000ff00U) ^ (T5[(K0 ) & 0xff] & 0x000000ffU); K1 = T4[(kappa[i] >> 16) & 0xff] ^ (T5[(K1 >> 24) ] & 0xff000000U) ^ (T5[(K1 >> 16) & 0xff] & 0x00ff0000U) ^ (T5[(K1 >> 8) & 0xff] & 0x0000ff00U) ^ (T5[(K1 ) & 0xff] & 0x000000ffU); K2 = T4[(kappa[i] >> 8) & 0xff] ^ (T5[(K2 >> 24) ] & 0xff000000U) ^ (T5[(K2 >> 16) & 0xff] & 0x00ff0000U) ^ (T5[(K2 >> 8) & 0xff] & 0x0000ff00U) ^ (T5[(K2 ) & 0xff] & 0x000000ffU); K3 = T4[(kappa[i] ) & 0xff] ^ (T5[(K3 >> 24) ] & 0xff000000U) ^ (T5[(K3 >> 16) & 0xff] & 0x00ff0000U) ^ (T5[(K3 >> 8) & 0xff] & 0x0000ff00U) ^ (T5[(K3 ) & 0xff] & 0x000000ffU); } ctx->E[r][0] = K0; ctx->E[r][1] = K1; ctx->E[r][2] = K2; ctx->E[r][3] = K3; /* * compute kappa^{r+1} from kappa^r: */ if (r == R) break; for (i = 0; i < N; i++) { int j = i; inter[i] = T0[(kappa[j--] >> 24) ]; if (j < 0) j = N - 1; inter[i] ^= T1[(kappa[j--] >> 16) & 0xff]; if (j < 0) j = N - 1; inter[i] ^= T2[(kappa[j--] >> 8) & 0xff]; if (j < 0) j = N - 1; inter[i] ^= T3[(kappa[j ] ) & 0xff]; } kappa[0] = inter[0] ^ rc[r]; for (i = 1; i < N; i++) kappa[i] = inter[i]; } /* * generate inverse key schedule: K'^0 = K^R, K'^R = * K^0, K'^r = theta(K^{R-r}): */ for (i = 0; i < 4; i++) { ctx->D[0][i] = ctx->E[R][i]; ctx->D[R][i] = ctx->E[0][i]; } for (r = 1; r < R; r++) { for (i = 0; i < 4; i++) { u32 v = ctx->E[R - r][i]; ctx->D[r][i] = T0[T4[(v >> 24) ] & 0xff] ^ T1[T4[(v >> 16) & 0xff] & 0xff] ^ T2[T4[(v >> 8) & 0xff] & 0xff] ^ T3[T4[(v ) & 0xff] & 0xff]; } } return 0; } static void anubis_crypt(u32 roundKey[ANUBIS_MAX_ROUNDS + 1][4], u8 *ciphertext, const u8 *plaintext, const int R) { const __be32 *src = (const __be32 *)plaintext; __be32 *dst = (__be32 *)ciphertext; int i, r; u32 state[4]; u32 inter[4]; /* * map plaintext block to cipher state (mu) * and add initial round key (sigma[K^0]): */ for (i = 0; i < 4; i++) state[i] = be32_to_cpu(src[i]) ^ roundKey[0][i]; /* * R - 1 full rounds: */ for (r = 1; r < R; r++) { inter[0] = T0[(state[0] >> 24) ] ^ T1[(state[1] >> 24) ] ^ T2[(state[2] >> 24) ] ^ T3[(state[3] >> 24) ] ^ roundKey[r][0]; inter[1] = T0[(state[0] >> 16) & 0xff] ^ T1[(state[1] >> 16) & 0xff] ^ T2[(state[2] >> 16) & 0xff] ^ T3[(state[3] >> 16) & 0xff] ^ roundKey[r][1]; inter[2] = T0[(state[0] >> 8) & 0xff] ^ T1[(state[1] >> 8) & 0xff] ^ T2[(state[2] >> 8) & 0xff] ^ T3[(state[3] >> 8) & 0xff] ^ roundKey[r][2]; inter[3] = T0[(state[0] ) & 0xff] ^ T1[(state[1] ) & 0xff] ^ T2[(state[2] ) & 0xff] ^ T3[(state[3] ) & 0xff] ^ roundKey[r][3]; state[0] = inter[0]; state[1] = inter[1]; state[2] = inter[2]; state[3] = inter[3]; } /* * last round: */ inter[0] = (T0[(state[0] >> 24) ] & 0xff000000U) ^ (T1[(state[1] >> 24) ] & 0x00ff0000U) ^ (T2[(state[2] >> 24) ] & 0x0000ff00U) ^ (T3[(state[3] >> 24) ] & 0x000000ffU) ^ roundKey[R][0]; inter[1] = (T0[(state[0] >> 16) & 0xff] & 0xff000000U) ^ (T1[(state[1] >> 16) & 0xff] & 0x00ff0000U) ^ (T2[(state[2] >> 16) & 0xff] & 0x0000ff00U) ^ (T3[(state[3] >> 16) & 0xff] & 0x000000ffU) ^ roundKey[R][1]; inter[2] = (T0[(state[0] >> 8) & 0xff] & 0xff000000U) ^ (T1[(state[1] >> 8) & 0xff] & 0x00ff0000U) ^ (T2[(state[2] >> 8) & 0xff] & 0x0000ff00U) ^ (T3[(state[3] >> 8) & 0xff] & 0x000000ffU) ^ roundKey[R][2]; inter[3] = (T0[(state[0] ) & 0xff] & 0xff000000U) ^ (T1[(state[1] ) & 0xff] & 0x00ff0000U) ^ (T2[(state[2] ) & 0xff] & 0x0000ff00U) ^ (T3[(state[3] ) & 0xff] & 0x000000ffU) ^ roundKey[R][3]; /* * map cipher state to ciphertext block (mu^{-1}): */ for (i = 0; i < 4; i++) dst[i] = cpu_to_be32(inter[i]); } static void anubis_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct anubis_ctx *ctx = crypto_tfm_ctx(tfm); anubis_crypt(ctx->E, dst, src, ctx->R); } static void anubis_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct anubis_ctx *ctx = crypto_tfm_ctx(tfm); anubis_crypt(ctx->D, dst, src, ctx->R); } static struct crypto_alg anubis_alg = { .cra_name = "anubis", .cra_driver_name = "anubis-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = ANUBIS_BLOCK_SIZE, .cra_ctxsize = sizeof (struct anubis_ctx), .cra_alignmask = 3, .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = ANUBIS_MIN_KEY_SIZE, .cia_max_keysize = ANUBIS_MAX_KEY_SIZE, .cia_setkey = anubis_setkey, .cia_encrypt = anubis_encrypt, .cia_decrypt = anubis_decrypt } } }; static int __init anubis_mod_init(void) { int ret = 0; ret = crypto_register_alg(&anubis_alg); return ret; } static void __exit anubis_mod_fini(void) { crypto_unregister_alg(&anubis_alg); } subsys_initcall(anubis_mod_init); module_exit(anubis_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Anubis Cryptographic Algorithm"); MODULE_ALIAS_CRYPTO("anubis"); |
2 2 2 2 2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 | /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/rculist.h> #include <linux/llist.h> #include "rds_single_path.h" #include "ib_mr.h" #include "rds.h" struct workqueue_struct *rds_ib_mr_wq; static void rds_ib_odp_mr_worker(struct work_struct *work); static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; struct rds_ib_ipaddr *i_ipaddr; rcu_read_lock(); list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { refcount_inc(&rds_ibdev->refcount); rcu_read_unlock(); return rds_ibdev; } } } rcu_read_unlock(); return NULL; } static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); if (!i_ipaddr) return -ENOMEM; i_ipaddr->ipaddr = ipaddr; spin_lock_irq(&rds_ibdev->spinlock); list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); spin_unlock_irq(&rds_ibdev->spinlock); return 0; } static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; struct rds_ib_ipaddr *to_free = NULL; spin_lock_irq(&rds_ibdev->spinlock); list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { list_del_rcu(&i_ipaddr->list); to_free = i_ipaddr; break; } } spin_unlock_irq(&rds_ibdev->spinlock); if (to_free) kfree_rcu(to_free, rcu); } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, struct in6_addr *ipaddr) { struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]); if (!rds_ibdev_old) return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]); rds_ib_dev_put(rds_ibdev_old); return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); } rds_ib_dev_put(rds_ibdev_old); return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* conn was previously on the nodev_conns_list */ spin_lock_irq(&ib_nodev_conns_lock); BUG_ON(list_empty(&ib_nodev_conns)); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_lock(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); spin_unlock(&rds_ibdev->spinlock); spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; refcount_inc(&rds_ibdev->refcount); } void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* place conn on nodev_conns_list */ spin_lock(&ib_nodev_conns_lock); spin_lock_irq(&rds_ibdev->spinlock); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_unlock_irq(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &ib_nodev_conns); spin_unlock(&ib_nodev_conns_lock); ic->rds_ibdev = NULL; rds_ib_dev_put(rds_ibdev); } void rds_ib_destroy_nodev_conns(void) { struct rds_ib_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&ib_nodev_conns_lock); list_splice(&ib_nodev_conns, &tmp_list); spin_unlock_irq(&ib_nodev_conns_lock); list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) rds_conn_destroy(ic->conn); } void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo->rdma_mr_max = pool_1m->max_items; iinfo->rdma_mr_size = pool_1m->max_pages; } #if IS_ENABLED(CONFIG_IPV6) void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds6_info_rdma_connection *iinfo6) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo6->rdma_mr_max = pool_1m->max_items; iinfo6->rdma_mr_size = pool_1m->max_pages; } #endif struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; unsigned long flags; spin_lock_irqsave(&pool->clean_lock, flags); ret = llist_del_first(&pool->clean_list); spin_unlock_irqrestore(&pool->clean_lock, flags); if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_reused); else rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); } return ibmr; } void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; if (ibmr->odp) return; switch (direction) { case DMA_FROM_DEVICE: ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; case DMA_TO_DEVICE: ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; } } void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { struct rds_ib_device *rds_ibdev = ibmr->device; if (ibmr->sg_dma_len) { ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, DMA_BIDIRECTIONAL); ibmr->sg_dma_len = 0; } /* Release the s/g list */ if (ibmr->sg_len) { unsigned int i; for (i = 0; i < ibmr->sg_len; ++i) { struct page *page = sg_page(&ibmr->sg[i]); /* FIXME we need a way to tell a r/w MR * from a r/o MR */ WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } kfree(ibmr->sg); ibmr->sg = NULL; ibmr->sg_len = 0; } } void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { unsigned int pinned = ibmr->sg_len; __rds_ib_teardown_mr(ibmr); if (pinned) { struct rds_ib_mr_pool *pool = ibmr->pool; atomic_sub(pinned, &pool->free_pinned); } } static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) { unsigned int item_count; item_count = atomic_read(&pool->item_count); if (free_all) return item_count; return 0; } /* * given an llist of mrs, put them all into the list_head for more processing */ static unsigned int llist_append_to_list(struct llist_head *llist, struct list_head *list) { struct rds_ib_mr *ibmr; struct llist_node *node; struct llist_node *next; unsigned int count = 0; node = llist_del_all(llist); while (node) { next = node->next; ibmr = llist_entry(node, struct rds_ib_mr, llnode); list_add_tail(&ibmr->unmap_list, list); node = next; count++; } return count; } /* * this takes a list head of mrs and turns it into linked llist nodes * of clusters. Each cluster has linked llist nodes of * MR_CLUSTER_SIZE mrs that are ready for reuse. */ static void list_to_llist_nodes(struct list_head *list, struct llist_node **nodes_head, struct llist_node **nodes_tail) { struct rds_ib_mr *ibmr; struct llist_node *cur = NULL; struct llist_node **next = nodes_head; list_for_each_entry(ibmr, list, unmap_list) { cur = &ibmr->llnode; *next = cur; next = &cur->next; } *next = NULL; *nodes_tail = cur; } /* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr; struct llist_node *clean_nodes; struct llist_node *clean_tail; LIST_HEAD(unmap_list); unsigned long unpinned = 0; unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); if (ibmr_ret) { DEFINE_WAIT(wait); while (!mutex_trylock(&pool->flush_lock)) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } prepare_to_wait(&pool->flush_wait, &wait, TASK_UNINTERRUPTIBLE); if (llist_empty(&pool->clean_list)) schedule(); ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } } finish_wait(&pool->flush_wait, &wait); } else mutex_lock(&pool->flush_lock); if (ibmr_ret) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; goto out; } } /* Get the list of all MRs to be dropped. Ordering matters - * we want to put drop_list ahead of free_list. */ dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) { unsigned long flags; spin_lock_irqsave(&pool->clean_lock, flags); llist_append_to_list(&pool->clean_list, &unmap_list); spin_unlock_irqrestore(&pool->clean_lock, flags); } free_goal = rds_ib_flush_goal(pool, free_all); if (list_empty(&unmap_list)) goto out; rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { unsigned long flags; list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail); if (ibmr_ret) { *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); clean_nodes = clean_nodes->next; } /* more than one entry in llist nodes */ if (clean_nodes) { spin_lock_irqsave(&pool->clean_lock, flags); llist_add_batch(clean_nodes, clean_tail, &pool->clean_list); spin_unlock_irqrestore(&pool->clean_lock, flags); } } atomic_sub(unpinned, &pool->free_pinned); atomic_sub(dirty_to_clean, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); out: mutex_unlock(&pool->flush_lock); if (waitqueue_active(&pool->flush_wait)) wake_up(&pool->flush_wait); out_nolock: return 0; } struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; int iter = 0; while (1) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) return ibmr; if (atomic_inc_return(&pool->item_count) <= pool->max_items) break; atomic_dec(&pool->item_count); if (++iter > 2) { if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); break; } /* We do have some empty MRs. Flush them out. */ if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); rds_ib_flush_mr_pool(pool, 0, &ibmr); if (ibmr) return ibmr; } return NULL; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); rds_ib_flush_mr_pool(pool, 0, NULL); } void rds_ib_free_mr(void *trans_private, int invalidate) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_mr_pool *pool = ibmr->pool; struct rds_ib_device *rds_ibdev = ibmr->device; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); if (ibmr->odp) { /* A MR created and marked as use_once. We use delayed work, * because there is a change that we are in interrupt and can't * call to ib_dereg_mr() directly. */ INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker); queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0); return; } /* Return it to the pool's free list */ rds_ib_free_frmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 5) queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); } } rds_ib_dev_put(rds_ibdev); } void rds_ib_flush_mrs(void) { struct rds_ib_device *rds_ibdev; down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { if (rds_ibdev->mr_8k_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); if (rds_ibdev->mr_1m_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); } up_read(&rds_ib_devices_lock); } u32 rds_ib_get_lkey(void *trans_private) { struct rds_ib_mr *ibmr = trans_private; return ibmr->u.mr->lkey; } void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, struct rds_connection *conn, u64 start, u64 length, int need_odp) { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; struct rds_ib_connection *ic = NULL; int ret; rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]); if (!rds_ibdev) { ret = -ENODEV; goto out; } if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) { u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start; int access_flags = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_ON_DEMAND); struct ib_sge sge = {}; struct ib_mr *ib_mr; if (!rds_ibdev->odp_capable) { ret = -EOPNOTSUPP; goto out; } ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr, access_flags); if (IS_ERR(ib_mr)) { rdsdebug("rds_ib_get_user_mr returned %d\n", IS_ERR(ib_mr)); ret = PTR_ERR(ib_mr); goto out; } if (key_ret) *key_ret = ib_mr->rkey; ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); if (!ibmr) { ib_dereg_mr(ib_mr); ret = -ENOMEM; goto out; } ibmr->u.mr = ib_mr; ibmr->odp = 1; sge.addr = virt_addr; sge.length = length; sge.lkey = ib_mr->lkey; ib_advise_mr(rds_ibdev->pd, IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1); return ibmr; } if (conn) ic = conn->c_transport_data; if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = -ENODEV; goto out; } ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); if (IS_ERR(ibmr)) { ret = PTR_ERR(ibmr); pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); } else { return ibmr; } out: if (rds_ibdev) rds_ib_dev_put(rds_ibdev); return ERR_PTR(ret); } void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { cancel_delayed_work_sync(&pool->flush_worker); rds_ib_flush_mr_pool(pool, 1, NULL); WARN_ON(atomic_read(&pool->item_count)); WARN_ON(atomic_read(&pool->free_pinned)); kfree(pool); } struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int pool_type) { struct rds_ib_mr_pool *pool; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return ERR_PTR(-ENOMEM); pool->pool_type = pool_type; init_llist_head(&pool->free_list); init_llist_head(&pool->drop_list); init_llist_head(&pool->clean_list); spin_lock_init(&pool->clean_lock); mutex_init(&pool->flush_lock); init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ pool->max_pages = RDS_MR_1M_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ pool->max_pages = RDS_MR_8K_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_8k_mrs; } pool->max_free_pinned = pool->max_items * pool->max_pages / 4; pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; return pool; } int rds_ib_mr_init(void) { rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; } /* By the time this is called all the IB devices should have been torn down and * had their pools freed. As each pool is freed its work struct is waited on, * so the pool flushing work queue should be idle by the time we get here. */ void rds_ib_mr_exit(void) { destroy_workqueue(rds_ib_mr_wq); } static void rds_ib_odp_mr_worker(struct work_struct *work) { struct rds_ib_mr *ibmr; ibmr = container_of(work, struct rds_ib_mr, work.work); ib_dereg_mr(ibmr->u.mr); kfree(ibmr); } |
1 15 5 15 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor policy definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #ifndef __AA_NAMESPACE_H #define __AA_NAMESPACE_H #include <linux/kref.h> #include "apparmor.h" #include "apparmorfs.h" #include "label.h" #include "policy.h" /* struct aa_ns_acct - accounting of profiles in namespace * @max_size: maximum space allowed for all profiles in namespace * @max_count: maximum number of profiles that can be in this namespace * @size: current size of profiles * @count: current count of profiles (includes null profiles) */ struct aa_ns_acct { int max_size; int max_count; int size; int count; }; /* struct aa_ns - namespace for a set of profiles * @base: common policy * @parent: parent of namespace * @lock: lock for modifying the object * @acct: accounting for the namespace * @unconfined: special unconfined profile for the namespace * @sub_ns: list of namespaces under the current namespace. * @uniq_null: uniq value used for null learning profiles * @uniq_id: a unique id count for the profiles in the namespace * @level: level of ns within the tree hierarchy * @dents: dentries for the namespaces file entries in apparmorfs * * An aa_ns defines the set profiles that are searched to determine which * profile to attach to a task. Profiles can not be shared between aa_ns * and profile names within a namespace are guaranteed to be unique. When * profiles in separate namespaces have the same name they are NOT considered * to be equivalent. * * Namespaces are hierarchical and only namespaces and profiles below the * current namespace are visible. * * Namespace names must be unique and can not contain the characters :/\0 */ struct aa_ns { struct aa_policy base; struct aa_ns *parent; struct mutex lock; struct aa_ns_acct acct; struct aa_profile *unconfined; struct list_head sub_ns; atomic_t uniq_null; long uniq_id; int level; long revision; wait_queue_head_t wait; struct aa_labelset labels; struct list_head rawdata_list; struct dentry *dents[AAFS_NS_SIZEOF]; }; extern struct aa_label *kernel_t; extern struct aa_ns *root_ns; extern const char *aa_hidden_ns_name; #define ns_unconfined(NS) (&(NS)->unconfined->label) bool aa_ns_visible(struct aa_ns *curr, struct aa_ns *view, bool subns); const char *aa_ns_name(struct aa_ns *parent, struct aa_ns *child, bool subns); void aa_free_ns(struct aa_ns *ns); int aa_alloc_root_ns(void); void aa_free_root_ns(void); struct aa_ns *__aa_lookupn_ns(struct aa_ns *view, const char *hname, size_t n); struct aa_ns *aa_lookupn_ns(struct aa_ns *view, const char *name, size_t n); struct aa_ns *__aa_find_or_create_ns(struct aa_ns *parent, const char *name, struct dentry *dir); struct aa_ns *aa_prepare_ns(struct aa_ns *root, const char *name); void __aa_remove_ns(struct aa_ns *ns); static inline struct aa_profile *aa_deref_parent(struct aa_profile *p) { return rcu_dereference_protected(p->parent, mutex_is_locked(&p->ns->lock)); } /** * aa_get_ns - increment references count on @ns * @ns: namespace to increment reference count of (MAYBE NULL) * * Returns: pointer to @ns, if @ns is NULL returns NULL * Requires: @ns must be held with valid refcount when called */ static inline struct aa_ns *aa_get_ns(struct aa_ns *ns) { if (ns) aa_get_profile(ns->unconfined); return ns; } /** * aa_put_ns - decrement refcount on @ns * @ns: namespace to put reference of * * Decrement reference count of @ns and if no longer in use free it */ static inline void aa_put_ns(struct aa_ns *ns) { if (ns) aa_put_profile(ns->unconfined); } /** * __aa_findn_ns - find a namespace on a list by @name * @head: list to search for namespace on (NOT NULL) * @name: name of namespace to look for (NOT NULL) * @n: length of @name * Returns: unrefcounted namespace * * Requires: rcu_read_lock be held */ static inline struct aa_ns *__aa_findn_ns(struct list_head *head, const char *name, size_t n) { return (struct aa_ns *)__policy_strn_find(head, name, n); } static inline struct aa_ns *__aa_find_ns(struct list_head *head, const char *name) { return __aa_findn_ns(head, name, strlen(name)); } #endif /* AA_NAMESPACE_H */ |
91 25 110 12 232 302 303 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Connection state tracking for netfilter. This is separated from, * but required by, the (future) NAT layer; it can also be used by an iptables * extension. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack.h */ #ifndef _NF_CONNTRACK_H #define _NF_CONNTRACK_H #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/nf_conntrack_dccp.h> #include <linux/netfilter/nf_conntrack_sctp.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <net/netfilter/nf_conntrack_tuple.h> struct nf_ct_udp { unsigned long stream_ts; }; /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ struct nf_ct_dccp dccp; struct ip_ct_sctp sctp; struct ip_ct_tcp tcp; struct nf_ct_udp udp; struct nf_ct_gre gre; unsigned int tmpl_padto; }; union nf_conntrack_expect_proto { /* insert expect proto private data here */ }; struct nf_conntrack_net_ecache { struct delayed_work dwork; spinlock_t dying_lock; struct hlist_nulls_head dying_list; }; struct nf_conntrack_net { /* only used when new connection is allocated: */ atomic_t count; unsigned int expect_count; /* only used from work queues, configuration plane, and so on: */ unsigned int users4; unsigned int users6; unsigned int users_bridge; #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_net_ecache ecache; #endif }; #include <linux/types.h> #include <linux/skbuff.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> struct nf_conn { /* Usage count in here is 1 for hash table, 1 per skb, * plus 1 for any connection(s) we are `master' for * * Hint, SKB address this struct and refcnt via skb->_nfct and * helpers nf_conntrack_get() and nf_conntrack_put(). * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt, * except that the latter uses internal indirection and does not * result in a conntrack module dependency. * beware nf_ct_get() is different and don't inc refcnt. */ struct nf_conntrack ct_general; spinlock_t lock; /* jiffies32 when this ct is considered dead */ u32 timeout; #ifdef CONFIG_NF_CONNTRACK_ZONES struct nf_conntrack_zone zone; #endif /* XXX should I move this to the tail ? - Y.K */ /* These are my tuples; original and reply */ struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; /* Have we seen traffic both ways yet? (bitset) */ unsigned long status; possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) struct hlist_node nat_bysource; #endif /* all members below initialized via memset */ struct { } __nfct_init_offset; /* If we were expected by an expectation, this will be it */ struct nf_conn *master; #if defined(CONFIG_NF_CONNTRACK_MARK) u_int32_t mark; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK u_int32_t secmark; #endif /* Extensions */ struct nf_ct_ext *ext; /* Storage reserved for other modules, must be the last member */ union nf_conntrack_proto proto; }; static inline struct nf_conn * nf_ct_to_nf_conn(const struct nf_conntrack *nfct) { return container_of(nfct, struct nf_conn, ct_general); } static inline struct nf_conn * nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash) { return container_of(hash, struct nf_conn, tuplehash[hash->tuple.dst.dir]); } static inline u_int16_t nf_ct_l3num(const struct nf_conn *ct) { return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; } static inline u_int8_t nf_ct_protonum(const struct nf_conn *ct) { return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; } #define nf_ct_tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) /* get master conntrack via master expectation */ #define master_ct(conntr) (conntr->master) extern struct net init_net; static inline struct net *nf_ct_net(const struct nf_conn *ct) { return read_pnet(&ct->ct_net); } /* Is this tuple taken? (ignoring any belonging to the given conntrack). */ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack); /* Return conntrack_info and tuple hash for given skb. */ static inline struct nf_conn * nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) { unsigned long nfct = skb_get_nfct(skb); *ctinfo = nfct & NFCT_INFOMASK; return (struct nf_conn *)(nfct & NFCT_PTRMASK); } void nf_ct_destroy(struct nf_conntrack *nfct); void nf_conntrack_tcp_set_closing(struct nf_conn *ct); /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { if (ct && refcount_dec_and_test(&ct->ct_general.use)) nf_ct_destroy(&ct->ct_general); } /* load module; enable/disable conntrack in this namespace */ int nf_ct_netns_get(struct net *net, u8 nfproto); void nf_ct_netns_put(struct net *net, u8 nfproto); /* * Allocate a hashtable of hlist_head (if nulls == 0), * or hlist_nulls_head (if nulls == 1) */ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls); int nf_conntrack_hash_check_insert(struct nf_conn *ct); bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple); void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, u32 extra_jiffies, bool do_acct); /* Refresh conntrack for this many jiffies and do accounting */ static inline void nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, u32 extra_jiffies) { __nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, true); } /* Refresh conntrack for this many jiffies */ static inline void nf_ct_refresh(struct nf_conn *ct, const struct sk_buff *skb, u32 extra_jiffies) { __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, false); } /* kill conntrack and do accounting */ bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb); /* kill conntrack without accounting */ static inline bool nf_ct_kill(struct nf_conn *ct) { return nf_ct_delete(ct, 0, 0); } struct nf_ct_iter_data { struct net *net; void *data; u32 portid; int report; }; /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), const struct nf_ct_iter_data *iter_data); /* also set unconfirmed conntracks as dying. Only use in module exit path. */ void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data); struct nf_conntrack_zone; void nf_conntrack_free(struct nf_conn *ct); struct nf_conn *nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp); static inline int nf_ct_is_template(const struct nf_conn *ct) { return test_bit(IPS_TEMPLATE_BIT, &ct->status); } /* It's confirmed if it is, or has been in the hash table. */ static inline int nf_ct_is_confirmed(const struct nf_conn *ct) { return test_bit(IPS_CONFIRMED_BIT, &ct->status); } static inline int nf_ct_is_dying(const struct nf_conn *ct) { return test_bit(IPS_DYING_BIT, &ct->status); } /* Packet is received from loopback */ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) { return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } static inline void nf_conntrack_alter_reply(struct nf_conn *ct, const struct nf_conntrack_tuple *newreply) { /* Must be unconfirmed, so not in hash table yet */ if (WARN_ON(nf_ct_is_confirmed(ct))) return; ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; } #define nfct_time_stamp ((u32)(jiffies)) /* jiffies until ct expires, 0 if already expired */ static inline unsigned long nf_ct_expires(const struct nf_conn *ct) { s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; return max(timeout, 0); } static inline bool nf_ct_is_expired(const struct nf_conn *ct) { return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0; } /* use after obtaining a reference count */ static inline bool nf_ct_should_gc(const struct nf_conn *ct) { return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct); } #define NF_CT_DAY (86400 * HZ) /* Set an arbitrary timeout large enough not to ever expire, this save * us a check for the IPS_OFFLOAD_BIT from the packet path via * nf_ct_is_expired(). */ static inline void nf_ct_offload_timeout(struct nf_conn *ct) { if (nf_ct_expires(ct) < NF_CT_DAY / 2) WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY); } struct kernel_param; int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp); int nf_conntrack_hash_resize(unsigned int hashsize); extern struct hlist_nulls_head *nf_conntrack_hash; extern unsigned int nf_conntrack_htable_size; extern seqcount_spinlock_t nf_conntrack_generation; extern unsigned int nf_conntrack_max; /* must be called with rcu read lock held */ static inline void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) { struct hlist_nulls_head *hptr; unsigned int sequence, hsz; do { sequence = read_seqcount_begin(&nf_conntrack_generation); hsz = nf_conntrack_htable_size; hptr = nf_conntrack_hash; } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); *hash = hptr; *hsize = hsz; } struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags); void nf_ct_tmpl_free(struct nf_conn *tmpl); u32 nf_ct_get_id(const struct nf_conn *ct); u32 nf_conntrack_count(const struct net *net); static inline void nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) { skb_set_nfct(skb, (unsigned long)ct | info); } extern unsigned int nf_conntrack_net_id; static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net) { return net_generic(net, nf_conntrack_net_id); } int nf_ct_skb_network_trim(struct sk_buff *skb, int family); int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, u16 zone, u8 family, u8 *proto, u16 *mru); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v)) #define MODULE_ALIAS_NFCT_HELPER(helper) \ MODULE_ALIAS("nfct-helper-" helper) #endif /* _NF_CONNTRACK_H */ |
877 877 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include "ipvlan.h" #include <linux/if_vlan.h> #include <linux/if_tap.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6) static dev_t ipvtap_major; static struct cdev ipvtap_cdev; static const void *ipvtap_net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d->parent); return dev_net(dev); } static struct class ipvtap_class = { .name = "ipvtap", .ns_type = &net_ns_type_operations, .namespace = ipvtap_net_namespace, }; struct ipvtap_dev { struct ipvl_dev vlan; struct tap_dev tap; }; static void ipvtap_count_tx_dropped(struct tap_dev *tap) { struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap); struct ipvl_dev *vlan = &vlantap->vlan; this_cpu_inc(vlan->pcpu_stats->tx_drps); } static void ipvtap_count_rx_dropped(struct tap_dev *tap) { struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap); struct ipvl_dev *vlan = &vlantap->vlan; ipvlan_count_rx(vlan, 0, 0, 0); } static void ipvtap_update_features(struct tap_dev *tap, netdev_features_t features) { struct ipvtap_dev *vlantap = container_of(tap, struct ipvtap_dev, tap); struct ipvl_dev *vlan = &vlantap->vlan; vlan->sfeatures = features; netdev_update_features(vlan->dev); } static int ipvtap_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ipvtap_dev *vlantap = netdev_priv(dev); int err; INIT_LIST_HEAD(&vlantap->tap.queue_list); /* Since macvlan supports all offloads by default, make * tap support all offloads also. */ vlantap->tap.tap_features = TUN_OFFLOADS; vlantap->tap.count_tx_dropped = ipvtap_count_tx_dropped; vlantap->tap.update_features = ipvtap_update_features; vlantap->tap.count_rx_dropped = ipvtap_count_rx_dropped; err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); if (err) return err; /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ err = ipvlan_link_new(src_net, dev, tb, data, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return err; } static void ipvtap_dellink(struct net_device *dev, struct list_head *head) { struct ipvtap_dev *vlan = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlan->tap); ipvlan_link_delete(dev, head); } static void ipvtap_setup(struct net_device *dev) { ipvlan_link_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; dev->priv_flags &= ~IFF_NO_QUEUE; } static struct rtnl_link_ops ipvtap_link_ops __read_mostly = { .kind = "ipvtap", .setup = ipvtap_setup, .newlink = ipvtap_newlink, .dellink = ipvtap_dellink, .priv_size = sizeof(struct ipvtap_dev), }; static int ipvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ipvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &ipvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. */ err = tap_get_minor(ipvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor); classdev = device_create(&ipvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(ipvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor); device_destroy(&ipvtap_class, devt); tap_free_minor(ipvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block ipvtap_notifier_block __read_mostly = { .notifier_call = ipvtap_device_event, }; static int __init ipvtap_init(void) { int err; err = tap_create_cdev(&ipvtap_cdev, &ipvtap_major, "ipvtap", THIS_MODULE); if (err) goto out1; err = class_register(&ipvtap_class); if (err) goto out2; err = register_netdevice_notifier(&ipvtap_notifier_block); if (err) goto out3; err = ipvlan_link_register(&ipvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&ipvtap_notifier_block); out3: class_unregister(&ipvtap_class); out2: tap_destroy_cdev(ipvtap_major, &ipvtap_cdev); out1: return err; } module_init(ipvtap_init); static void __exit ipvtap_exit(void) { rtnl_link_unregister(&ipvtap_link_ops); unregister_netdevice_notifier(&ipvtap_notifier_block); class_unregister(&ipvtap_class); tap_destroy_cdev(ipvtap_major, &ipvtap_cdev); } module_exit(ipvtap_exit); MODULE_ALIAS_RTNL_LINK("ipvtap"); MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>"); MODULE_DESCRIPTION("IP-VLAN based tap driver"); MODULE_LICENSE("GPL"); |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 64 65 1 66 65 64 66 50 50 16 49 50 50 50 50 50 49 47 1 1 1 1 65 65 15 14 14 254 254 66 1 65 66 65 66 65 65 66 66 66 66 13 5 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 | // SPDX-License-Identifier: GPL-2.0-or-later #include <net/gro.h> #include <net/dst_metadata.h> #include <net/busy_poll.h> #include <trace/events/net.h> #include <linux/skbuff_ref.h> #define MAX_GRO_SKBS 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); /** * dev_add_offload - register offload handlers * @po: protocol offload declaration * * Add protocol offload handlers to the networking stack. The passed * &proto_offload is linked into kernel lists and may not be freed until * it has been removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new offload handlers (until the next received packet). */ void dev_add_offload(struct packet_offload *po) { struct packet_offload *elem; spin_lock(&offload_lock); list_for_each_entry(elem, &net_hotdata.offload_base, list) { if (po->priority < elem->priority) break; } list_add_rcu(&po->list, elem->list.prev); spin_unlock(&offload_lock); } EXPORT_SYMBOL(dev_add_offload); /** * __dev_remove_offload - remove offload handler * @po: packet offload declaration * * Remove a protocol offload handler that was previously added to the * kernel offload handlers by dev_add_offload(). The passed &offload_type * is removed from the kernel lists and can be freed or reused once this * function returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ static void __dev_remove_offload(struct packet_offload *po) { struct list_head *head = &net_hotdata.offload_base; struct packet_offload *po1; spin_lock(&offload_lock); list_for_each_entry(po1, head, list) { if (po == po1) { list_del_rcu(&po->list); goto out; } } pr_warn("dev_remove_offload: %p not found\n", po); out: spin_unlock(&offload_lock); } /** * dev_remove_offload - remove packet offload handler * @po: packet offload declaration * * Remove a packet offload handler that was previously added to the kernel * offload handlers by dev_add_offload(). The passed &offload_type is * removed from the kernel lists and can be freed or reused once this * function returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_offload(struct packet_offload *po) { __dev_remove_offload(po); synchronize_net(); } EXPORT_SYMBOL(dev_remove_offload); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; unsigned int new_truesize; struct sk_buff *lp; int segs; /* Do not splice page pool based packets w/ non-page pool * packets. This can result in reference count issues as page * pool pages will not decrement the reference count and will * instead be immediately returned to the pool or have frag * count decremented. */ if (p->pp_recycle != skb->pp_recycle) return -ETOOMANYREFS; if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) || NAPI_GRO_CB(skb)->flush)) return -E2BIG; if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || (p->protocol == htons(ETH_P_IPV6) && skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } segs = NAPI_GRO_CB(skb)->count; lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); if (headlen <= offset) { skb_frag_t *frag; skb_frag_t *frag2; int i = skbinfo->nr_frags; int nr_frags = pinfo->nr_frags + i; if (nr_frags > MAX_SKB_FRAGS) goto merge; offset -= headlen; pinfo->nr_frags = nr_frags; skbinfo->nr_frags = 0; frag = pinfo->frags + nr_frags; frag2 = skbinfo->frags + i; do { *--frag = *--frag2; } while (--i); skb_frag_off_add(frag, offset); skb_frag_size_sub(frag, offset); /* all fragments truesize : remove (head size + sk_buff) */ new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); delta_truesize = skb->truesize - new_truesize; skb->truesize = new_truesize; skb->len -= skb->data_len; skb->data_len = 0; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; goto done; } else if (skb->head_frag) { int nr_frags = pinfo->nr_frags; skb_frag_t *frag = pinfo->frags + nr_frags; struct page *page = virt_to_head_page(skb->head); unsigned int first_size = headlen - offset; unsigned int first_offset; if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) goto merge; first_offset = skb->data - (unsigned char *)page_address(page) + offset; pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; skb_frag_fill_page_desc(frag, page, first_offset, first_size); memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); /* We dont need to clear skbinfo->nr_frags here */ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); delta_truesize = skb->truesize - new_truesize; skb->truesize = new_truesize; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; } merge: /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; skb->sk = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; skb_frag_off_add(&skbinfo->frags[0], eat); skb_frag_size_sub(&skbinfo->frags[0], eat); skb->data_len -= eat; skb->len -= eat; offset = headlen; } __skb_pull(skb, offset); if (NAPI_GRO_CB(p)->last == p) skb_shinfo(p)->frag_list = skb; else NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; __skb_header_release(skb); lp = p; done: NAPI_GRO_CB(p)->count += segs; p->data_len += len; p->truesize += delta_truesize; p->len += len; if (lp != p) { lp->data_len += len; lp->truesize += delta_truesize; lp->len += len; } NAPI_GRO_CB(skb)->same_flow = 1; return 0; } int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) { if (unlikely(p->len + skb->len >= 65536)) return -E2BIG; if (NAPI_GRO_CB(p)->last == p) skb_shinfo(p)->frag_list = skb; else NAPI_GRO_CB(p)->last->next = skb; skb_pull(skb, skb_gro_offset(skb)); NAPI_GRO_CB(p)->last = skb; NAPI_GRO_CB(p)->count++; p->data_len += skb->len; /* sk ownership - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; skb->sk = NULL; p->truesize += skb->truesize; p->len += skb->len; NAPI_GRO_CB(skb)->same_flow = 1; return 0; } static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); if (NAPI_GRO_CB(skb)->count == 1) { skb_shinfo(skb)->gso_size = 0; goto out; } rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) continue; err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, ipv6_gro_complete, inet_gro_complete, skb, 0); break; } rcu_read_unlock(); if (err) { WARN_ON(&ptype->list == head); kfree_skb(skb); return; } out: gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); } static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, bool flush_old) { struct list_head *head = &napi->gro_hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); napi_gro_complete(napi, skb); napi->gro_hash[index].count--; } if (!napi->gro_hash[index].count) __clear_bit(index, &napi->gro_bitmask); } /* napi->gro_hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { unsigned long bitmask = napi->gro_bitmask; unsigned int i, base = ~0U; while ((i = ffs(bitmask)) != 0) { bitmask >>= i; base += i; __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, const struct sk_buff *p, unsigned long diffs) { #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) struct tc_skb_ext *skb_ext; struct tc_skb_ext *p_ext; skb_ext = skb_ext_find(skb, TC_SKB_EXT); p_ext = skb_ext_find(p, TC_SKB_EXT); diffs |= (!!p_ext) ^ (!!skb_ext); if (!diffs && unlikely(skb_ext)) diffs |= p_ext->chain ^ skb_ext->chain; #endif return diffs; } static void gro_list_prepare(const struct list_head *head, const struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); struct sk_buff *p; list_for_each_entry(p, head, list) { unsigned long diffs; if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_all ^ skb->vlan_all; diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); else if (!diffs) diffs = memcmp(skb_mac_header(p), skb_mac_header(skb), maclen); /* in most common scenarios 'slow_gro' is 0 * otherwise we are already on some slower paths * either skip all the infrequent tests altogether or * avoid trying too hard to skip each of them individually */ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { diffs |= p->sk != skb->sk; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); diffs |= gro_list_prepare_tc_ext(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; } } static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { const struct skb_shared_info *pinfo; const skb_frag_t *frag0; unsigned int headlen; NAPI_GRO_CB(skb)->network_offset = 0; NAPI_GRO_CB(skb)->data_offset = 0; headlen = skb_headlen(skb); NAPI_GRO_CB(skb)->frag0 = skb->data; NAPI_GRO_CB(skb)->frag0_len = headlen; if (headlen) return; pinfo = skb_shinfo(skb); frag0 = &pinfo->frags[0]; if (pinfo->nr_frags && skb_frag_page(frag0) && !PageHighMem(skb_frag_page(frag0)) && (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), skb->end - skb->tail); } } static void gro_pull_from_frag0(struct sk_buff *skb, int grow) { struct skb_shared_info *pinfo = skb_shinfo(skb); BUG_ON(skb->end - skb->tail < grow); memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); skb->data_len -= grow; skb->tail += grow; skb_frag_off_add(&pinfo->frags[0], grow); skb_frag_size_sub(&pinfo->frags[0], grow); if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { skb_frag_unref(skb, 0); memmove(pinfo->frags, pinfo->frags + 1, --pinfo->nr_frags * sizeof(pinfo->frags[0])); } } static void gro_try_pull_from_frag0(struct sk_buff *skb) { int grow = skb_gro_offset(skb) - skb_headlen(skb); if (grow > 0) gro_pull_from_frag0(skb, grow); } static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) { struct sk_buff *oldest; oldest = list_last_entry(head, struct sk_buff, list); /* We are called with head length >= MAX_GRO_SKBS, so this is * impossible. */ if (WARN_ON_ONCE(!oldest)) return; /* Do not adjust napi->gro_hash[].count, caller is adding a new * SKB to the chain. */ skb_list_del_init(oldest); napi_gro_complete(napi, oldest); } static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; if (netif_elide_gro(skb->dev)) goto normal; gro_list_prepare(&gro_list->list, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type == type && ptype->callbacks.gro_receive) goto found_ptype; } rcu_read_unlock(); goto normal; found_ptype: skb_set_network_header(skb, skb_gro_offset(skb)); skb_reset_mac_len(skb); BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; /* Only support TCP and non DODGY users. */ if (!skb_is_gso_tcp(skb) || (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)) NAPI_GRO_CB(skb)->flush = 1; } /* Setup for GRO checksum validation */ switch (skb->ip_summed) { case CHECKSUM_COMPLETE: NAPI_GRO_CB(skb)->csum = skb->csum; NAPI_GRO_CB(skb)->csum_valid = 1; break; case CHECKSUM_UNNECESSARY: NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; break; } pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, &gro_list->list, skb); rcu_read_unlock(); if (PTR_ERR(pp) == -EINPROGRESS) { ret = GRO_CONSUMED; goto ok; } same_flow = NAPI_GRO_CB(skb)->same_flow; ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { skb_list_del_init(pp); napi_gro_complete(napi, pp); gro_list->count--; } if (same_flow) goto ok; if (NAPI_GRO_CB(skb)->flush) goto normal; if (unlikely(gro_list->count >= MAX_GRO_SKBS)) gro_flush_oldest(napi, &gro_list->list); else gro_list->count++; /* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */ gro_try_pull_from_frag0(skb); NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; if (!skb_is_gso(skb)) skb_shinfo(skb)->gso_size = skb_gro_len(skb); list_add(&skb->list, &gro_list->list); ret = GRO_HELD; ok: if (gro_list->count) { if (!test_bit(bucket, &napi->gro_bitmask)) __set_bit(bucket, &napi->gro_bitmask); } else if (test_bit(bucket, &napi->gro_bitmask)) { __clear_bit(bucket, &napi->gro_bitmask); } return ret; normal: ret = GRO_NORMAL; gro_try_pull_from_frag0(skb); goto ok; } struct packet_offload *gro_find_receive_by_type(__be16 type) { struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { if (ptype->type != type || !ptype->callbacks.gro_receive) continue; return ptype; } return NULL; } EXPORT_SYMBOL(gro_find_receive_by_type); struct packet_offload *gro_find_complete_by_type(__be16 type) { struct list_head *offload_head = &net_hotdata.offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) continue; return ptype; } return NULL; } EXPORT_SYMBOL(gro_find_complete_by_type); static gro_result_t napi_skb_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __napi_kfree_skb(skb, SKB_CONSUMED); break; case GRO_HELD: case GRO_MERGED: case GRO_CONSUMED: break; } return ret; } gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { gro_result_t ret; skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb, 0); ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); return ret; } EXPORT_SYMBOL(napi_gro_receive); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { if (unlikely(skb->pfmemalloc)) { consume_skb(skb); return; } __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; /* eth_type_trans() assumes pkt_type is PACKET_HOST */ skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); nf_reset_ct(skb); skb->slow_gro = 0; } napi->skb = skb; } struct sk_buff *napi_get_frags(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); if (skb) { napi->skb = skb; skb_mark_napi_id(skb, napi); } } return skb; } EXPORT_SYMBOL(napi_get_frags); static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: case GRO_HELD: __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else napi_reuse_skb(napi, skb); break; case GRO_MERGED: case GRO_CONSUMED: break; } return ret; } /* Upper GRO stack assumes network header starts at gro_offset=0 * Drivers could call both napi_gro_frags() and napi_gro_receive() * We copy ethernet header into skb->data to have a common layout. */ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; const struct ethhdr *eth; unsigned int hlen = sizeof(*eth); napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb, hlen); if (unlikely(!skb_gro_may_pull(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { net_warn_ratelimited("%s: dropping impossible skb from %s\n", __func__, napi->dev->name); napi_reuse_skb(napi, skb); return NULL; } } else { eth = (const struct ethhdr *)skb->data; if (NAPI_GRO_CB(skb)->frag0 != skb->data) gro_pull_from_frag0(skb, hlen); NAPI_GRO_CB(skb)->frag0 += hlen; NAPI_GRO_CB(skb)->frag0_len -= hlen; } __skb_pull(skb, hlen); /* * This works because the only protocols we care about don't require * special handling. * We'll fix it up properly in napi_frags_finish() */ skb->protocol = eth->h_proto; return skb; } gro_result_t napi_gro_frags(struct napi_struct *napi) { gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); trace_napi_gro_frags_entry(skb); ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_frags_exit(ret); return ret; } EXPORT_SYMBOL(napi_gro_frags); /* Compute the checksum from gro_offset and return the folded value * after adding in any pseudo checksum. */ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) { __wsum wsum; __sum16 sum; wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; NAPI_GRO_CB(skb)->csum_valid = 1; return sum; } EXPORT_SYMBOL(__skb_gro_checksum_complete); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 2020 Intel Corporation. */ #ifndef XSK_BUFF_POOL_H_ #define XSK_BUFF_POOL_H_ #include <linux/if_xdp.h> #include <linux/types.h> #include <linux/dma-mapping.h> #include <linux/bpf.h> #include <net/xdp.h> struct xsk_buff_pool; struct xdp_rxq_info; struct xsk_cb_desc; struct xsk_queue; struct xdp_desc; struct xdp_umem; struct xdp_sock; struct device; struct page; #define XSK_PRIV_MAX 24 struct xdp_buff_xsk { struct xdp_buff xdp; u8 cb[XSK_PRIV_MAX]; dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; struct list_head list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) #define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) struct xsk_dma_map { dma_addr_t *dma_pages; struct device *dev; struct net_device *netdev; refcount_t users; struct list_head list; /* Protected by the RTNL_LOCK */ u32 dma_pages_cnt; }; struct xsk_buff_pool { /* Members only used in the control path first. */ struct device *dev; struct net_device *netdev; struct list_head xsk_tx_list; /* Protects modifications to the xsk_tx_list */ spinlock_t xsk_tx_list_lock; refcount_t users; struct xdp_umem *umem; struct work_struct work; struct list_head free_list; struct list_head xskb_list; u32 heads_cnt; u16 queue_id; /* Data path members as close to free_heads at the end as possible. */ struct xsk_queue *fq ____cacheline_aligned_in_smp; struct xsk_queue *cq; /* For performance reasons, each buff pool has its own array of dma_pages * even when they are identical. */ dma_addr_t *dma_pages; struct xdp_buff_xsk *heads; struct xdp_desc *tx_descs; u64 chunk_mask; u64 addrs_cnt; u32 free_list_cnt; u32 dma_pages_cnt; u32 free_heads_cnt; u32 headroom; u32 chunk_size; u32 chunk_shift; u32 frame_len; u32 xdp_zc_max_segs; u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; bool unaligned; bool tx_sw_csum; void *addrs; /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when * sockets share a single cq when the same netdev and queue id is shared. */ spinlock_t cq_lock; struct xdp_buff_xsk *free_heads[]; }; /* Masks for xdp_umem_page flags. * The low 12-bits of the addr will be 0 since this is the page address, so we * can use them for flags. */ #define XSK_NEXT_PG_CONTIG_SHIFT 0 #define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) /* AF_XDP core. */ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem); int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, u16 queue_id, u16 flags); int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct net_device *dev, u16 queue_id); int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_destroy(struct xsk_buff_pool *pool); void xp_get_pool(struct xsk_buff_pool *pool); bool xp_put_pool(struct xsk_buff_pool *pool); void xp_clear_dev(struct xsk_buff_pool *pool); void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, u64 addr) { xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; } static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, dma_addr_t *dma_pages, u64 addr) { xskb->frame_dma = (dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); xskb->dma = xskb->frame_dma + pool->headroom + XDP_PACKET_HEADROOM; } /* AF_XDP ZC drivers, via xdp_sock_buff.h */ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc); int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages); void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) { return xskb->dma; } static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) { return xskb->frame_dma; } static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { dma_sync_single_for_cpu(xskb->pool->dev, xskb->dma, xskb->pool->frame_len, DMA_BIDIRECTIONAL); } static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { dma_sync_single_for_device(pool->dev, dma, size, DMA_BIDIRECTIONAL); } /* Masks for xdp_umem_page flags. * The low 12-bits of the addr will be 0 since this is the page address, so we * can use them for flags. */ #define XSK_NEXT_PG_CONTIG_SHIFT 0 #define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, u64 addr, u32 len) { bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; if (likely(!cross_pg)) return false; return pool->dma_pages && !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } static inline bool xp_mb_desc(struct xdp_desc *desc) { return desc->options & XDP_PKT_CONTD; } static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) { return addr & pool->chunk_mask; } static inline u64 xp_unaligned_extract_addr(u64 addr) { return addr & XSK_UNALIGNED_BUF_ADDR_MASK; } static inline u64 xp_unaligned_extract_offset(u64 addr) { return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; } static inline u64 xp_unaligned_add_offset_to_addr(u64 addr) { return xp_unaligned_extract_addr(addr) + xp_unaligned_extract_offset(addr); } static inline u32 xp_aligned_extract_idx(struct xsk_buff_pool *pool, u64 addr) { return xp_aligned_extract_addr(pool, addr) >> pool->chunk_shift; } static inline void xp_release(struct xdp_buff_xsk *xskb) { if (xskb->pool->unaligned) xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; } static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool) { u64 orig_addr = xskb->xdp.data - pool->addrs; u64 offset; if (!pool->unaligned) return orig_addr; offset = xskb->xdp.data - xskb->xdp.data_hard_start; orig_addr -= offset; offset += pool->headroom; return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) { return pool->tx_metadata_len > 0; } #endif /* XSK_BUFF_POOL_H_ */ |
2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | // SPDX-License-Identifier: GPL-2.0 /* * Block stat tracking code * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/rculist.h> #include "blk-stat.h" #include "blk-mq.h" #include "blk.h" struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; stat->batch = 0; } /* src is a per-cpu stat, mean isn't initialized */ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { if (dst->nr_samples + src->nr_samples <= dst->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, dst->nr_samples + src->nr_samples); dst->nr_samples += src->nr_samples; } void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); stat->batch += value; stat->nr_samples++; } void blk_stat_add(struct request *rq, u64 now) { struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket, cpu; u64 value; value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; rcu_read_lock(); cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (!blk_stat_is_active(cb)) continue; bucket = cb->bucket_fn(rq); if (bucket < 0) continue; stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; blk_rq_stat_add(stat, value); } put_cpu(); rcu_read_unlock(); } static void blk_stat_timer_fn(struct timer_list *t) { struct blk_stat_callback *cb = from_timer(cb, t, timer); unsigned int bucket; int cpu; for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cb->stat[bucket]); for_each_online_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) { blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); blk_rq_stat_init(&cpu_stat[bucket]); } } cb->timer_fn(cb); } struct blk_stat_callback * blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), int (*bucket_fn)(const struct request *), unsigned int buckets, void *data) { struct blk_stat_callback *cb; cb = kmalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return NULL; cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), GFP_KERNEL); if (!cb->stat) { kfree(cb); return NULL; } cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), __alignof__(struct blk_rq_stat)); if (!cb->cpu_stat) { kfree(cb->stat); kfree(cb); return NULL; } cb->timer_fn = timer_fn; cb->bucket_fn = bucket_fn; cb->data = data; cb->buckets = buckets; timer_setup(&cb->timer, blk_stat_timer_fn, 0); return cb; } void blk_stat_add_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned int bucket; unsigned long flags; int cpu; for_each_possible_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cpu_stat[bucket]); } spin_lock_irqsave(&q->stats->lock, flags); list_add_tail_rcu(&cb->list, &q->stats->callbacks); blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } void blk_stat_remove_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); del_timer_sync(&cb->timer); } static void blk_stat_free_callback_rcu(struct rcu_head *head) { struct blk_stat_callback *cb; cb = container_of(head, struct blk_stat_callback, rcu); free_percpu(cb->cpu_stat); kfree(cb->stat); kfree(cb); } void blk_stat_free_callback(struct blk_stat_callback *cb) { if (cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } void blk_stat_disable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!--q->stats->accounting && list_empty(&q->stats->callbacks)) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); void blk_stat_enable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!q->stats->accounting++ && list_empty(&q->stats->callbacks)) blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); struct blk_queue_stats *blk_alloc_queue_stats(void) { struct blk_queue_stats *stats; stats = kmalloc(sizeof(*stats), GFP_KERNEL); if (!stats) return NULL; INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); stats->accounting = 0; return stats; } void blk_free_queue_stats(struct blk_queue_stats *stats) { if (!stats) return; WARN_ON(!list_empty(&stats->callbacks)); kfree(stats); } |
488 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 | // SPDX-License-Identifier: GPL-2.0 /* * ACPI support * * Copyright (C) 2020, Intel Corporation * Author: Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/acpi.h> #include <linux/pm_runtime.h> #include "tb.h" static acpi_status tb_acpi_add_link(acpi_handle handle, u32 level, void *data, void **ret) { struct acpi_device *adev = acpi_fetch_acpi_dev(handle); struct fwnode_handle *fwnode; struct tb_nhi *nhi = data; struct pci_dev *pdev; struct device *dev; if (!adev) return AE_OK; fwnode = fwnode_find_reference(acpi_fwnode_handle(adev), "usb4-host-interface", 0); if (IS_ERR(fwnode)) return AE_OK; /* It needs to reference this NHI */ if (dev_fwnode(&nhi->pdev->dev) != fwnode) goto out_put; /* * Ignore USB3 ports here as USB core will set up device links between * tunneled USB3 devices and NHI host during USB device creation. * USB3 ports might not even have a physical device yet if xHCI driver * isn't bound yet. */ dev = acpi_get_first_physical_node(adev); if (!dev || !dev_is_pci(dev)) goto out_put; /* Check that this matches a PCIe root/downstream port. */ pdev = to_pci_dev(dev); if (pci_is_pcie(pdev) && (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) { const struct device_link *link; /* * Make them both active first to make sure the NHI does * not runtime suspend before the consumer. The * pm_runtime_put() below then allows the consumer to * runtime suspend again (which then allows NHI runtime * suspend too now that the device link is established). */ pm_runtime_get_sync(&pdev->dev); link = device_link_add(&pdev->dev, &nhi->pdev->dev, DL_FLAG_AUTOREMOVE_SUPPLIER | DL_FLAG_RPM_ACTIVE | DL_FLAG_PM_RUNTIME); if (link) { dev_dbg(&nhi->pdev->dev, "created link from %s\n", dev_name(&pdev->dev)); *(bool *)ret = true; } else { dev_warn(&nhi->pdev->dev, "device link creation from %s failed\n", dev_name(&pdev->dev)); } pm_runtime_put(&pdev->dev); } out_put: fwnode_handle_put(fwnode); return AE_OK; } /** * tb_acpi_add_links() - Add device links based on ACPI description * @nhi: Pointer to NHI * * Goes over ACPI namespace finding tunneled ports that reference to * @nhi ACPI node. For each reference a device link is added. The link * is automatically removed by the driver core. * * Returns %true if at least one link was created. */ bool tb_acpi_add_links(struct tb_nhi *nhi) { acpi_status status; bool ret = false; if (!has_acpi_companion(&nhi->pdev->dev)) return false; /* * Find all devices that have usb4-host-controller interface * property that references to this NHI. */ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, 32, tb_acpi_add_link, NULL, nhi, (void **)&ret); if (ACPI_FAILURE(status)) { dev_warn(&nhi->pdev->dev, "failed to enumerate tunneled ports\n"); return false; } return ret; } /** * tb_acpi_is_native() - Did the platform grant native TBT/USB4 control * * Returns %true if the platform granted OS native control over * TBT/USB4. In this case software based connection manager can be used, * otherwise there is firmware based connection manager running. */ bool tb_acpi_is_native(void) { return osc_sb_native_usb4_support_confirmed && osc_sb_native_usb4_control; } /** * tb_acpi_may_tunnel_usb3() - Is USB3 tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native USB3 tunneling. */ bool tb_acpi_may_tunnel_usb3(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_USB3_TUNNELING; return true; } /** * tb_acpi_may_tunnel_dp() - Is DisplayPort tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native DP tunneling. */ bool tb_acpi_may_tunnel_dp(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_DP_TUNNELING; return true; } /** * tb_acpi_may_tunnel_pcie() - Is PCIe tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native PCIe tunneling. */ bool tb_acpi_may_tunnel_pcie(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_PCIE_TUNNELING; return true; } /** * tb_acpi_is_xdomain_allowed() - Are XDomain connections allowed * * When software based connection manager is used, this function * returns %true if platform allows XDomain connections. */ bool tb_acpi_is_xdomain_allowed(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_XDOMAIN; return true; } /* UUID for retimer _DSM: e0053122-795b-4122-8a5e-57be1d26acb3 */ static const guid_t retimer_dsm_guid = GUID_INIT(0xe0053122, 0x795b, 0x4122, 0x8a, 0x5e, 0x57, 0xbe, 0x1d, 0x26, 0xac, 0xb3); #define RETIMER_DSM_QUERY_ONLINE_STATE 1 #define RETIMER_DSM_SET_ONLINE_STATE 2 static int tb_acpi_retimer_set_power(struct tb_port *port, bool power) { struct usb4_port *usb4 = port->usb4; union acpi_object argv4[2]; struct acpi_device *adev; union acpi_object *obj; int ret; if (!usb4->can_offline) return 0; adev = ACPI_COMPANION(&usb4->dev); if (WARN_ON(!adev)) return 0; /* Check if we are already powered on (and in correct mode) */ obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_QUERY_ONLINE_STATE, NULL, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: query online _DSM failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (power == ret) return 0; tb_port_dbg(port, "ACPI: calling _DSM to power %s retimers\n", power ? "on" : "off"); argv4[0].type = ACPI_TYPE_PACKAGE; argv4[0].package.count = 1; argv4[0].package.elements = &argv4[1]; argv4[1].integer.type = ACPI_TYPE_INTEGER; argv4[1].integer.value = power; obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_SET_ONLINE_STATE, argv4, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: set online state _DSM evaluation failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (ret >= 0) { if (power) return ret == 1 ? 0 : -EBUSY; return 0; } tb_port_warn(port, "ACPI: set online state _DSM failed with error %d\n", ret); return -EIO; } /** * tb_acpi_power_on_retimers() - Call platform to power on retimers * @port: USB4 port * * Calls platform to turn on power to all retimers behind this USB4 * port. After this function returns successfully the caller can * continue with the normal retimer flows (as specified in the USB4 * spec). Note if this returns %-EBUSY it means the type-C port is in * non-USB4/TBT mode (there is non-USB4/TBT device connected). * * This should only be called if the USB4/TBT link is not up. * * Returns %0 on success. */ int tb_acpi_power_on_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, true); } /** * tb_acpi_power_off_retimers() - Call platform to power off retimers * @port: USB4 port * * This is the opposite of tb_acpi_power_on_retimers(). After returning * successfully the normal operations with the @port can continue. * * Returns %0 on success. */ int tb_acpi_power_off_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, false); } static bool tb_acpi_bus_match(struct device *dev) { return tb_is_switch(dev) || tb_is_usb4_port_device(dev); } static struct acpi_device *tb_acpi_switch_find_companion(struct tb_switch *sw) { struct tb_switch *parent_sw = tb_switch_parent(sw); struct acpi_device *adev = NULL; /* * Device routers exists under the downstream facing USB4 port * of the parent router. Their _ADR is always 0. */ if (parent_sw) { struct tb_port *port = tb_switch_downstream_port(sw); struct acpi_device *port_adev; port_adev = acpi_find_child_by_adr(ACPI_COMPANION(&parent_sw->dev), port->port); if (port_adev) adev = acpi_find_child_device(port_adev, 0, false); } else { struct tb_nhi *nhi = sw->tb->nhi; struct acpi_device *parent_adev; parent_adev = ACPI_COMPANION(&nhi->pdev->dev); if (parent_adev) adev = acpi_find_child_device(parent_adev, 0, false); } return adev; } static struct acpi_device *tb_acpi_find_companion(struct device *dev) { /* * The Thunderbolt/USB4 hierarchy looks like following: * * Device (NHI) * Device (HR) // Host router _ADR == 0 * Device (DFP0) // Downstream port _ADR == lane 0 adapter * Device (DR) // Device router _ADR == 0 * Device (UFP) // Upstream port _ADR == lane 0 adapter * Device (DFP1) // Downstream port _ADR == lane 0 adapter number * * At the moment we bind the host router to the corresponding * Linux device. */ if (tb_is_switch(dev)) return tb_acpi_switch_find_companion(tb_to_switch(dev)); if (tb_is_usb4_port_device(dev)) return acpi_find_child_by_adr(ACPI_COMPANION(dev->parent), tb_to_usb4_port_device(dev)->port->port); return NULL; } static void tb_acpi_setup(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); struct usb4_port *usb4 = tb_to_usb4_port_device(dev); if (!adev || !usb4) return; if (acpi_check_dsm(adev->handle, &retimer_dsm_guid, 1, BIT(RETIMER_DSM_QUERY_ONLINE_STATE) | BIT(RETIMER_DSM_SET_ONLINE_STATE))) usb4->can_offline = true; } static struct acpi_bus_type tb_acpi_bus = { .name = "thunderbolt", .match = tb_acpi_bus_match, .find_companion = tb_acpi_find_companion, .setup = tb_acpi_setup, }; int tb_acpi_init(void) { return register_acpi_bus_type(&tb_acpi_bus); } void tb_acpi_exit(void) { unregister_acpi_bus_type(&tb_acpi_bus); } |
13 2 11 1 1 1 6 5 2 3 1 1 1 3 3 3 1 11 12 1 1 1 1 1 2 1 1 1 1 2 1 11 3 3 1 1 1 3 3 4 4 1 3 1 1 1 2 2 1 1 1 1 5 5 5 1 4 4 4 4 7 3 4 1 3 2 1 1 2 4 5 5 2 3 2 1 1 9 9 8 1 1 1 5 1 1 3 3 2 1 1 1 1 4 4 4 4 3 2 2 4 3 3 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Intel Corporation. All rights reserved. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/nfc.h> #include <linux/sched/signal.h> #include "nfc.h" #include "llcp.h" static int sock_wait_state(struct sock *sk, int state, unsigned long timeo) { DECLARE_WAITQUEUE(wait, current); int err = 0; pr_debug("sk %p", sk); add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); while (sk->sk_state != state) { if (!timeo) { err = -EINPROGRESS; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); set_current_state(TASK_INTERRUPTIBLE); err = sock_error(sk); if (err) break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; } static struct proto llcp_sock_proto = { .name = "NFC_LLCP", .owner = THIS_MODULE, .obj_size = sizeof(struct nfc_llcp_sock), }; static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_local *local; struct nfc_dev *dev; struct sockaddr_nfc_llcp llcp_addr; int len, ret = 0; if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family); memset(&llcp_addr, 0, sizeof(llcp_addr)); len = min_t(unsigned int, sizeof(llcp_addr), alen); memcpy(&llcp_addr, addr, len); /* This is going to be a listening socket, dsap must be 0 */ if (llcp_addr.dsap != 0) return -EINVAL; lock_sock(sk); if (sk->sk_state != LLCP_CLOSED) { ret = -EBADFD; goto error; } dev = nfc_get_device(llcp_addr.dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; llcp_sock->service_name_len = min_t(unsigned int, llcp_addr.service_name_len, NFC_LLCP_MAX_SERVICE_NAME); llcp_sock->service_name = kmemdup(llcp_addr.service_name, llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { ret = -ENOMEM; goto sock_llcp_put_local; } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -EADDRINUSE; goto free_service_name; } llcp_sock->reserved_ssap = llcp_sock->ssap; nfc_llcp_sock_link(&local->sockets, sk); pr_debug("Socket bound to SAP %d\n", llcp_sock->ssap); sk->sk_state = LLCP_BOUND; nfc_put_device(dev); release_sock(sk); return 0; free_service_name: kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; sock_llcp_put_local: nfc_llcp_local_put(llcp_sock->local); llcp_sock->local = NULL; llcp_sock->dev = NULL; put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_local *local; struct nfc_dev *dev; struct sockaddr_nfc_llcp llcp_addr; int len, ret = 0; if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family); memset(&llcp_addr, 0, sizeof(llcp_addr)); len = min_t(unsigned int, sizeof(llcp_addr), alen); memcpy(&llcp_addr, addr, len); lock_sock(sk); if (sk->sk_state != LLCP_CLOSED) { ret = -EBADFD; goto error; } dev = nfc_get_device(llcp_addr.dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; nfc_llcp_sock_link(&local->raw_sockets, sk); sk->sk_state = LLCP_BOUND; put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int ret = 0; pr_debug("sk %p backlog %d\n", sk, backlog); lock_sock(sk); if ((sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM) || sk->sk_state != LLCP_BOUND) { ret = -EBADFD; goto error; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; pr_debug("Socket listening\n"); sk->sk_state = LLCP_LISTEN; error: release_sock(sk); return ret; } static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); u32 opt; int err = 0; pr_debug("%p optname %d\n", sk, optname); if (level != SOL_NFC) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case NFC_LLCP_RW: if (sk->sk_state == LLCP_CONNECTED || sk->sk_state == LLCP_BOUND || sk->sk_state == LLCP_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt > LLCP_MAX_RW) { err = -EINVAL; break; } llcp_sock->rw = (u8) opt; break; case NFC_LLCP_MIUX: if (sk->sk_state == LLCP_CONNECTED || sk->sk_state == LLCP_BOUND || sk->sk_state == LLCP_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt > LLCP_MAX_MIUX) { err = -EINVAL; break; } llcp_sock->miux = cpu_to_be16((u16) opt); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); pr_debug("%p rw %d miux %d\n", llcp_sock, llcp_sock->rw, llcp_sock->miux); return err; } static int nfc_llcp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct nfc_llcp_local *local; struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int len, err = 0; u16 miux, remote_miu; u8 rw; pr_debug("%p optname %d\n", sk, optname); if (level != SOL_NFC) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; local = llcp_sock->local; if (!local) return -ENODEV; len = min_t(u32, len, sizeof(u32)); lock_sock(sk); switch (optname) { case NFC_LLCP_RW: rw = llcp_sock->rw > LLCP_MAX_RW ? local->rw : llcp_sock->rw; if (put_user(rw, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_MIUX: miux = be16_to_cpu(llcp_sock->miux) > LLCP_MAX_MIUX ? be16_to_cpu(local->miux) : be16_to_cpu(llcp_sock->miux); if (put_user(miux, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_MIU: remote_miu = llcp_sock->remote_miu > LLCP_MAX_MIU ? local->remote_miu : llcp_sock->remote_miu; if (put_user(remote_miu, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_LTO: if (put_user(local->remote_lto / 10, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_RW: if (put_user(llcp_sock->remote_rw, (u32 __user *) optval)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); if (put_user(len, optlen)) return -EFAULT; return err; } void nfc_llcp_accept_unlink(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); pr_debug("state %d\n", sk->sk_state); list_del_init(&llcp_sock->accept_queue); sk_acceptq_removed(llcp_sock->parent); llcp_sock->parent = NULL; sock_put(sk); } void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_sock *llcp_sock_parent = nfc_llcp_sock(parent); /* Lock will be free from unlink */ sock_hold(sk); list_add_tail(&llcp_sock->accept_queue, &llcp_sock_parent->accept_queue); llcp_sock->parent = parent; sk_acceptq_added(parent); } struct sock *nfc_llcp_accept_dequeue(struct sock *parent, struct socket *newsock) { struct nfc_llcp_sock *lsk, *n, *llcp_parent; struct sock *sk; llcp_parent = nfc_llcp_sock(parent); list_for_each_entry_safe(lsk, n, &llcp_parent->accept_queue, accept_queue) { sk = &lsk->sk; lock_sock(sk); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_accept_unlink(sk); continue; } if (sk->sk_state == LLCP_CONNECTED || !newsock) { list_del_init(&lsk->accept_queue); sock_put(sk); if (newsock) sock_graft(sk, newsock); release_sock(sk); pr_debug("Returning sk state %d\n", sk->sk_state); sk_acceptq_removed(parent); return sk; } release_sock(sk); } return NULL; } static int llcp_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *new_sk; long timeo; int ret = 0; pr_debug("parent %p\n", sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_state != LLCP_LISTEN) { ret = -EBADFD; goto error; } timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* Wait for an incoming connection. */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(new_sk = nfc_llcp_accept_dequeue(sk, newsock))) { set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); if (ret) goto error; newsock->state = SS_CONNECTED; pr_debug("new socket %p\n", new_sk); error: release_sock(sk); return ret; } static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, llcp_addr, uaddr); if (llcp_sock == NULL || llcp_sock->dev == NULL) return -EBADFD; pr_debug("%p %d %d %d\n", sk, llcp_sock->target_idx, llcp_sock->dsap, llcp_sock->ssap); memset(llcp_addr, 0, sizeof(*llcp_addr)); lock_sock(sk); if (!llcp_sock->dev) { release_sock(sk); return -EBADFD; } llcp_addr->sa_family = AF_NFC; llcp_addr->dev_idx = llcp_sock->dev->idx; llcp_addr->target_idx = llcp_sock->target_idx; llcp_addr->nfc_protocol = llcp_sock->nfc_protocol; llcp_addr->dsap = llcp_sock->dsap; llcp_addr->ssap = llcp_sock->ssap; llcp_addr->service_name_len = llcp_sock->service_name_len; memcpy(llcp_addr->service_name, llcp_sock->service_name, llcp_addr->service_name_len); release_sock(sk); return sizeof(struct sockaddr_nfc_llcp); } static inline __poll_t llcp_accept_poll(struct sock *parent) { struct nfc_llcp_sock *llcp_sock, *parent_sock; struct sock *sk; parent_sock = nfc_llcp_sock(parent); list_for_each_entry(llcp_sock, &parent_sock->accept_queue, accept_queue) { sk = &llcp_sock->sk; if (sk->sk_state == LLCP_CONNECTED) return EPOLLIN | EPOLLRDNORM; } return 0; } static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; pr_debug("%p\n", sk); sock_poll_wait(file, sock, wait); if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == LLCP_CLOSED) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); pr_debug("mask 0x%x\n", mask); return mask; } static int llcp_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct nfc_llcp_local *local; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int err = 0; if (!sk) return 0; pr_debug("%p\n", sk); local = llcp_sock->local; if (local == NULL) { err = -ENODEV; goto out; } lock_sock(sk); /* Send a DISC */ if (sk->sk_state == LLCP_CONNECTED) nfc_llcp_send_disconnect(llcp_sock); if (sk->sk_state == LLCP_LISTEN) { struct nfc_llcp_sock *lsk, *n; struct sock *accept_sk; list_for_each_entry_safe(lsk, n, &llcp_sock->accept_queue, accept_queue) { accept_sk = &lsk->sk; lock_sock(accept_sk); nfc_llcp_send_disconnect(lsk); nfc_llcp_accept_unlink(accept_sk); release_sock(accept_sk); } } if (sock->type == SOCK_RAW) nfc_llcp_sock_unlink(&local->raw_sockets, sk); else nfc_llcp_sock_unlink(&local->sockets, sk); if (llcp_sock->reserved_ssap < LLCP_SAP_MAX) nfc_llcp_put_ssap(llcp_sock->local, llcp_sock->ssap); release_sock(sk); out: sock_orphan(sk); sock_put(sk); return err; } static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, int len, int flags) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct sockaddr_nfc_llcp *addr = (struct sockaddr_nfc_llcp *)_addr; struct nfc_dev *dev; struct nfc_llcp_local *local; int ret = 0; pr_debug("sock %p sk %p flags 0x%x\n", sock, sk, flags); if (!addr || len < sizeof(*addr) || addr->sa_family != AF_NFC) return -EINVAL; if (addr->service_name_len == 0 && addr->dsap == 0) return -EINVAL; pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n", addr->dev_idx, addr->target_idx, addr->nfc_protocol); lock_sock(sk); if (sk->sk_state == LLCP_CONNECTED) { ret = -EISCONN; goto error; } if (sk->sk_state == LLCP_CONNECTING) { ret = -EINPROGRESS; goto error; } dev = nfc_get_device(addr->dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } device_lock(&dev->dev); if (dev->dep_link_up == false) { ret = -ENOLINK; device_unlock(&dev->dev); goto sock_llcp_put_local; } device_unlock(&dev->dev); if (local->rf_mode == NFC_RF_INITIATOR && addr->target_idx != local->target_idx) { ret = -ENOLINK; goto sock_llcp_put_local; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -ENOMEM; goto sock_llcp_nullify; } llcp_sock->reserved_ssap = llcp_sock->ssap; if (addr->service_name_len == 0) llcp_sock->dsap = addr->dsap; else llcp_sock->dsap = LLCP_SAP_SDP; llcp_sock->nfc_protocol = addr->nfc_protocol; llcp_sock->service_name_len = min_t(unsigned int, addr->service_name_len, NFC_LLCP_MAX_SERVICE_NAME); llcp_sock->service_name = kmemdup(addr->service_name, llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { ret = -ENOMEM; goto sock_llcp_release; } nfc_llcp_sock_link(&local->connecting_sockets, sk); ret = nfc_llcp_send_connect(llcp_sock); if (ret) goto sock_unlink; sk->sk_state = LLCP_CONNECTING; ret = sock_wait_state(sk, LLCP_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); if (ret && ret != -EINPROGRESS) goto sock_unlink; release_sock(sk); return ret; sock_unlink: nfc_llcp_sock_unlink(&local->connecting_sockets, sk); kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); sock_llcp_nullify: llcp_sock->local = NULL; llcp_sock->dev = NULL; sock_llcp_put_local: nfc_llcp_local_put(local); put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int ret; pr_debug("sock %p sk %p", sock, sk); ret = sock_error(sk); if (ret) return ret; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); if (!llcp_sock->local) { release_sock(sk); return -ENODEV; } if (sk->sk_type == SOCK_DGRAM) { if (sk->sk_state != LLCP_BOUND) { release_sock(sk); return -ENOTCONN; } DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, addr, msg->msg_name); if (msg->msg_namelen < sizeof(*addr)) { release_sock(sk); return -EINVAL; } release_sock(sk); return nfc_llcp_send_ui_frame(llcp_sock, addr->dsap, addr->ssap, msg, len); } if (sk->sk_state != LLCP_CONNECTED) { release_sock(sk); return -ENOTCONN; } release_sock(sk); return nfc_llcp_send_i_frame(llcp_sock, msg, len); } static int llcp_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; unsigned int copied, rlen; struct sk_buff *skb, *cskb; int err = 0; pr_debug("%p %zu\n", sk, len); lock_sock(sk); if (sk->sk_state == LLCP_CLOSED && skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); return 0; } release_sock(sk); if (flags & (MSG_OOB)) return -EOPNOTSUPP; skb = skb_recv_datagram(sk, flags, &err); if (!skb) { pr_err("Recv datagram failed state %d %d %d", sk->sk_state, err, sock_error(sk)); if (sk->sk_shutdown & RCV_SHUTDOWN) return 0; return err; } rlen = skb->len; /* real length of skb */ copied = min_t(unsigned int, rlen, len); cskb = skb; if (skb_copy_datagram_msg(cskb, 0, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; } sock_recv_timestamp(msg, sk, skb); if (sk->sk_type == SOCK_DGRAM && msg->msg_name) { struct nfc_llcp_ui_cb *ui_cb = nfc_llcp_ui_skb_cb(skb); DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, sockaddr, msg->msg_name); msg->msg_namelen = sizeof(struct sockaddr_nfc_llcp); pr_debug("Datagram socket %d %d\n", ui_cb->dsap, ui_cb->ssap); memset(sockaddr, 0, sizeof(*sockaddr)); sockaddr->sa_family = AF_NFC; sockaddr->nfc_protocol = NFC_PROTO_NFC_DEP; sockaddr->dsap = ui_cb->dsap; sockaddr->ssap = ui_cb->ssap; } /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { /* SOCK_STREAM: re-queue skb if it contains unreceived data */ if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { skb_pull(skb, copied); if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); goto done; } } kfree_skb(skb); } /* XXX Queue backlogged skbs */ done: /* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */ if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC)) copied = rlen; return copied; } static const struct proto_ops llcp_sock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .bind = llcp_sock_bind, .connect = llcp_sock_connect, .release = llcp_sock_release, .socketpair = sock_no_socketpair, .accept = llcp_sock_accept, .getname = llcp_sock_getname, .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, .setsockopt = nfc_llcp_setsockopt, .getsockopt = nfc_llcp_getsockopt, .sendmsg = llcp_sock_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops llcp_rawsock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .bind = llcp_raw_sock_bind, .connect = sock_no_connect, .release = llcp_sock_release, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = llcp_sock_getname, .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = sock_no_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, }; static void llcp_sock_destruct(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); pr_debug("%p\n", sk); if (sk->sk_state == LLCP_CONNECTED) nfc_put_device(llcp_sock->dev); skb_queue_purge(&sk->sk_receive_queue); nfc_llcp_sock_free(llcp_sock); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Freeing alive NFC LLCP socket %p\n", sk); return; } } struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern); if (!sk) return NULL; llcp_sock = nfc_llcp_sock(sk); sock_init_data(sock, sk); sk->sk_state = LLCP_CLOSED; sk->sk_protocol = NFC_SOCKPROTO_LLCP; sk->sk_type = type; sk->sk_destruct = llcp_sock_destruct; llcp_sock->ssap = 0; llcp_sock->dsap = LLCP_SAP_SDP; llcp_sock->rw = LLCP_MAX_RW + 1; llcp_sock->miux = cpu_to_be16(LLCP_MAX_MIUX + 1); llcp_sock->send_n = llcp_sock->send_ack_n = 0; llcp_sock->recv_n = llcp_sock->recv_ack_n = 0; llcp_sock->remote_ready = 1; llcp_sock->reserved_ssap = LLCP_SAP_MAX; nfc_llcp_socket_remote_param_init(llcp_sock); skb_queue_head_init(&llcp_sock->tx_queue); skb_queue_head_init(&llcp_sock->tx_pending_queue); INIT_LIST_HEAD(&llcp_sock->accept_queue); if (sock != NULL) sock->state = SS_UNCONNECTED; return sk; } void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) { kfree(sock->service_name); skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); list_del_init(&sock->accept_queue); sock->parent = NULL; nfc_llcp_local_put(sock->local); } static int llcp_sock_create(struct net *net, struct socket *sock, const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; pr_debug("%p\n", sock); if (sock->type != SOCK_STREAM && sock->type != SOCK_DGRAM && sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (sock->type == SOCK_RAW) { if (!capable(CAP_NET_RAW)) return -EPERM; sock->ops = &llcp_rawsock_ops; } else { sock->ops = &llcp_sock_ops; } sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) return -ENOMEM; return 0; } static const struct nfc_protocol llcp_nfc_proto = { .id = NFC_SOCKPROTO_LLCP, .proto = &llcp_sock_proto, .owner = THIS_MODULE, .create = llcp_sock_create }; int __init nfc_llcp_sock_init(void) { return nfc_proto_register(&llcp_nfc_proto); } void nfc_llcp_sock_exit(void) { nfc_proto_unregister(&llcp_nfc_proto); } |
1797 1796 11 1793 75 76 1429 1436 1432 506 4182 203 208 49 176 150 128 128 166 167 695 122 5 122 5 500 5 488 107 170 23 701 701 258 27 38 225 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/xarray.h> /** * idr_alloc_u32() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @nextid: Pointer to an ID. * @max: The maximum ID to allocate (inclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @nextid and @max. * Note that @max is inclusive whereas the @end parameter to idr_alloc() * is exclusive. The new ID is assigned to @nextid before the pointer * is inserted into the IDR, so if @nextid points into the object pointed * to by @ptr, a concurrent lookup will not find an uninitialised ID. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. If an error occurred, * @nextid is unchanged. */ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, unsigned long max, gfp_t gfp) { struct radix_tree_iter iter; void __rcu **slot; unsigned int base = idr->idr_base; unsigned int id = *nextid; if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR))) idr->idr_rt.xa_flags |= IDR_RT_MARKER; id = (id < base) ? 0 : id - base; radix_tree_iter_init(&iter, id); slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base); if (IS_ERR(slot)) return PTR_ERR(slot); *nextid = iter.index + base; /* there is a memory barrier inside radix_tree_iter_replace() */ radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr); radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE); return 0; } EXPORT_SYMBOL_GPL(idr_alloc_u32); /** * idr_alloc() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = start; int ret; if (WARN_ON_ONCE(start < 0)) return -EINVAL; ret = idr_alloc_u32(idr, ptr, &id, end > 0 ? end - 1 : INT_MAX, gfp); if (ret) return ret; return id; } EXPORT_SYMBOL_GPL(idr_alloc); /** * idr_alloc_cyclic() - Allocate an ID cyclically. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * The search for an unused ID will start at the last ID allocated and will * wrap around to @start if no free IDs are found before reaching @end. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = idr->idr_next; int err, max = end > 0 ? end - 1 : INT_MAX; if ((int)id < start) id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); if ((err == -ENOSPC) && (id > start)) { id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); } if (err) return err; idr->idr_next = id + 1; return id; } EXPORT_SYMBOL(idr_alloc_cyclic); /** * idr_remove() - Remove an ID from the IDR. * @idr: IDR handle. * @id: Pointer ID. * * Removes this ID from the IDR. If the ID was not previously in the IDR, * this function returns %NULL. * * Since this function modifies the IDR, the caller should provide their * own locking to ensure that concurrent modification of the same IDR is * not possible. * * Return: The pointer formerly associated with this ID. */ void *idr_remove(struct idr *idr, unsigned long id) { return radix_tree_delete_item(&idr->idr_rt, id - idr->idr_base, NULL); } EXPORT_SYMBOL_GPL(idr_remove); /** * idr_find() - Return pointer for given ID. * @idr: IDR handle. * @id: Pointer ID. * * Looks up the pointer associated with this ID. A %NULL pointer may * indicate that @id is not allocated or that the %NULL pointer was * associated with this ID. * * This function can be called under rcu_read_lock(), given that the leaf * pointers lifetimes are correctly managed. * * Return: The pointer associated with this ID. */ void *idr_find(const struct idr *idr, unsigned long id) { return radix_tree_lookup(&idr->idr_rt, id - idr->idr_base); } EXPORT_SYMBOL_GPL(idr_find); /** * idr_for_each() - Iterate through all stored pointers. * @idr: IDR handle. * @fn: Function to be called for each pointer. * @data: Data passed to callback function. * * The callback function will be called for each entry in @idr, passing * the ID, the entry and @data. * * If @fn returns anything other than %0, the iteration stops and that * value is returned from this function. * * idr_for_each() can be called concurrently with idr_alloc() and * idr_remove() if protected by RCU. Newly added entries may not be * seen and deleted entries may be seen, but adding and removing entries * will not cause other entries to be skipped, nor spurious ones to be seen. */ int idr_for_each(const struct idr *idr, int (*fn)(int id, void *p, void *data), void *data) { struct radix_tree_iter iter; void __rcu **slot; int base = idr->idr_base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) { int ret; unsigned long id = iter.index + base; if (WARN_ON_ONCE(id > INT_MAX)) break; ret = fn(id, rcu_dereference_raw(*slot), data); if (ret) return ret; } return 0; } EXPORT_SYMBOL(idr_for_each); /** * idr_get_next_ul() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. */ void *idr_get_next_ul(struct idr *idr, unsigned long *nextid) { struct radix_tree_iter iter; void __rcu **slot; void *entry = NULL; unsigned long base = idr->idr_base; unsigned long id = *nextid; id = (id < base) ? 0 : id - base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) { entry = rcu_dereference_raw(*slot); if (!entry) continue; if (!xa_is_internal(entry)) break; if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry)) break; slot = radix_tree_iter_retry(&iter); } if (!slot) return NULL; *nextid = iter.index + base; return entry; } EXPORT_SYMBOL(idr_get_next_ul); /** * idr_get_next() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. */ void *idr_get_next(struct idr *idr, int *nextid) { unsigned long id = *nextid; void *entry = idr_get_next_ul(idr, &id); if (WARN_ON_ONCE(id > INT_MAX)) return NULL; *nextid = id; return entry; } EXPORT_SYMBOL(idr_get_next); /** * idr_replace() - replace pointer for given ID. * @idr: IDR handle. * @ptr: New pointer to associate with the ID. * @id: ID to change. * * Replace the pointer registered with an ID and return the old value. * This function can be called under the RCU read lock concurrently with * idr_alloc() and idr_remove() (as long as the ID being removed is not |