Total coverage: 252127 (17%)of 1547664
16 1 15 42 1 7 35 1 1 1 31 1 2 28 1 27 19 2 17 82 82 44 71 81 82 82 82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 // SPDX-License-Identifier: GPL-2.0-or-later // // Validation of USB-audio class descriptors // #include <linux/init.h> #include <linux/usb.h> #include <linux/usb/audio.h> #include <linux/usb/audio-v2.h> #include <linux/usb/audio-v3.h> #include <linux/usb/midi.h> #include "usbaudio.h" #include "helper.h" struct usb_desc_validator { unsigned char protocol; unsigned char type; bool (*func)(const void *p, const struct usb_desc_validator *v); size_t size; }; #define UAC_VERSION_ALL (unsigned char)(-1) /* UAC1 only */ static bool validate_uac1_header(const void *p, const struct usb_desc_validator *v) { const struct uac1_ac_header_descriptor *d = p; return d->bLength >= sizeof(*d) && d->bLength >= sizeof(*d) + d->bInCollection; } /* for mixer unit; covering all UACs */ static bool validate_mixer_unit(const void *p, const struct usb_desc_validator *v) { const struct uac_mixer_unit_descriptor *d = p; size_t len; if (d->bLength < sizeof(*d) || !d->bNrInPins) return false; len = sizeof(*d) + d->bNrInPins; /* We can't determine the bitmap size only from this unit descriptor, * so just check with the remaining length. * The actual bitmap is checked at mixer unit parser. */ switch (v->protocol) { case UAC_VERSION_1: default: len += 2 + 1; /* wChannelConfig, iChannelNames */ /* bmControls[n*m] */ len += 1; /* iMixer */ break; case UAC_VERSION_2: len += 4 + 1; /* bmChannelConfig, iChannelNames */ /* bmMixerControls[n*m] */ len += 1 + 1; /* bmControls, iMixer */ break; case UAC_VERSION_3: len += 2; /* wClusterDescrID */ /* bmMixerControls[n*m] */ break; } return d->bLength >= len; } /* both for processing and extension units; covering all UACs */ static bool validate_processing_unit(const void *p, const struct usb_desc_validator *v) { const struct uac_processing_unit_descriptor *d = p; const unsigned char *hdr = p; size_t len, m; if (d->bLength < sizeof(*d)) return false; len = sizeof(*d) + d->bNrInPins; if (d->bLength < len) return false; switch (v->protocol) { case UAC_VERSION_1: default: /* bNrChannels, wChannelConfig, iChannelNames */ len += 1 + 2 + 1; if (d->bLength < len + 1) /* bControlSize */ return false; m = hdr[len]; len += 1 + m + 1; /* bControlSize, bmControls, iProcessing */ break; case UAC_VERSION_2: /* bNrChannels, bmChannelConfig, iChannelNames */ len += 1 + 4 + 1; if (v->type == UAC2_PROCESSING_UNIT_V2) len += 2; /* bmControls -- 2 bytes for PU */ else len += 1; /* bmControls -- 1 byte for EU */ len += 1; /* iProcessing */ break; case UAC_VERSION_3: /* wProcessingDescrStr, bmControls */ len += 2 + 4; break; } if (d->bLength < len) return false; switch (v->protocol) { case UAC_VERSION_1: default: if (v->type == UAC1_EXTENSION_UNIT) return true; /* OK */ switch (le16_to_cpu(d->wProcessType)) { case UAC_PROCESS_UP_DOWNMIX: case UAC_PROCESS_DOLBY_PROLOGIC: if (d->bLength < len + 1) /* bNrModes */ return false; m = hdr[len]; len += 1 + m * 2; /* bNrModes, waModes(n) */ break; default: break; } break; case UAC_VERSION_2: if (v->type == UAC2_EXTENSION_UNIT_V2) return true; /* OK */ switch (le16_to_cpu(d->wProcessType)) { case UAC2_PROCESS_UP_DOWNMIX: case UAC2_PROCESS_DOLBY_PROLOCIC: /* SiC! */ if (d->bLength < len + 1) /* bNrModes */ return false; m = hdr[len]; len += 1 + m * 4; /* bNrModes, daModes(n) */ break; default: break; } break; case UAC_VERSION_3: if (v->type == UAC3_EXTENSION_UNIT) { len += 2; /* wClusterDescrID */ break; } switch (le16_to_cpu(d->wProcessType)) { case UAC3_PROCESS_UP_DOWNMIX: if (d->bLength < len + 1) /* bNrModes */ return false; m = hdr[len]; len += 1 + m * 2; /* bNrModes, waClusterDescrID(n) */ break; case UAC3_PROCESS_MULTI_FUNCTION: len += 2 + 4; /* wClusterDescrID, bmAlgorighms */ break; default: break; } break; } if (d->bLength < len) return false; return true; } /* both for selector and clock selector units; covering all UACs */ static bool validate_selector_unit(const void *p, const struct usb_desc_validator *v) { const struct uac_selector_unit_descriptor *d = p; size_t len; if (d->bLength < sizeof(*d)) return false; len = sizeof(*d) + d->bNrInPins; switch (v->protocol) { case UAC_VERSION_1: default: len += 1; /* iSelector */ break; case UAC_VERSION_2: len += 1 + 1; /* bmControls, iSelector */ break; case UAC_VERSION_3: len += 4 + 2; /* bmControls, wSelectorDescrStr */ break; } return d->bLength >= len; } static bool validate_uac1_feature_unit(const void *p, const struct usb_desc_validator *v) { const struct uac_feature_unit_descriptor *d = p; if (d->bLength < sizeof(*d) || !d->bControlSize) return false; /* at least bmaControls(0) for master channel + iFeature */ return d->bLength >= sizeof(*d) + d->bControlSize + 1; } static bool validate_uac2_feature_unit(const void *p, const struct usb_desc_validator *v) { const struct uac2_feature_unit_descriptor *d = p; if (d->bLength < sizeof(*d)) return false; /* at least bmaControls(0) for master channel + iFeature */ return d->bLength >= sizeof(*d) + 4 + 1; } static bool validate_uac3_feature_unit(const void *p, const struct usb_desc_validator *v) { const struct uac3_feature_unit_descriptor *d = p; if (d->bLength < sizeof(*d)) return false; /* at least bmaControls(0) for master channel + wFeatureDescrStr */ return d->bLength >= sizeof(*d) + 4 + 2; } static bool validate_midi_out_jack(const void *p, const struct usb_desc_validator *v) { const struct usb_midi_out_jack_descriptor *d = p; return d->bLength >= sizeof(*d) && d->bLength >= sizeof(*d) + d->bNrInputPins * 2; } #define FIXED(p, t, s) { .protocol = (p), .type = (t), .size = sizeof(s) } #define FUNC(p, t, f) { .protocol = (p), .type = (t), .func = (f) } static const struct usb_desc_validator audio_validators[] = { /* UAC1 */ FUNC(UAC_VERSION_1, UAC_HEADER, validate_uac1_header), FIXED(UAC_VERSION_1, UAC_INPUT_TERMINAL, struct uac_input_terminal_descriptor), FIXED(UAC_VERSION_1, UAC_OUTPUT_TERMINAL, struct uac1_output_terminal_descriptor), FUNC(UAC_VERSION_1, UAC_MIXER_UNIT, validate_mixer_unit), FUNC(UAC_VERSION_1, UAC_SELECTOR_UNIT, validate_selector_unit), FUNC(UAC_VERSION_1, UAC_FEATURE_UNIT, validate_uac1_feature_unit), FUNC(UAC_VERSION_1, UAC1_PROCESSING_UNIT, validate_processing_unit), FUNC(UAC_VERSION_1, UAC1_EXTENSION_UNIT, validate_processing_unit), /* UAC2 */ FIXED(UAC_VERSION_2, UAC_HEADER, struct uac2_ac_header_descriptor), FIXED(UAC_VERSION_2, UAC_INPUT_TERMINAL, struct uac2_input_terminal_descriptor), FIXED(UAC_VERSION_2, UAC_OUTPUT_TERMINAL, struct uac2_output_terminal_descriptor), FUNC(UAC_VERSION_2, UAC_MIXER_UNIT, validate_mixer_unit), FUNC(UAC_VERSION_2, UAC_SELECTOR_UNIT, validate_selector_unit), FUNC(UAC_VERSION_2, UAC_FEATURE_UNIT, validate_uac2_feature_unit), /* UAC_VERSION_2, UAC2_EFFECT_UNIT: not implemented yet */ FUNC(UAC_VERSION_2, UAC2_PROCESSING_UNIT_V2, validate_processing_unit), FUNC(UAC_VERSION_2, UAC2_EXTENSION_UNIT_V2, validate_processing_unit), FIXED(UAC_VERSION_2, UAC2_CLOCK_SOURCE, struct uac_clock_source_descriptor), FUNC(UAC_VERSION_2, UAC2_CLOCK_SELECTOR, validate_selector_unit), FIXED(UAC_VERSION_2, UAC2_CLOCK_MULTIPLIER, struct uac_clock_multiplier_descriptor), /* UAC_VERSION_2, UAC2_SAMPLE_RATE_CONVERTER: not implemented yet */ /* UAC3 */ FIXED(UAC_VERSION_2, UAC_HEADER, struct uac3_ac_header_descriptor), FIXED(UAC_VERSION_3, UAC_INPUT_TERMINAL, struct uac3_input_terminal_descriptor), FIXED(UAC_VERSION_3, UAC_OUTPUT_TERMINAL, struct uac3_output_terminal_descriptor), /* UAC_VERSION_3, UAC3_EXTENDED_TERMINAL: not implemented yet */ FUNC(UAC_VERSION_3, UAC3_MIXER_UNIT, validate_mixer_unit), FUNC(UAC_VERSION_3, UAC3_SELECTOR_UNIT, validate_selector_unit), FUNC(UAC_VERSION_3, UAC_FEATURE_UNIT, validate_uac3_feature_unit), /* UAC_VERSION_3, UAC3_EFFECT_UNIT: not implemented yet */ FUNC(UAC_VERSION_3, UAC3_PROCESSING_UNIT, validate_processing_unit), FUNC(UAC_VERSION_3, UAC3_EXTENSION_UNIT, validate_processing_unit), FIXED(UAC_VERSION_3, UAC3_CLOCK_SOURCE, struct uac3_clock_source_descriptor), FUNC(UAC_VERSION_3, UAC3_CLOCK_SELECTOR, validate_selector_unit), FIXED(UAC_VERSION_3, UAC3_CLOCK_MULTIPLIER, struct uac3_clock_multiplier_descriptor), /* UAC_VERSION_3, UAC3_SAMPLE_RATE_CONVERTER: not implemented yet */ /* UAC_VERSION_3, UAC3_CONNECTORS: not implemented yet */ { } /* terminator */ }; static const struct usb_desc_validator midi_validators[] = { FIXED(UAC_VERSION_ALL, USB_MS_HEADER, struct usb_ms_header_descriptor), FIXED(UAC_VERSION_ALL, USB_MS_MIDI_IN_JACK, struct usb_midi_in_jack_descriptor), FUNC(UAC_VERSION_ALL, USB_MS_MIDI_OUT_JACK, validate_midi_out_jack), { } /* terminator */ }; /* Validate the given unit descriptor, return true if it's OK */ static bool validate_desc(unsigned char *hdr, int protocol, const struct usb_desc_validator *v) { if (hdr[1] != USB_DT_CS_INTERFACE) return true; /* don't care */ for (; v->type; v++) { if (v->type == hdr[2] && (v->protocol == UAC_VERSION_ALL || v->protocol == protocol)) { if (v->func) return v->func(hdr, v); /* check for the fixed size */ return hdr[0] >= v->size; } } return true; /* not matching, skip validation */ } bool snd_usb_validate_audio_desc(void *p, int protocol) { unsigned char *c = p; bool valid; valid = validate_desc(p, protocol, audio_validators); if (!valid && snd_usb_skip_validation) { print_hex_dump(KERN_ERR, "USB-audio: buggy audio desc: ", DUMP_PREFIX_NONE, 16, 1, c, c[0], true); valid = true; } return valid; } bool snd_usb_validate_midi_desc(void *p) { unsigned char *c = p; bool valid; valid = validate_desc(p, UAC_VERSION_1, midi_validators); if (!valid && snd_usb_skip_validation) { print_hex_dump(KERN_ERR, "USB-audio: buggy midi desc: ", DUMP_PREFIX_NONE, 16, 1, c, c[0], true); valid = true; } return valid; }
11 11 11 33 30 53 43 44 44 6 44 44 13 40 7 40 40 40 12 40 17 54 24 34 34 18 34 35 1 4 23 24 55 55 17 55 44 44 24 33 4 2 34 8 34 32 33 35 35 11 3 3 21 11 1 18 34 33 35 10 11 11 11 1 11 56 21 35 8 2 35 8 62 26 37 29 29 2 2 2 2 31 2 29 31 2 29 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include <linux/cpumask.h> #include <linux/spinlock.h> #include <linux/percpu.h> #include "bpf_lru_list.h" #define LOCAL_FREE_TARGET (128) #define LOCAL_NR_SCANS LOCAL_FREE_TARGET #define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET /* Helpers to get the local list index */ #define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) #define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) #define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) static int get_next_cpu(int cpu) { cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_possible_mask); return cpu; } /* Local list helpers */ static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_FREE_LIST_IDX]; } static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; } /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { return READ_ONCE(node->ref); } static void bpf_lru_node_clear_ref(struct bpf_lru_node *node) { WRITE_ONCE(node->ref, 0); } static void bpf_lru_list_count_inc(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]++; } static void bpf_lru_list_count_dec(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]--; } static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, struct bpf_lru_node *node, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; /* If the removing node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; bpf_lru_list_count_dec(l, node->type); node->type = tgt_free_type; list_move(&node->list, free_list); } /* Move nodes from local list to the LRU list */ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; bpf_lru_node_clear_ref(node); list_move(&node->list, &l->lists[tgt_type]); } /* Move nodes between or within active and inactive list (like * active to inactive, inactive to active or tail of active back to * the head of active). */ static void __bpf_lru_node_move(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; if (node->type != tgt_type) { bpf_lru_list_count_dec(l, node->type); bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; } bpf_lru_node_clear_ref(node); /* If the moving node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; list_move(&node->list, &l->lists[tgt_type]); } static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) { return l->counts[BPF_LRU_LIST_T_INACTIVE] < l->counts[BPF_LRU_LIST_T_ACTIVE]; } /* Rotate the active list: * 1. Start from tail * 2. If the node has the ref bit set, it will be rotated * back to the head of active list with the ref bit cleared. * Give this node one more chance to survive in the active list. * 3. If the ref bit is not set, move it to the head of the * inactive list. * 4. It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; struct bpf_lru_node *node, *tmp_node, *first_node; unsigned int i = 0; first_node = list_first_entry(active, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, active, list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); if (++i == lru->nr_scans || node == first_node) break; } } /* Rotate the inactive list. It starts from the next_inactive_rotation * 1. If the node has ref bit set, it will be moved to the head * of active list with the ref bit cleared. * 2. If the node does not have ref bit set, it will leave it * at its current location (i.e. do nothing) so that it can * be considered during the next inactive_shrink. * 3. It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct list_head *cur, *last, *next = inactive; struct bpf_lru_node *node; unsigned int i = 0; if (list_empty(inactive)) return; last = l->next_inactive_rotation->next; if (last == inactive) last = last->next; cur = l->next_inactive_rotation; while (i < lru->nr_scans) { if (cur == inactive) { cur = cur->prev; continue; } node = list_entry(cur, struct bpf_lru_node, list); next = cur->prev; if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); if (cur == last) break; cur = next; i++; } l->next_inactive_rotation = next; } /* Shrink the inactive list. It starts from the tail of the * inactive list and only move the nodes without the ref bit * set to the designated free list. */ static unsigned int __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); } else if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); if (++nshrinked == tgt_nshrink) break; } if (++i == lru->nr_scans) break; } return nshrinked; } /* 1. Rotate the active list (if needed) * 2. Always rotate the inactive list */ static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) { if (bpf_lru_list_inactive_low(l)) __bpf_lru_list_rotate_active(lru, l); __bpf_lru_list_rotate_inactive(lru, l); } /* Calls __bpf_lru_list_shrink_inactive() to shrink some * ref-bit-cleared nodes and move them to the designated * free list. * * If it cannot get a free node after calling * __bpf_lru_list_shrink_inactive(). It will just remove * one node from either inactive or active list without * honoring the ref-bit. It prefers inactive list to active * list in this situation. */ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct bpf_lru_node *node, *tmp_node; struct list_head *force_shrink_list; unsigned int nshrinked; nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, free_list, tgt_free_type); if (nshrinked) return nshrinked; /* Do a force shrink by ignoring the reference bit */ if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; else force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, list) { if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); return 1; } } return 0; } /* Flush the nodes from the local pending list to the LRU list */ static void __local_list_flush(struct bpf_lru_list *l, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node, *tmp_node; list_for_each_entry_safe_reverse(node, tmp_node, local_pending_list(loc_l), list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_INACTIVE); } } static void bpf_lru_list_push_free(struct bpf_lru_list *l, struct bpf_lru_node *node) { unsigned long flags; if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_list *l = &lru->common_lru.lru_list; struct bpf_lru_node *node, *tmp_node; unsigned int nfree = 0; raw_spin_lock(&l->lock); __local_list_flush(l, loc_l); __bpf_lru_list_rotate(lru, l); list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); if (++nfree == LOCAL_FREE_TARGET) break; } if (nfree < LOCAL_FREE_TARGET) __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); raw_spin_unlock(&l->lock); } static void __local_list_add_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l, int cpu, struct bpf_lru_node *node, u32 hash) { *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; bpf_lru_node_clear_ref(node); list_add(&node->list, local_pending_list(loc_l)); } static struct bpf_lru_node * __local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; node = list_first_entry_or_null(local_free_list(loc_l), struct bpf_lru_node, list); if (node) list_del(&node->list); return node; } static struct bpf_lru_node * __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; ignore_ref: /* Get from the tail (i.e. older element) of the pending list. */ list_for_each_entry_reverse(node, local_pending_list(loc_l), list) { if ((!bpf_lru_node_is_ref(node) || force) && lru->del_from_htab(lru->del_arg, node)) { list_del(&node->list); return node; } } if (!force) { force = true; goto ignore_ref; } return NULL; } static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct list_head *free_list; struct bpf_lru_node *node = NULL; struct bpf_lru_list *l; unsigned long flags; int cpu = raw_smp_processor_id(); l = per_cpu_ptr(lru->percpu_lru, cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_list_rotate(lru, l); free_list = &l->lists[BPF_LRU_LIST_T_FREE]; if (list_empty(free_list)) __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, BPF_LRU_LIST_T_FREE); if (!list_empty(free_list)) { node = list_first_entry(free_list, struct bpf_lru_node, list); *(u32 *)((void *)node + lru->hash_offset) = hash; bpf_lru_node_clear_ref(node); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } raw_spin_unlock_irqrestore(&l->lock, flags); return node; } static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct bpf_lru_locallist *loc_l, *steal_loc_l; struct bpf_common_lru *clru = &lru->common_lru; struct bpf_lru_node *node; int steal, first_steal; unsigned long flags; int cpu = raw_smp_processor_id(); loc_l = per_cpu_ptr(clru->local_list, cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); node = __local_list_pop_free(loc_l); if (!node) { bpf_lru_list_pop_free_to_local(lru, loc_l); node = __local_list_pop_free(loc_l); } if (node) __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); if (node) return node; /* No free nodes found from the local free list and * the global LRU list. * * Steal from the local free/pending list of the * current CPU and remote CPU in RR. It starts * with the loc_l->next_steal CPU. */ first_steal = loc_l->next_steal; steal = first_steal; do { steal_loc_l = per_cpu_ptr(clru->local_list, steal); raw_spin_lock_irqsave(&steal_loc_l->lock, flags); node = __local_list_pop_free(steal_loc_l); if (!node) node = __local_list_pop_pending(lru, steal_loc_l); raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); steal = get_next_cpu(steal); } while (!node && steal != first_steal); loc_l->next_steal = steal; if (node) { raw_spin_lock_irqsave(&loc_l->lock, flags); __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); } return node; } struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) { if (lru->percpu) return bpf_percpu_lru_pop_free(lru, hash); else return bpf_common_lru_pop_free(lru, hash); } static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { u8 node_type = READ_ONCE(node->type); unsigned long flags; if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { raw_spin_unlock_irqrestore(&loc_l->lock, flags); goto check_lru_list; } node->type = BPF_LRU_LOCAL_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_move(&node->list, local_free_list(loc_l)); raw_spin_unlock_irqrestore(&loc_l->lock, flags); return; } check_lru_list: bpf_lru_list_push_free(&lru->common_lru.lru_list, node); } static void bpf_percpu_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { struct bpf_lru_list *l; unsigned long flags; l = per_cpu_ptr(lru->percpu_lru, node->cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { if (lru->percpu) bpf_percpu_lru_push_free(lru, node); else bpf_common_lru_push_free(lru, node); } static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; for (i = 0; i < nr_elems; i++) { struct bpf_lru_node *node; node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { u32 i, pcpu_entries; int cpu; struct bpf_lru_list *l; pcpu_entries = nr_elems / num_possible_cpus(); i = 0; for_each_possible_cpu(cpu) { struct bpf_lru_node *node; l = per_cpu_ptr(lru->percpu_lru, cpu); again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; buf += elem_size; if (i == nr_elems) break; if (i % pcpu_entries) goto again; } } void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { if (lru->percpu) bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, nr_elems); else bpf_common_lru_populate(lru, buf, node_offset, elem_size, nr_elems); } static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { int i; for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) INIT_LIST_HEAD(&loc_l->lists[i]); loc_l->next_steal = cpu; raw_spin_lock_init(&loc_l->lock); } static void bpf_lru_list_init(struct bpf_lru_list *l) { int i; for (i = 0; i < NR_BPF_LRU_LIST_T; i++) INIT_LIST_HEAD(&l->lists[i]); for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) l->counts[i] = 0; l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; raw_spin_lock_init(&l->lock); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *del_arg) { int cpu; if (percpu) { lru->percpu_lru = alloc_percpu(struct bpf_lru_list); if (!lru->percpu_lru) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_list *l; l = per_cpu_ptr(lru->percpu_lru, cpu); bpf_lru_list_init(l); } lru->nr_scans = PERCPU_NR_SCANS; } else { struct bpf_common_lru *clru = &lru->common_lru; clru->local_list = alloc_percpu(struct bpf_lru_locallist); if (!clru->local_list) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(clru->local_list, cpu); bpf_lru_locallist_init(loc_l, cpu); } bpf_lru_list_init(&clru->lru_list); lru->nr_scans = LOCAL_NR_SCANS; } lru->percpu = percpu; lru->del_from_htab = del_from_htab; lru->del_arg = del_arg; lru->hash_offset = hash_offset; return 0; } void bpf_lru_destroy(struct bpf_lru *lru) { if (lru->percpu) free_percpu(lru->percpu_lru); else free_percpu(lru->common_lru.local_list); }
132 131 146 147 137 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 // SPDX-License-Identifier: GPL-2.0+ /** * DOC: vkms (Virtual Kernel Modesetting) * * VKMS is a software-only model of a KMS driver that is useful for testing * and for running X (or similar) on headless machines. VKMS aims to enable * a virtual display with no need of a hardware display capability, releasing * the GPU in DRM API tests. */ #include <linux/module.h> #include <linux/platform_device.h> #include <linux/dma-mapping.h> #include <drm/drm_gem.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_drv.h> #include <drm/drm_fbdev_shmem.h> #include <drm/drm_file.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_probe_helper.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_vblank.h> #include "vkms_drv.h" #include <drm/drm_print.h> #include <drm/drm_debugfs.h> #define DRIVER_NAME "vkms" #define DRIVER_DESC "Virtual Kernel Mode Setting" #define DRIVER_DATE "20180514" #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0 static struct vkms_config *default_config; static bool enable_cursor = true; module_param_named(enable_cursor, enable_cursor, bool, 0444); MODULE_PARM_DESC(enable_cursor, "Enable/Disable cursor support"); static bool enable_writeback = true; module_param_named(enable_writeback, enable_writeback, bool, 0444); MODULE_PARM_DESC(enable_writeback, "Enable/Disable writeback connector support"); static bool enable_overlay; module_param_named(enable_overlay, enable_overlay, bool, 0444); MODULE_PARM_DESC(enable_overlay, "Enable/Disable overlay support"); DEFINE_DRM_GEM_FOPS(vkms_driver_fops); static void vkms_release(struct drm_device *dev) { struct vkms_device *vkms = drm_device_to_vkms_device(dev); if (vkms->output.composer_workq) destroy_workqueue(vkms->output.composer_workq); } static void vkms_atomic_commit_tail(struct drm_atomic_state *old_state) { struct drm_device *dev = old_state->dev; struct drm_crtc *crtc; struct drm_crtc_state *old_crtc_state; int i; drm_atomic_helper_commit_modeset_disables(dev, old_state); drm_atomic_helper_commit_planes(dev, old_state, 0); drm_atomic_helper_commit_modeset_enables(dev, old_state); drm_atomic_helper_fake_vblank(old_state); drm_atomic_helper_commit_hw_done(old_state); drm_atomic_helper_wait_for_flip_done(dev, old_state); for_each_old_crtc_in_state(old_state, crtc, old_crtc_state, i) { struct vkms_crtc_state *vkms_state = to_vkms_crtc_state(old_crtc_state); flush_work(&vkms_state->composer_work); } drm_atomic_helper_cleanup_planes(dev, old_state); } static int vkms_config_show(struct seq_file *m, void *data) { struct drm_debugfs_entry *entry = m->private; struct drm_device *dev = entry->dev; struct vkms_device *vkmsdev = drm_device_to_vkms_device(dev); seq_printf(m, "writeback=%d\n", vkmsdev->config->writeback); seq_printf(m, "cursor=%d\n", vkmsdev->config->cursor); seq_printf(m, "overlay=%d\n", vkmsdev->config->overlay); return 0; } static const struct drm_debugfs_info vkms_config_debugfs_list[] = { { "vkms_config", vkms_config_show, 0 }, }; static const struct drm_driver vkms_driver = { .driver_features = DRIVER_MODESET | DRIVER_ATOMIC | DRIVER_GEM, .release = vkms_release, .fops = &vkms_driver_fops, DRM_GEM_SHMEM_DRIVER_OPS, .name = DRIVER_NAME, .desc = DRIVER_DESC, .date = DRIVER_DATE, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, }; static int vkms_atomic_check(struct drm_device *dev, struct drm_atomic_state *state) { struct drm_crtc *crtc; struct drm_crtc_state *new_crtc_state; int i; for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) { if (!new_crtc_state->gamma_lut || !new_crtc_state->color_mgmt_changed) continue; if (new_crtc_state->gamma_lut->length / sizeof(struct drm_color_lut *) > VKMS_LUT_SIZE) return -EINVAL; } return drm_atomic_helper_check(dev, state); } static const struct drm_mode_config_funcs vkms_mode_funcs = { .fb_create = drm_gem_fb_create, .atomic_check = vkms_atomic_check, .atomic_commit = drm_atomic_helper_commit, }; static const struct drm_mode_config_helper_funcs vkms_mode_config_helpers = { .atomic_commit_tail = vkms_atomic_commit_tail, }; static int vkms_modeset_init(struct vkms_device *vkmsdev) { struct drm_device *dev = &vkmsdev->drm; int ret; ret = drmm_mode_config_init(dev); if (ret) return ret; dev->mode_config.funcs = &vkms_mode_funcs; dev->mode_config.min_width = XRES_MIN; dev->mode_config.min_height = YRES_MIN; dev->mode_config.max_width = XRES_MAX; dev->mode_config.max_height = YRES_MAX; dev->mode_config.cursor_width = 512; dev->mode_config.cursor_height = 512; /* FIXME: There's a confusion between bpp and depth between this and * fbdev helpers. We have to go with 0, meaning "pick the default", * which ix XRGB8888 in all cases. */ dev->mode_config.preferred_depth = 0; dev->mode_config.helper_private = &vkms_mode_config_helpers; return vkms_output_init(vkmsdev, 0); } static int vkms_create(struct vkms_config *config) { int ret; struct platform_device *pdev; struct vkms_device *vkms_device; pdev = platform_device_register_simple(DRIVER_NAME, -1, NULL, 0); if (IS_ERR(pdev)) return PTR_ERR(pdev); if (!devres_open_group(&pdev->dev, NULL, GFP_KERNEL)) { ret = -ENOMEM; goto out_unregister; } vkms_device = devm_drm_dev_alloc(&pdev->dev, &vkms_driver, struct vkms_device, drm); if (IS_ERR(vkms_device)) { ret = PTR_ERR(vkms_device); goto out_devres; } vkms_device->platform = pdev; vkms_device->config = config; config->dev = vkms_device; ret = dma_coerce_mask_and_coherent(vkms_device->drm.dev, DMA_BIT_MASK(64)); if (ret) { DRM_ERROR("Could not initialize DMA support\n"); goto out_devres; } ret = drm_vblank_init(&vkms_device->drm, 1); if (ret) { DRM_ERROR("Failed to vblank\n"); goto out_devres; } ret = vkms_modeset_init(vkms_device); if (ret) goto out_devres; drm_debugfs_add_files(&vkms_device->drm, vkms_config_debugfs_list, ARRAY_SIZE(vkms_config_debugfs_list)); ret = drm_dev_register(&vkms_device->drm, 0); if (ret) goto out_devres; drm_fbdev_shmem_setup(&vkms_device->drm, 0); return 0; out_devres: devres_release_group(&pdev->dev, NULL); out_unregister: platform_device_unregister(pdev); return ret; } static int __init vkms_init(void) { int ret; struct vkms_config *config; config = kmalloc(sizeof(*config), GFP_KERNEL); if (!config) return -ENOMEM; default_config = config; config->cursor = enable_cursor; config->writeback = enable_writeback; config->overlay = enable_overlay; ret = vkms_create(config); if (ret) kfree(config); return ret; } static void vkms_destroy(struct vkms_config *config) { struct platform_device *pdev; if (!config->dev) { DRM_INFO("vkms_device is NULL.\n"); return; } pdev = config->dev->platform; drm_dev_unregister(&config->dev->drm); drm_atomic_helper_shutdown(&config->dev->drm); devres_release_group(&pdev->dev, NULL); platform_device_unregister(pdev); config->dev = NULL; } static void __exit vkms_exit(void) { if (default_config->dev) vkms_destroy(default_config); kfree(default_config); } module_init(vkms_init); module_exit(vkms_exit); MODULE_AUTHOR("Haneen Mohammed <hamohammed.sa@gmail.com>"); MODULE_AUTHOR("Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 // SPDX-License-Identifier: GPL-2.0-only /* * Driver for DiBcom DiB3000MC/P-demodulator. * * Copyright (C) 2004-7 DiBcom (http://www.dibcom.fr/) * Copyright (C) 2004-5 Patrick Boettcher (patrick.boettcher@posteo.de) * * This code is partially based on the previous dib3000mc.c . */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/slab.h> #include <linux/i2c.h> #include <media/dvb_frontend.h> #include "dib3000mc.h" static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "turn on debugging (default: 0)"); static int buggy_sfn_workaround; module_param(buggy_sfn_workaround, int, 0644); MODULE_PARM_DESC(buggy_sfn_workaround, "Enable work-around for buggy SFNs (default: 0)"); #define dprintk(fmt, arg...) do { \ if (debug) \ printk(KERN_DEBUG pr_fmt("%s: " fmt), \ __func__, ##arg); \ } while (0) struct dib3000mc_state { struct dvb_frontend demod; struct dib3000mc_config *cfg; u8 i2c_addr; struct i2c_adapter *i2c_adap; struct dibx000_i2c_master i2c_master; u32 timf; u32 current_bandwidth; u16 dev_id; u8 sfn_workaround_active :1; }; static u16 dib3000mc_read_word(struct dib3000mc_state *state, u16 reg) { struct i2c_msg msg[2] = { { .addr = state->i2c_addr >> 1, .flags = 0, .len = 2 }, { .addr = state->i2c_addr >> 1, .flags = I2C_M_RD, .len = 2 }, }; u16 word; u8 *b; b = kmalloc(4, GFP_KERNEL); if (!b) return 0; b[0] = (reg >> 8) | 0x80; b[1] = reg; b[2] = 0; b[3] = 0; msg[0].buf = b; msg[1].buf = b + 2; if (i2c_transfer(state->i2c_adap, msg, 2) != 2) dprintk("i2c read error on %d\n",reg); word = (b[2] << 8) | b[3]; kfree(b); return word; } static int dib3000mc_write_word(struct dib3000mc_state *state, u16 reg, u16 val) { struct i2c_msg msg = { .addr = state->i2c_addr >> 1, .flags = 0, .len = 4 }; int rc; u8 *b; b = kmalloc(4, GFP_KERNEL); if (!b) return -ENOMEM; b[0] = reg >> 8; b[1] = reg; b[2] = val >> 8; b[3] = val; msg.buf = b; rc = i2c_transfer(state->i2c_adap, &msg, 1) != 1 ? -EREMOTEIO : 0; kfree(b); return rc; } static int dib3000mc_identify(struct dib3000mc_state *state) { u16 value; if ((value = dib3000mc_read_word(state, 1025)) != 0x01b3) { dprintk("-E- DiB3000MC/P: wrong Vendor ID (read=0x%x)\n",value); return -EREMOTEIO; } value = dib3000mc_read_word(state, 1026); if (value != 0x3001 && value != 0x3002) { dprintk("-E- DiB3000MC/P: wrong Device ID (%x)\n",value); return -EREMOTEIO; } state->dev_id = value; dprintk("-I- found DiB3000MC/P: %x\n",state->dev_id); return 0; } static int dib3000mc_set_timing(struct dib3000mc_state *state, s16 nfft, u32 bw, u8 update_offset) { u32 timf; if (state->timf == 0) { timf = 1384402; // default value for 8MHz if (update_offset) msleep(200); // first time we do an update } else timf = state->timf; timf *= (bw / 1000); if (update_offset) { s16 tim_offs = dib3000mc_read_word(state, 416); if (tim_offs & 0x2000) tim_offs -= 0x4000; if (nfft == TRANSMISSION_MODE_2K) tim_offs *= 4; timf += tim_offs; state->timf = timf / (bw / 1000); } dprintk("timf: %d\n", timf); dib3000mc_write_word(state, 23, (u16) (timf >> 16)); dib3000mc_write_word(state, 24, (u16) (timf ) & 0xffff); return 0; } static int dib3000mc_setup_pwm_state(struct dib3000mc_state *state) { u16 reg_51, reg_52 = state->cfg->agc->setup & 0xfefb; if (state->cfg->pwm3_inversion) { reg_51 = (2 << 14) | (0 << 10) | (7 << 6) | (2 << 2) | (2 << 0); reg_52 |= (1 << 2); } else { reg_51 = (2 << 14) | (4 << 10) | (7 << 6) | (2 << 2) | (2 << 0); reg_52 |= (1 << 8); } dib3000mc_write_word(state, 51, reg_51); dib3000mc_write_word(state, 52, reg_52); if (state->cfg->use_pwm3) dib3000mc_write_word(state, 245, (1 << 3) | (1 << 0)); else dib3000mc_write_word(state, 245, 0); dib3000mc_write_word(state, 1040, 0x3); return 0; } static int dib3000mc_set_output_mode(struct dib3000mc_state *state, int mode) { int ret = 0; u16 fifo_threshold = 1792; u16 outreg = 0; u16 outmode = 0; u16 elecout = 1; u16 smo_reg = dib3000mc_read_word(state, 206) & 0x0010; /* keep the pid_parse bit */ dprintk("-I- Setting output mode for demod %p to %d\n", &state->demod, mode); switch (mode) { case OUTMODE_HIGH_Z: // disable elecout = 0; break; case OUTMODE_MPEG2_PAR_GATED_CLK: // STBs with parallel gated clock outmode = 0; break; case OUTMODE_MPEG2_PAR_CONT_CLK: // STBs with parallel continues clock outmode = 1; break; case OUTMODE_MPEG2_SERIAL: // STBs with serial input outmode = 2; break; case OUTMODE_MPEG2_FIFO: // e.g. USB feeding elecout = 3; /*ADDR @ 206 : P_smo_error_discard [1;6:6] = 0 P_smo_rs_discard [1;5:5] = 0 P_smo_pid_parse [1;4:4] = 0 P_smo_fifo_flush [1;3:3] = 0 P_smo_mode [2;2:1] = 11 P_smo_ovf_prot [1;0:0] = 0 */ smo_reg |= 3 << 1; fifo_threshold = 512; outmode = 5; break; case OUTMODE_DIVERSITY: outmode = 4; elecout = 1; break; default: dprintk("Unhandled output_mode passed to be set for demod %p\n",&state->demod); outmode = 0; break; } if ((state->cfg->output_mpeg2_in_188_bytes)) smo_reg |= (1 << 5); // P_smo_rs_discard [1;5:5] = 1 outreg = dib3000mc_read_word(state, 244) & 0x07FF; outreg |= (outmode << 11); ret |= dib3000mc_write_word(state, 244, outreg); ret |= dib3000mc_write_word(state, 206, smo_reg); /*smo_ mode*/ ret |= dib3000mc_write_word(state, 207, fifo_threshold); /* synchronous fread */ ret |= dib3000mc_write_word(state, 1040, elecout); /* P_out_cfg */ return ret; } static int dib3000mc_set_bandwidth(struct dib3000mc_state *state, u32 bw) { u16 bw_cfg[6] = { 0 }; u16 imp_bw_cfg[3] = { 0 }; u16 reg; /* settings here are for 27.7MHz */ switch (bw) { case 8000: bw_cfg[0] = 0x0019; bw_cfg[1] = 0x5c30; bw_cfg[2] = 0x0054; bw_cfg[3] = 0x88a0; bw_cfg[4] = 0x01a6; bw_cfg[5] = 0xab20; imp_bw_cfg[0] = 0x04db; imp_bw_cfg[1] = 0x00db; imp_bw_cfg[2] = 0x00b7; break; case 7000: bw_cfg[0] = 0x001c; bw_cfg[1] = 0xfba5; bw_cfg[2] = 0x0060; bw_cfg[3] = 0x9c25; bw_cfg[4] = 0x01e3; bw_cfg[5] = 0x0cb7; imp_bw_cfg[0] = 0x04c0; imp_bw_cfg[1] = 0x00c0; imp_bw_cfg[2] = 0x00a0; break; case 6000: bw_cfg[0] = 0x0021; bw_cfg[1] = 0xd040; bw_cfg[2] = 0x0070; bw_cfg[3] = 0xb62b; bw_cfg[4] = 0x0233; bw_cfg[5] = 0x8ed5; imp_bw_cfg[0] = 0x04a5; imp_bw_cfg[1] = 0x00a5; imp_bw_cfg[2] = 0x0089; break; case 5000: bw_cfg[0] = 0x0028; bw_cfg[1] = 0x9380; bw_cfg[2] = 0x0087; bw_cfg[3] = 0x4100; bw_cfg[4] = 0x02a4; bw_cfg[5] = 0x4500; imp_bw_cfg[0] = 0x0489; imp_bw_cfg[1] = 0x0089; imp_bw_cfg[2] = 0x0072; break; default: return -EINVAL; } for (reg = 6; reg < 12; reg++) dib3000mc_write_word(state, reg, bw_cfg[reg - 6]); dib3000mc_write_word(state, 12, 0x0000); dib3000mc_write_word(state, 13, 0x03e8); dib3000mc_write_word(state, 14, 0x0000); dib3000mc_write_word(state, 15, 0x03f2); dib3000mc_write_word(state, 16, 0x0001); dib3000mc_write_word(state, 17, 0xb0d0); // P_sec_len dib3000mc_write_word(state, 18, 0x0393); dib3000mc_write_word(state, 19, 0x8700); for (reg = 55; reg < 58; reg++) dib3000mc_write_word(state, reg, imp_bw_cfg[reg - 55]); // Timing configuration dib3000mc_set_timing(state, TRANSMISSION_MODE_2K, bw, 0); return 0; } static u16 impulse_noise_val[29] = { 0x38, 0x6d9, 0x3f28, 0x7a7, 0x3a74, 0x196, 0x32a, 0x48c, 0x3ffe, 0x7f3, 0x2d94, 0x76, 0x53d, 0x3ff8, 0x7e3, 0x3320, 0x76, 0x5b3, 0x3feb, 0x7d2, 0x365e, 0x76, 0x48c, 0x3ffe, 0x5b3, 0x3feb, 0x76, 0x0000, 0xd }; static void dib3000mc_set_impulse_noise(struct dib3000mc_state *state, u8 mode, s16 nfft) { u16 i; for (i = 58; i < 87; i++) dib3000mc_write_word(state, i, impulse_noise_val[i-58]); if (nfft == TRANSMISSION_MODE_8K) { dib3000mc_write_word(state, 58, 0x3b); dib3000mc_write_word(state, 84, 0x00); dib3000mc_write_word(state, 85, 0x8200); } dib3000mc_write_word(state, 34, 0x1294); dib3000mc_write_word(state, 35, 0x1ff8); if (mode == 1) dib3000mc_write_word(state, 55, dib3000mc_read_word(state, 55) | (1 << 10)); } static int dib3000mc_init(struct dvb_frontend *demod) { struct dib3000mc_state *state = demod->demodulator_priv; struct dibx000_agc_config *agc = state->cfg->agc; // Restart Configuration dib3000mc_write_word(state, 1027, 0x8000); dib3000mc_write_word(state, 1027, 0x0000); // power up the demod + mobility configuration dib3000mc_write_word(state, 140, 0x0000); dib3000mc_write_word(state, 1031, 0); if (state->cfg->mobile_mode) { dib3000mc_write_word(state, 139, 0x0000); dib3000mc_write_word(state, 141, 0x0000); dib3000mc_write_word(state, 175, 0x0002); dib3000mc_write_word(state, 1032, 0x0000); } else { dib3000mc_write_word(state, 139, 0x0001); dib3000mc_write_word(state, 141, 0x0000); dib3000mc_write_word(state, 175, 0x0000); dib3000mc_write_word(state, 1032, 0x012C); } dib3000mc_write_word(state, 1033, 0x0000); // P_clk_cfg dib3000mc_write_word(state, 1037, 0x3130); // other configurations // P_ctrl_sfreq dib3000mc_write_word(state, 33, (5 << 0)); dib3000mc_write_word(state, 88, (1 << 10) | (0x10 << 0)); // Phase noise control // P_fft_phacor_inh, P_fft_phacor_cpe, P_fft_powrange dib3000mc_write_word(state, 99, (1 << 9) | (0x20 << 0)); if (state->cfg->phase_noise_mode == 0) dib3000mc_write_word(state, 111, 0x00); else dib3000mc_write_word(state, 111, 0x02); // P_agc_global dib3000mc_write_word(state, 50, 0x8000); // agc setup misc dib3000mc_setup_pwm_state(state); // P_agc_counter_lock dib3000mc_write_word(state, 53, 0x87); // P_agc_counter_unlock dib3000mc_write_word(state, 54, 0x87); /* agc */ dib3000mc_write_word(state, 36, state->cfg->max_time); dib3000mc_write_word(state, 37, (state->cfg->agc_command1 << 13) | (state->cfg->agc_command2 << 12) | (0x1d << 0)); dib3000mc_write_word(state, 38, state->cfg->pwm3_value); dib3000mc_write_word(state, 39, state->cfg->ln_adc_level); // set_agc_loop_Bw dib3000mc_write_word(state, 40, 0x0179); dib3000mc_write_word(state, 41, 0x03f0); dib3000mc_write_word(state, 42, agc->agc1_max); dib3000mc_write_word(state, 43, agc->agc1_min); dib3000mc_write_word(state, 44, agc->agc2_max); dib3000mc_write_word(state, 45, agc->agc2_min); dib3000mc_write_word(state, 46, (agc->agc1_pt1 << 8) | agc->agc1_pt2); dib3000mc_write_word(state, 47, (agc->agc1_slope1 << 8) | agc->agc1_slope2); dib3000mc_write_word(state, 48, (agc->agc2_pt1 << 8) | agc->agc2_pt2); dib3000mc_write_word(state, 49, (agc->agc2_slope1 << 8) | agc->agc2_slope2); // Begin: TimeOut registers // P_pha3_thres dib3000mc_write_word(state, 110, 3277); // P_timf_alpha = 6, P_corm_alpha = 6, P_corm_thres = 0x80 dib3000mc_write_word(state, 26, 0x6680); // lock_mask0 dib3000mc_write_word(state, 1, 4); // lock_mask1 dib3000mc_write_word(state, 2, 4); // lock_mask2 dib3000mc_write_word(state, 3, 0x1000); // P_search_maxtrial=1 dib3000mc_write_word(state, 5, 1); dib3000mc_set_bandwidth(state, 8000); // div_lock_mask dib3000mc_write_word(state, 4, 0x814); dib3000mc_write_word(state, 21, (1 << 9) | 0x164); dib3000mc_write_word(state, 22, 0x463d); // Spurious rm cfg // P_cspu_regul, P_cspu_win_cut dib3000mc_write_word(state, 120, 0x200f); // P_adp_selec_monit dib3000mc_write_word(state, 134, 0); // Fec cfg dib3000mc_write_word(state, 195, 0x10); // diversity register: P_dvsy_sync_wait.. dib3000mc_write_word(state, 180, 0x2FF0); // Impulse noise configuration dib3000mc_set_impulse_noise(state, 0, TRANSMISSION_MODE_8K); // output mode set-up dib3000mc_set_output_mode(state, OUTMODE_HIGH_Z); /* close the i2c-gate */ dib3000mc_write_word(state, 769, (1 << 7) ); return 0; } static int dib3000mc_sleep(struct dvb_frontend *demod) { struct dib3000mc_state *state = demod->demodulator_priv; dib3000mc_write_word(state, 1031, 0xFFFF); dib3000mc_write_word(state, 1032, 0xFFFF); dib3000mc_write_word(state, 1033, 0xFFF0); return 0; } static void dib3000mc_set_adp_cfg(struct dib3000mc_state *state, s16 qam) { u16 cfg[4] = { 0 },reg; switch (qam) { case QPSK: cfg[0] = 0x099a; cfg[1] = 0x7fae; cfg[2] = 0x0333; cfg[3] = 0x7ff0; break; case QAM_16: cfg[0] = 0x023d; cfg[1] = 0x7fdf; cfg[2] = 0x00a4; cfg[3] = 0x7ff0; break; case QAM_64: cfg[0] = 0x0148; cfg[1] = 0x7ff0; cfg[2] = 0x00a4; cfg[3] = 0x7ff8; break; } for (reg = 129; reg < 133; reg++) dib3000mc_write_word(state, reg, cfg[reg - 129]); } static void dib3000mc_set_channel_cfg(struct dib3000mc_state *state, struct dtv_frontend_properties *ch, u16 seq) { u16 value; u32 bw = BANDWIDTH_TO_KHZ(ch->bandwidth_hz); dib3000mc_set_bandwidth(state, bw); dib3000mc_set_timing(state, ch->transmission_mode, bw, 0); #if 1 dib3000mc_write_word(state, 100, (16 << 6) + 9); #else if (boost) dib3000mc_write_word(state, 100, (11 << 6) + 6); else dib3000mc_write_word(state, 100, (16 << 6) + 9); #endif dib3000mc_write_word(state, 1027, 0x0800); dib3000mc_write_word(state, 1027, 0x0000); //Default cfg isi offset adp dib3000mc_write_word(state, 26, 0x6680); dib3000mc_write_word(state, 29, 0x1273); dib3000mc_write_word(state, 33, 5); dib3000mc_set_adp_cfg(state, QAM_16); dib3000mc_write_word(state, 133, 15564); dib3000mc_write_word(state, 12 , 0x0); dib3000mc_write_word(state, 13 , 0x3e8); dib3000mc_write_word(state, 14 , 0x0); dib3000mc_write_word(state, 15 , 0x3f2); dib3000mc_write_word(state, 93,0); dib3000mc_write_word(state, 94,0); dib3000mc_write_word(state, 95,0); dib3000mc_write_word(state, 96,0); dib3000mc_write_word(state, 97,0); dib3000mc_write_word(state, 98,0); dib3000mc_set_impulse_noise(state, 0, ch->transmission_mode); value = 0; switch (ch->transmission_mode) { case TRANSMISSION_MODE_2K: value |= (0 << 7); break; default: case TRANSMISSION_MODE_8K: value |= (1 << 7); break; } switch (ch->guard_interval) { case GUARD_INTERVAL_1_32: value |= (0 << 5); break; case GUARD_INTERVAL_1_16: value |= (1 << 5); break; case GUARD_INTERVAL_1_4: value |= (3 << 5); break; default: case GUARD_INTERVAL_1_8: value |= (2 << 5); break; } switch (ch->modulation) { case QPSK: value |= (0 << 3); break; case QAM_16: value |= (1 << 3); break; default: case QAM_64: value |= (2 << 3); break; } switch (HIERARCHY_1) { case HIERARCHY_2: value |= 2; break; case HIERARCHY_4: value |= 4; break; default: case HIERARCHY_1: value |= 1; break; } dib3000mc_write_word(state, 0, value); dib3000mc_write_word(state, 5, (1 << 8) | ((seq & 0xf) << 4)); value = 0; if (ch->hierarchy == 1) value |= (1 << 4); if (1 == 1) value |= 1; switch ((ch->hierarchy == 0 || 1 == 1) ? ch->code_rate_HP : ch->code_rate_LP) { case FEC_2_3: value |= (2 << 1); break; case FEC_3_4: value |= (3 << 1); break; case FEC_5_6: value |= (5 << 1); break; case FEC_7_8: value |= (7 << 1); break; default: case FEC_1_2: value |= (1 << 1); break; } dib3000mc_write_word(state, 181, value); // diversity synchro delay add 50% SFN margin switch (ch->transmission_mode) { case TRANSMISSION_MODE_8K: value = 256; break; case TRANSMISSION_MODE_2K: default: value = 64; break; } switch (ch->guard_interval) { case GUARD_INTERVAL_1_16: value *= 2; break; case GUARD_INTERVAL_1_8: value *= 4; break; case GUARD_INTERVAL_1_4: value *= 8; break; default: case GUARD_INTERVAL_1_32: value *= 1; break; } value <<= 4; value |= dib3000mc_read_word(state, 180) & 0x000f; dib3000mc_write_word(state, 180, value); // restart demod value = dib3000mc_read_word(state, 0); dib3000mc_write_word(state, 0, value | (1 << 9)); dib3000mc_write_word(state, 0, value); msleep(30); dib3000mc_set_impulse_noise(state, state->cfg->impulse_noise_mode, ch->transmission_mode); } static int dib3000mc_autosearch_start(struct dvb_frontend *demod) { struct dtv_frontend_properties *chan = &demod->dtv_property_cache; struct dib3000mc_state *state = demod->demodulator_priv; u16 reg; // u32 val; struct dtv_frontend_properties schan; schan = *chan; /* TODO what is that ? */ /* a channel for autosearch */ schan.transmission_mode = TRANSMISSION_MODE_8K; schan.guard_interval = GUARD_INTERVAL_1_32; schan.modulation = QAM_64; schan.code_rate_HP = FEC_2_3; schan.code_rate_LP = FEC_2_3; schan.hierarchy = 0; dib3000mc_set_channel_cfg(state, &schan, 11); reg = dib3000mc_read_word(state, 0); dib3000mc_write_word(state, 0, reg | (1 << 8)); dib3000mc_read_word(state, 511); dib3000mc_write_word(state, 0, reg); return 0; } static int dib3000mc_autosearch_is_irq(struct dvb_frontend *demod) { struct dib3000mc_state *state = demod->demodulator_priv; u16 irq_pending = dib3000mc_read_word(state, 511); if (irq_pending & 0x1) // failed return 1; if (irq_pending & 0x2) // succeeded return 2; return 0; // still pending } static int dib3000mc_tune(struct dvb_frontend *demod) { struct dtv_frontend_properties *ch = &demod->dtv_property_cache; struct dib3000mc_state *state = demod->demodulator_priv; // ** configure demod ** dib3000mc_set_channel_cfg(state, ch, 0); // activates isi if (state->sfn_workaround_active) { dprintk("SFN workaround is active\n"); dib3000mc_write_word(state, 29, 0x1273); dib3000mc_write_word(state, 108, 0x4000); // P_pha3_force_pha_shift } else { dib3000mc_write_word(state, 29, 0x1073); dib3000mc_write_word(state, 108, 0x0000); // P_pha3_force_pha_shift } dib3000mc_set_adp_cfg(state, (u8)ch->modulation); if (ch->transmission_mode == TRANSMISSION_MODE_8K) { dib3000mc_write_word(state, 26, 38528); dib3000mc_write_word(state, 33, 8); } else { dib3000mc_write_word(state, 26, 30336); dib3000mc_write_word(state, 33, 6); } if (dib3000mc_read_word(state, 509) & 0x80) dib3000mc_set_timing(state, ch->transmission_mode, BANDWIDTH_TO_KHZ(ch->bandwidth_hz), 1); return 0; } struct i2c_adapter * dib3000mc_get_tuner_i2c_master(struct dvb_frontend *demod, int gating) { struct dib3000mc_state *st = demod->demodulator_priv; return dibx000_get_i2c_adapter(&st->i2c_master, DIBX000_I2C_INTERFACE_TUNER, gating); } EXPORT_SYMBOL(dib3000mc_get_tuner_i2c_master); static int dib3000mc_get_frontend(struct dvb_frontend* fe, struct dtv_frontend_properties *fep) { struct dib3000mc_state *state = fe->demodulator_priv; u16 tps = dib3000mc_read_word(state,458); fep->inversion = INVERSION_AUTO; fep->bandwidth_hz = state->current_bandwidth; switch ((tps >> 8) & 0x1) { case 0: fep->transmission_mode = TRANSMISSION_MODE_2K; break; case 1: fep->transmission_mode = TRANSMISSION_MODE_8K; break; } switch (tps & 0x3) { case 0: fep->guard_interval = GUARD_INTERVAL_1_32; break; case 1: fep->guard_interval = GUARD_INTERVAL_1_16; break; case 2: fep->guard_interval = GUARD_INTERVAL_1_8; break; case 3: fep->guard_interval = GUARD_INTERVAL_1_4; break; } switch ((tps >> 13) & 0x3) { case 0: fep->modulation = QPSK; break; case 1: fep->modulation = QAM_16; break; case 2: default: fep->modulation = QAM_64; break; } /* as long as the frontend_param structure is fixed for hierarchical transmission I refuse to use it */ /* (tps >> 12) & 0x1 == hrch is used, (tps >> 9) & 0x7 == alpha */ fep->hierarchy = HIERARCHY_NONE; switch ((tps >> 5) & 0x7) { case 1: fep->code_rate_HP = FEC_1_2; break; case 2: fep->code_rate_HP = FEC_2_3; break; case 3: fep->code_rate_HP = FEC_3_4; break; case 5: fep->code_rate_HP = FEC_5_6; break; case 7: default: fep->code_rate_HP = FEC_7_8; break; } switch ((tps >> 2) & 0x7) { case 1: fep->code_rate_LP = FEC_1_2; break; case 2: fep->code_rate_LP = FEC_2_3; break; case 3: fep->code_rate_LP = FEC_3_4; break; case 5: fep->code_rate_LP = FEC_5_6; break; case 7: default: fep->code_rate_LP = FEC_7_8; break; } return 0; } static int dib3000mc_set_frontend(struct dvb_frontend *fe) { struct dtv_frontend_properties *fep = &fe->dtv_property_cache; struct dib3000mc_state *state = fe->demodulator_priv; int ret; dib3000mc_set_output_mode(state, OUTMODE_HIGH_Z); state->current_bandwidth = fep->bandwidth_hz; dib3000mc_set_bandwidth(state, BANDWIDTH_TO_KHZ(fep->bandwidth_hz)); /* maybe the parameter has been changed */ state->sfn_workaround_active = buggy_sfn_workaround; if (fe->ops.tuner_ops.set_params) { fe->ops.tuner_ops.set_params(fe); msleep(100); } if (fep->transmission_mode == TRANSMISSION_MODE_AUTO || fep->guard_interval == GUARD_INTERVAL_AUTO || fep->modulation == QAM_AUTO || fep->code_rate_HP == FEC_AUTO) { int i = 1000, found; dib3000mc_autosearch_start(fe); do { msleep(1); found = dib3000mc_autosearch_is_irq(fe); } while (found == 0 && i--); dprintk("autosearch returns: %d\n",found); if (found == 0 || found == 1) return 0; // no channel found dib3000mc_get_frontend(fe, fep); } ret = dib3000mc_tune(fe); /* make this a config parameter */ dib3000mc_set_output_mode(state, OUTMODE_MPEG2_FIFO); return ret; } static int dib3000mc_read_status(struct dvb_frontend *fe, enum fe_status *stat) { struct dib3000mc_state *state = fe->demodulator_priv; u16 lock = dib3000mc_read_word(state, 509); *stat = 0; if (lock & 0x8000) *stat |= FE_HAS_SIGNAL; if (lock & 0x3000) *stat |= FE_HAS_CARRIER; if (lock & 0x0100) *stat |= FE_HAS_VITERBI; if (lock & 0x0010) *stat |= FE_HAS_SYNC; if (lock & 0x0008) *stat |= FE_HAS_LOCK; return 0; } static int dib3000mc_read_ber(struct dvb_frontend *fe, u32 *ber) { struct dib3000mc_state *state = fe->demodulator_priv; *ber = (dib3000mc_read_word(state, 500) << 16) | dib3000mc_read_word(state, 501); return 0; } static int dib3000mc_read_unc_blocks(struct dvb_frontend *fe, u32 *unc) { struct dib3000mc_state *state = fe->demodulator_priv; *unc = dib3000mc_read_word(state, 508); return 0; } static int dib3000mc_read_signal_strength(struct dvb_frontend *fe, u16 *strength) { struct dib3000mc_state *state = fe->demodulator_priv; u16 val = dib3000mc_read_word(state, 392); *strength = 65535 - val; return 0; } static int dib3000mc_read_snr(struct dvb_frontend* fe, u16 *snr) { *snr = 0x0000; return 0; } static int dib3000mc_fe_get_tune_settings(struct dvb_frontend* fe, struct dvb_frontend_tune_settings *tune) { tune->min_delay_ms = 1000; return 0; } static void dib3000mc_release(struct dvb_frontend *fe) { struct dib3000mc_state *state = fe->demodulator_priv; dibx000_exit_i2c_master(&state->i2c_master); kfree(state); } int dib3000mc_pid_control(struct dvb_frontend *fe, int index, int pid,int onoff) { struct dib3000mc_state *state = fe->demodulator_priv; dib3000mc_write_word(state, 212 + index, onoff ? (1 << 13) | pid : 0); return 0; } EXPORT_SYMBOL(dib3000mc_pid_control); int dib3000mc_pid_parse(struct dvb_frontend *fe, int onoff) { struct dib3000mc_state *state = fe->demodulator_priv; u16 tmp = dib3000mc_read_word(state, 206) & ~(1 << 4); tmp |= (onoff << 4); return dib3000mc_write_word(state, 206, tmp); } EXPORT_SYMBOL(dib3000mc_pid_parse); void dib3000mc_set_config(struct dvb_frontend *fe, struct dib3000mc_config *cfg) { struct dib3000mc_state *state = fe->demodulator_priv; state->cfg = cfg; } EXPORT_SYMBOL(dib3000mc_set_config); int dib3000mc_i2c_enumeration(struct i2c_adapter *i2c, int no_of_demods, u8 default_addr, struct dib3000mc_config cfg[]) { struct dib3000mc_state *dmcst; int k; u8 new_addr; static const u8 DIB3000MC_I2C_ADDRESS[] = { 20, 22, 24, 26 }; dmcst = kzalloc(sizeof(struct dib3000mc_state), GFP_KERNEL); if (dmcst == NULL) return -ENOMEM; dmcst->i2c_adap = i2c; for (k = no_of_demods-1; k >= 0; k--) { dmcst->cfg = &cfg[k]; /* designated i2c address */ new_addr = DIB3000MC_I2C_ADDRESS[k]; dmcst->i2c_addr = new_addr; if (dib3000mc_identify(dmcst) != 0) { dmcst->i2c_addr = default_addr; if (dib3000mc_identify(dmcst) != 0) { dprintk("-E- DiB3000P/MC #%d: not identified\n", k); kfree(dmcst); return -ENODEV; } } dib3000mc_set_output_mode(dmcst, OUTMODE_MPEG2_PAR_CONT_CLK); // set new i2c address and force divstr (Bit 1) to value 0 (Bit 0) dib3000mc_write_word(dmcst, 1024, (new_addr << 3) | 0x1); dmcst->i2c_addr = new_addr; } for (k = 0; k < no_of_demods; k++) { dmcst->cfg = &cfg[k]; dmcst->i2c_addr = DIB3000MC_I2C_ADDRESS[k]; dib3000mc_write_word(dmcst, 1024, dmcst->i2c_addr << 3); /* turn off data output */ dib3000mc_set_output_mode(dmcst, OUTMODE_HIGH_Z); } kfree(dmcst); return 0; } EXPORT_SYMBOL(dib3000mc_i2c_enumeration); static const struct dvb_frontend_ops dib3000mc_ops; struct dvb_frontend * dib3000mc_attach(struct i2c_adapter *i2c_adap, u8 i2c_addr, struct dib3000mc_config *cfg) { struct dvb_frontend *demod; struct dib3000mc_state *st; st = kzalloc(sizeof(struct dib3000mc_state), GFP_KERNEL); if (st == NULL) return NULL; st->cfg = cfg; st->i2c_adap = i2c_adap; st->i2c_addr = i2c_addr; demod = &st->demod; demod->demodulator_priv = st; memcpy(&st->demod.ops, &dib3000mc_ops, sizeof(struct dvb_frontend_ops)); if (dib3000mc_identify(st) != 0) goto error; dibx000_init_i2c_master(&st->i2c_master, DIB3000MC, st->i2c_adap, st->i2c_addr); dib3000mc_write_word(st, 1037, 0x3130); return demod; error: kfree(st); return NULL; } EXPORT_SYMBOL_GPL(dib3000mc_attach); static const struct dvb_frontend_ops dib3000mc_ops = { .delsys = { SYS_DVBT }, .info = { .name = "DiBcom 3000MC/P", .frequency_min_hz = 44250 * kHz, .frequency_max_hz = 867250 * kHz, .frequency_stepsize_hz = 62500, .caps = FE_CAN_INVERSION_AUTO | FE_CAN_FEC_1_2 | FE_CAN_FEC_2_3 | FE_CAN_FEC_3_4 | FE_CAN_FEC_5_6 | FE_CAN_FEC_7_8 | FE_CAN_FEC_AUTO | FE_CAN_QPSK | FE_CAN_QAM_16 | FE_CAN_QAM_64 | FE_CAN_QAM_AUTO | FE_CAN_TRANSMISSION_MODE_AUTO | FE_CAN_GUARD_INTERVAL_AUTO | FE_CAN_RECOVER | FE_CAN_HIERARCHY_AUTO, }, .release = dib3000mc_release, .init = dib3000mc_init, .sleep = dib3000mc_sleep, .set_frontend = dib3000mc_set_frontend, .get_tune_settings = dib3000mc_fe_get_tune_settings, .get_frontend = dib3000mc_get_frontend, .read_status = dib3000mc_read_status, .read_ber = dib3000mc_read_ber, .read_signal_strength = dib3000mc_read_signal_strength, .read_snr = dib3000mc_read_snr, .read_ucblocks = dib3000mc_read_unc_blocks, }; MODULE_AUTHOR("Patrick Boettcher <patrick.boettcher@posteo.de>"); MODULE_DESCRIPTION("Driver for the DiBcom 3000MC/P COFDM demodulator"); MODULE_LICENSE("GPL");
23 23 22 23 23 23 21 2 22 19 3 22 22 10 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 // SPDX-License-Identifier: GPL-2.0-only /* * LED Class Core * * Copyright (C) 2005 John Lenz <lenz@cs.wisc.edu> * Copyright (C) 2005-2007 Richard Purdie <rpurdie@openedhand.com> */ #include <linux/ctype.h> #include <linux/device.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/leds.h> #include <linux/list.h> #include <linux/module.h> #include <linux/property.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <uapi/linux/uleds.h> #include <linux/of.h> #include "leds.h" static DEFINE_MUTEX(leds_lookup_lock); static LIST_HEAD(leds_lookup_list); static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); /* no lock needed for this */ led_update_brightness(led_cdev); return sprintf(buf, "%u\n", led_cdev->brightness); } static ssize_t brightness_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { struct led_classdev *led_cdev = dev_get_drvdata(dev); unsigned long state; ssize_t ret; mutex_lock(&led_cdev->led_access); if (led_sysfs_is_disabled(led_cdev)) { ret = -EBUSY; goto unlock; } ret = kstrtoul(buf, 10, &state); if (ret) goto unlock; if (state == LED_OFF) led_trigger_remove(led_cdev); led_set_brightness(led_cdev, state); flush_work(&led_cdev->set_brightness_work); ret = size; unlock: mutex_unlock(&led_cdev->led_access); return ret; } static DEVICE_ATTR_RW(brightness); static ssize_t max_brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); return sprintf(buf, "%u\n", led_cdev->max_brightness); } static DEVICE_ATTR_RO(max_brightness); #ifdef CONFIG_LEDS_TRIGGERS static BIN_ATTR(trigger, 0644, led_trigger_read, led_trigger_write, 0); static struct bin_attribute *led_trigger_bin_attrs[] = { &bin_attr_trigger, NULL, }; static const struct attribute_group led_trigger_group = { .bin_attrs = led_trigger_bin_attrs, }; #endif static struct attribute *led_class_attrs[] = { &dev_attr_brightness.attr, &dev_attr_max_brightness.attr, NULL, }; static const struct attribute_group led_group = { .attrs = led_class_attrs, }; static const struct attribute_group *led_groups[] = { &led_group, #ifdef CONFIG_LEDS_TRIGGERS &led_trigger_group, #endif NULL, }; #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED static ssize_t brightness_hw_changed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->brightness_hw_changed == -1) return -ENODATA; return sprintf(buf, "%u\n", led_cdev->brightness_hw_changed); } static DEVICE_ATTR_RO(brightness_hw_changed); static int led_add_brightness_hw_changed(struct led_classdev *led_cdev) { struct device *dev = led_cdev->dev; int ret; ret = device_create_file(dev, &dev_attr_brightness_hw_changed); if (ret) { dev_err(dev, "Error creating brightness_hw_changed\n"); return ret; } led_cdev->brightness_hw_changed_kn = sysfs_get_dirent(dev->kobj.sd, "brightness_hw_changed"); if (!led_cdev->brightness_hw_changed_kn) { dev_err(dev, "Error getting brightness_hw_changed kn\n"); device_remove_file(dev, &dev_attr_brightness_hw_changed); return -ENXIO; } return 0; } static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev) { sysfs_put(led_cdev->brightness_hw_changed_kn); device_remove_file(led_cdev->dev, &dev_attr_brightness_hw_changed); } void led_classdev_notify_brightness_hw_changed(struct led_classdev *led_cdev, unsigned int brightness) { if (WARN_ON(!led_cdev->brightness_hw_changed_kn)) return; led_cdev->brightness_hw_changed = brightness; sysfs_notify_dirent(led_cdev->brightness_hw_changed_kn); } EXPORT_SYMBOL_GPL(led_classdev_notify_brightness_hw_changed); #else static int led_add_brightness_hw_changed(struct led_classdev *led_cdev) { return 0; } static void led_remove_brightness_hw_changed(struct led_classdev *led_cdev) { } #endif /** * led_classdev_suspend - suspend an led_classdev. * @led_cdev: the led_classdev to suspend. */ void led_classdev_suspend(struct led_classdev *led_cdev) { led_cdev->flags |= LED_SUSPENDED; led_set_brightness_nopm(led_cdev, 0); flush_work(&led_cdev->set_brightness_work); } EXPORT_SYMBOL_GPL(led_classdev_suspend); /** * led_classdev_resume - resume an led_classdev. * @led_cdev: the led_classdev to resume. */ void led_classdev_resume(struct led_classdev *led_cdev) { led_set_brightness_nopm(led_cdev, led_cdev->brightness); if (led_cdev->flash_resume) led_cdev->flash_resume(led_cdev); led_cdev->flags &= ~LED_SUSPENDED; } EXPORT_SYMBOL_GPL(led_classdev_resume); #ifdef CONFIG_PM_SLEEP static int led_suspend(struct device *dev) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->flags & LED_CORE_SUSPENDRESUME) led_classdev_suspend(led_cdev); return 0; } static int led_resume(struct device *dev) { struct led_classdev *led_cdev = dev_get_drvdata(dev); if (led_cdev->flags & LED_CORE_SUSPENDRESUME) led_classdev_resume(led_cdev); return 0; } #endif static SIMPLE_DEV_PM_OPS(leds_class_dev_pm_ops, led_suspend, led_resume); static struct led_classdev *led_module_get(struct device *led_dev) { struct led_classdev *led_cdev; if (!led_dev) return ERR_PTR(-EPROBE_DEFER); led_cdev = dev_get_drvdata(led_dev); if (!try_module_get(led_cdev->dev->parent->driver->owner)) { put_device(led_cdev->dev); return ERR_PTR(-ENODEV); } return led_cdev; } static const struct class leds_class = { .name = "leds", .dev_groups = led_groups, .pm = &leds_class_dev_pm_ops, }; /** * of_led_get() - request a LED device via the LED framework * @np: device node to get the LED device from * @index: the index of the LED * * Returns the LED device parsed from the phandle specified in the "leds" * property of a device tree node or a negative error-code on failure. */ struct led_classdev *of_led_get(struct device_node *np, int index) { struct device *led_dev; struct device_node *led_node; led_node = of_parse_phandle(np, "leds", index); if (!led_node) return ERR_PTR(-ENOENT); led_dev = class_find_device_by_of_node(&leds_class, led_node); of_node_put(led_node); return led_module_get(led_dev); } EXPORT_SYMBOL_GPL(of_led_get); /** * led_put() - release a LED device * @led_cdev: LED device */ void led_put(struct led_classdev *led_cdev) { module_put(led_cdev->dev->parent->driver->owner); put_device(led_cdev->dev); } EXPORT_SYMBOL_GPL(led_put); static void devm_led_release(struct device *dev, void *res) { struct led_classdev **p = res; led_put(*p); } static struct led_classdev *__devm_led_get(struct device *dev, struct led_classdev *led) { struct led_classdev **dr; dr = devres_alloc(devm_led_release, sizeof(struct led_classdev *), GFP_KERNEL); if (!dr) { led_put(led); return ERR_PTR(-ENOMEM); } *dr = led; devres_add(dev, dr); return led; } /** * devm_of_led_get - Resource-managed request of a LED device * @dev: LED consumer * @index: index of the LED to obtain in the consumer * * The device node of the device is parse to find the request LED device. * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *__must_check devm_of_led_get(struct device *dev, int index) { struct led_classdev *led; if (!dev) return ERR_PTR(-EINVAL); led = of_led_get(dev->of_node, index); if (IS_ERR(led)) return led; return __devm_led_get(dev, led); } EXPORT_SYMBOL_GPL(devm_of_led_get); /** * led_get() - request a LED device via the LED framework * @dev: device for which to get the LED device * @con_id: name of the LED from the device's point of view * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *led_get(struct device *dev, char *con_id) { struct led_lookup_data *lookup; const char *provider = NULL; struct device *led_dev; mutex_lock(&leds_lookup_lock); list_for_each_entry(lookup, &leds_lookup_list, list) { if (!strcmp(lookup->dev_id, dev_name(dev)) && !strcmp(lookup->con_id, con_id)) { provider = kstrdup_const(lookup->provider, GFP_KERNEL); break; } } mutex_unlock(&leds_lookup_lock); if (!provider) return ERR_PTR(-ENOENT); led_dev = class_find_device_by_name(&leds_class, provider); kfree_const(provider); return led_module_get(led_dev); } EXPORT_SYMBOL_GPL(led_get); /** * devm_led_get() - request a LED device via the LED framework * @dev: device for which to get the LED device * @con_id: name of the LED from the device's point of view * * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device or ERR_PTR(errno) on failure. */ struct led_classdev *devm_led_get(struct device *dev, char *con_id) { struct led_classdev *led; led = led_get(dev, con_id); if (IS_ERR(led)) return led; return __devm_led_get(dev, led); } EXPORT_SYMBOL_GPL(devm_led_get); /** * led_add_lookup() - Add a LED lookup table entry * @led_lookup: the lookup table entry to add * * Add a LED lookup table entry. On systems without devicetree the lookup table * is used by led_get() to find LEDs. */ void led_add_lookup(struct led_lookup_data *led_lookup) { mutex_lock(&leds_lookup_lock); list_add_tail(&led_lookup->list, &leds_lookup_list); mutex_unlock(&leds_lookup_lock); } EXPORT_SYMBOL_GPL(led_add_lookup); /** * led_remove_lookup() - Remove a LED lookup table entry * @led_lookup: the lookup table entry to remove */ void led_remove_lookup(struct led_lookup_data *led_lookup) { mutex_lock(&leds_lookup_lock); list_del(&led_lookup->list); mutex_unlock(&leds_lookup_lock); } EXPORT_SYMBOL_GPL(led_remove_lookup); /** * devm_of_led_get_optional - Resource-managed request of an optional LED device * @dev: LED consumer * @index: index of the LED to obtain in the consumer * * The device node of the device is parsed to find the requested LED device. * The LED device returned from this function is automatically released * on driver detach. * * @return a pointer to a LED device, ERR_PTR(errno) on failure and NULL if the * led was not found. */ struct led_classdev *__must_check devm_of_led_get_optional(struct device *dev, int index) { struct led_classdev *led; led = devm_of_led_get(dev, index); if (IS_ERR(led) && PTR_ERR(led) == -ENOENT) return NULL; return led; } EXPORT_SYMBOL_GPL(devm_of_led_get_optional); static int led_classdev_next_name(const char *init_name, char *name, size_t len) { unsigned int i = 0; int ret = 0; struct device *dev; strscpy(name, init_name, len); while ((ret < len) && (dev = class_find_device_by_name(&leds_class, name))) { put_device(dev); ret = snprintf(name, len, "%s_%u", init_name, ++i); } if (ret >= len) return -ENOMEM; return i; } /** * led_classdev_register_ext - register a new object of led_classdev class * with init data. * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. * @init_data: LED class device initialization data */ int led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { char composed_name[LED_MAX_NAME_SIZE]; char final_name[LED_MAX_NAME_SIZE]; const char *proposed_name = composed_name; int ret; if (init_data) { if (init_data->devname_mandatory && !init_data->devicename) { dev_err(parent, "Mandatory device name is missing"); return -EINVAL; } ret = led_compose_name(parent, init_data, composed_name); if (ret < 0) return ret; if (init_data->fwnode) { fwnode_property_read_string(init_data->fwnode, "linux,default-trigger", &led_cdev->default_trigger); if (fwnode_property_present(init_data->fwnode, "retain-state-shutdown")) led_cdev->flags |= LED_RETAIN_AT_SHUTDOWN; fwnode_property_read_u32(init_data->fwnode, "max-brightness", &led_cdev->max_brightness); if (fwnode_property_present(init_data->fwnode, "color")) fwnode_property_read_u32(init_data->fwnode, "color", &led_cdev->color); } } else { proposed_name = led_cdev->name; } ret = led_classdev_next_name(proposed_name, final_name, sizeof(final_name)); if (ret < 0) return ret; else if (ret && led_cdev->flags & LED_REJECT_NAME_CONFLICT) return -EEXIST; else if (ret) dev_warn(parent, "Led %s renamed to %s due to name collision\n", proposed_name, final_name); if (led_cdev->color >= LED_COLOR_ID_MAX) dev_warn(parent, "LED %s color identifier out of range\n", final_name); mutex_init(&led_cdev->led_access); mutex_lock(&led_cdev->led_access); led_cdev->dev = device_create_with_groups(&leds_class, parent, 0, led_cdev, led_cdev->groups, "%s", final_name); if (IS_ERR(led_cdev->dev)) { mutex_unlock(&led_cdev->led_access); return PTR_ERR(led_cdev->dev); } if (init_data && init_data->fwnode) device_set_node(led_cdev->dev, init_data->fwnode); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) { ret = led_add_brightness_hw_changed(led_cdev); if (ret) { device_unregister(led_cdev->dev); led_cdev->dev = NULL; mutex_unlock(&led_cdev->led_access); return ret; } } led_cdev->work_flags = 0; #ifdef CONFIG_LEDS_TRIGGERS init_rwsem(&led_cdev->trigger_lock); #endif #ifdef CONFIG_LEDS_BRIGHTNESS_HW_CHANGED led_cdev->brightness_hw_changed = -1; #endif /* add to the list of leds */ down_write(&leds_list_lock); list_add_tail(&led_cdev->node, &leds_list); up_write(&leds_list_lock); if (!led_cdev->max_brightness) led_cdev->max_brightness = LED_FULL; led_update_brightness(led_cdev); led_init_core(led_cdev); #ifdef CONFIG_LEDS_TRIGGERS led_trigger_set_default(led_cdev); #endif mutex_unlock(&led_cdev->led_access); dev_dbg(parent, "Registered led device: %s\n", led_cdev->name); return 0; } EXPORT_SYMBOL_GPL(led_classdev_register_ext); /** * led_classdev_unregister - unregisters a object of led_properties class. * @led_cdev: the led device to unregister * * Unregisters a previously registered via led_classdev_register object. */ void led_classdev_unregister(struct led_classdev *led_cdev) { if (IS_ERR_OR_NULL(led_cdev->dev)) return; #ifdef CONFIG_LEDS_TRIGGERS down_write(&led_cdev->trigger_lock); if (led_cdev->trigger) led_trigger_set(led_cdev, NULL); up_write(&led_cdev->trigger_lock); #endif led_cdev->flags |= LED_UNREGISTERING; /* Stop blinking */ led_stop_software_blink(led_cdev); if (!(led_cdev->flags & LED_RETAIN_AT_SHUTDOWN)) led_set_brightness(led_cdev, LED_OFF); flush_work(&led_cdev->set_brightness_work); if (led_cdev->flags & LED_BRIGHT_HW_CHANGED) led_remove_brightness_hw_changed(led_cdev); device_unregister(led_cdev->dev); down_write(&leds_list_lock); list_del(&led_cdev->node); up_write(&leds_list_lock); mutex_destroy(&led_cdev->led_access); } EXPORT_SYMBOL_GPL(led_classdev_unregister); static void devm_led_classdev_release(struct device *dev, void *res) { led_classdev_unregister(*(struct led_classdev **)res); } /** * devm_led_classdev_register_ext - resource managed led_classdev_register_ext() * * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. * @init_data: LED class device initialization data */ int devm_led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data) { struct led_classdev **dr; int rc; dr = devres_alloc(devm_led_classdev_release, sizeof(*dr), GFP_KERNEL); if (!dr) return -ENOMEM; rc = led_classdev_register_ext(parent, led_cdev, init_data); if (rc) { devres_free(dr); return rc; } *dr = led_cdev; devres_add(parent, dr); return 0; } EXPORT_SYMBOL_GPL(devm_led_classdev_register_ext); static int devm_led_classdev_match(struct device *dev, void *res, void *data) { struct led_classdev **p = res; if (WARN_ON(!p || !*p)) return 0; return *p == data; } /** * devm_led_classdev_unregister() - resource managed led_classdev_unregister() * @dev: The device to unregister. * @led_cdev: the led_classdev structure for this device. */ void devm_led_classdev_unregister(struct device *dev, struct led_classdev *led_cdev) { WARN_ON(devres_release(dev, devm_led_classdev_release, devm_led_classdev_match, led_cdev)); } EXPORT_SYMBOL_GPL(devm_led_classdev_unregister); static int __init leds_init(void) { return class_register(&leds_class); } static void __exit leds_exit(void) { class_unregister(&leds_class); } subsys_initcall(leds_init); module_exit(leds_exit); MODULE_AUTHOR("John Lenz, Richard Purdie"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LED Class Interface");
78 78 77 15 78 77 1 1 9 70 78 18 18 78 78 61 20 2 18 2 76 77 40 38 4 4 71 71 71 61 76 46 77 60 1 16 46 29 23 71 1 1 1 70 58 70 11 11 11 11 7 11 8 4 4 4 11 11 10 11 8 8 72 72 73 72 10 45 67 7 7 70 70 62 7 69 69 1 70 1 70 61 11 70 72 70 41 43 69 8 69 17 38 13 13 36 41 24 6 3 9 2 7 10 10 8 2 1 1 1 1 8 8 9 8 9 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 6 6 6 6 2 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2005 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ /* * jfs_txnmgr.c: transaction manager * * notes: * transaction starts with txBegin() and ends with txCommit() * or txAbort(). * * tlock is acquired at the time of update; * (obviate scan at commit time for xtree and dtree) * tlock and mp points to each other; * (no hashlist for mp -> tlock). * * special cases: * tlock on in-memory inode: * in-place tlock in the in-memory inode itself; * converted to page lock by iWrite() at commit time. * * tlock during write()/mmap() under anonymous transaction (tid = 0): * transferred (?) to transaction at commit time. * * use the page itself to update allocation maps * (obviate intermediate replication of allocation/deallocation data) * hold on to mp+lock thru update of maps */ #include <linux/fs.h> #include <linux/vmalloc.h> #include <linux/completion.h> #include <linux/freezer.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/seq_file.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_dinode.h" #include "jfs_imap.h" #include "jfs_dmap.h" #include "jfs_superblock.h" #include "jfs_debug.h" /* * transaction management structures */ static struct { int freetid; /* index of a free tid structure */ int freelock; /* index first free lock word */ wait_queue_head_t freewait; /* eventlist of free tblock */ wait_queue_head_t freelockwait; /* eventlist of free tlock */ wait_queue_head_t lowlockwait; /* eventlist of ample tlocks */ int tlocksInUse; /* Number of tlocks in use */ spinlock_t LazyLock; /* synchronize sync_queue & unlock_queue */ /* struct tblock *sync_queue; * Transactions waiting for data sync */ struct list_head unlock_queue; /* Txns waiting to be released */ struct list_head anon_list; /* inodes having anonymous txns */ struct list_head anon_list2; /* inodes having anonymous txns that couldn't be sync'ed */ } TxAnchor; int jfs_tlocks_low; /* Indicates low number of available tlocks */ #ifdef CONFIG_JFS_STATISTICS static struct { uint txBegin; uint txBegin_barrier; uint txBegin_lockslow; uint txBegin_freetid; uint txBeginAnon; uint txBeginAnon_barrier; uint txBeginAnon_lockslow; uint txLockAlloc; uint txLockAlloc_freelock; } TxStat; #endif static int nTxBlock = -1; /* number of transaction blocks */ module_param(nTxBlock, int, 0); MODULE_PARM_DESC(nTxBlock, "Number of transaction blocks (max:65536)"); static int nTxLock = -1; /* number of transaction locks */ module_param(nTxLock, int, 0); MODULE_PARM_DESC(nTxLock, "Number of transaction locks (max:65536)"); struct tblock *TxBlock; /* transaction block table */ static int TxLockLWM; /* Low water mark for number of txLocks used */ static int TxLockHWM; /* High water mark for number of txLocks used */ static int TxLockVHWM; /* Very High water mark */ struct tlock *TxLock; /* transaction lock table */ /* * transaction management lock */ static DEFINE_SPINLOCK(jfsTxnLock); #define TXN_LOCK() spin_lock(&jfsTxnLock) #define TXN_UNLOCK() spin_unlock(&jfsTxnLock) #define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock) #define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) #define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags) static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait); static int jfs_commit_thread_waking; /* * Retry logic exist outside these macros to protect from spurrious wakeups. */ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(event, &wait); set_current_state(TASK_UNINTERRUPTIBLE); TXN_UNLOCK(); io_schedule(); remove_wait_queue(event, &wait); } #define TXN_SLEEP(event)\ {\ TXN_SLEEP_DROP_LOCK(event);\ TXN_LOCK();\ } #define TXN_WAKEUP(event) wake_up_all(event) /* * statistics */ static struct { tid_t maxtid; /* 4: biggest tid ever used */ lid_t maxlid; /* 4: biggest lid ever used */ int ntid; /* 4: # of transactions performed */ int nlid; /* 4: # of tlocks acquired */ int waitlock; /* 4: # of tlock wait */ } stattx; /* * forward references */ static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, struct tlock *tlck, struct commit *cd); static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, struct tlock *tlck); static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static void txAllocPMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk); static void txForce(struct tblock * tblk); static void txLog(struct jfs_log *log, struct tblock *tblk, struct commit *cd); static void txUpdateMap(struct tblock * tblk); static void txRelease(struct tblock * tblk); static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static void LogSyncRelease(struct metapage * mp); /* * transaction block/lock management * --------------------------------- */ /* * Get a transaction lock from the free list. If the number in use is * greater than the high water mark, wake up the sync daemon. This should * free some anonymous transaction locks. (TXN_LOCK must be held.) */ static lid_t txLockAlloc(void) { lid_t lid; INCREMENT(TxStat.txLockAlloc); if (!TxAnchor.freelock) { INCREMENT(TxStat.txLockAlloc_freelock); } while (!(lid = TxAnchor.freelock)) TXN_SLEEP(&TxAnchor.freelockwait); TxAnchor.freelock = TxLock[lid].next; HIGHWATERMARK(stattx.maxlid, lid); if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) { jfs_info("txLockAlloc tlocks low"); jfs_tlocks_low = 1; wake_up_process(jfsSyncThread); } return lid; } static void txLockFree(lid_t lid) { TxLock[lid].tid = 0; TxLock[lid].next = TxAnchor.freelock; TxAnchor.freelock = lid; TxAnchor.tlocksInUse--; if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) { jfs_info("txLockFree jfs_tlocks_low no more"); jfs_tlocks_low = 0; TXN_WAKEUP(&TxAnchor.lowlockwait); } TXN_WAKEUP(&TxAnchor.freelockwait); } /* * NAME: txInit() * * FUNCTION: initialize transaction management structures * * RETURN: * * serialization: single thread at jfs_init() */ int txInit(void) { int k, size; struct sysinfo si; /* Set defaults for nTxLock and nTxBlock if unset */ if (nTxLock == -1) { if (nTxBlock == -1) { /* Base default on memory size */ si_meminfo(&si); if (si.totalram > (256 * 1024)) /* 1 GB */ nTxLock = 64 * 1024; else nTxLock = si.totalram >> 2; } else if (nTxBlock > (8 * 1024)) nTxLock = 64 * 1024; else nTxLock = nTxBlock << 3; } if (nTxBlock == -1) nTxBlock = nTxLock >> 3; /* Verify tunable parameters */ if (nTxBlock < 16) nTxBlock = 16; /* No one should set it this low */ if (nTxBlock > 65536) nTxBlock = 65536; if (nTxLock < 256) nTxLock = 256; /* No one should set it this low */ if (nTxLock > 65536) nTxLock = 65536; printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n", nTxBlock, nTxLock); /* * initialize transaction block (tblock) table * * transaction id (tid) = tblock index * tid = 0 is reserved. */ TxLockLWM = (nTxLock * 4) / 10; TxLockHWM = (nTxLock * 7) / 10; TxLockVHWM = (nTxLock * 8) / 10; size = sizeof(struct tblock) * nTxBlock; TxBlock = vmalloc(size); if (TxBlock == NULL) return -ENOMEM; for (k = 1; k < nTxBlock - 1; k++) { TxBlock[k].next = k + 1; init_waitqueue_head(&TxBlock[k].gcwait); init_waitqueue_head(&TxBlock[k].waitor); } TxBlock[k].next = 0; init_waitqueue_head(&TxBlock[k].gcwait); init_waitqueue_head(&TxBlock[k].waitor); TxAnchor.freetid = 1; init_waitqueue_head(&TxAnchor.freewait); stattx.maxtid = 1; /* statistics */ /* * initialize transaction lock (tlock) table * * transaction lock id = tlock index * tlock id = 0 is reserved. */ size = sizeof(struct tlock) * nTxLock; TxLock = vmalloc(size); if (TxLock == NULL) { vfree(TxBlock); return -ENOMEM; } /* initialize tlock table */ for (k = 1; k < nTxLock - 1; k++) TxLock[k].next = k + 1; TxLock[k].next = 0; init_waitqueue_head(&TxAnchor.freelockwait); init_waitqueue_head(&TxAnchor.lowlockwait); TxAnchor.freelock = 1; TxAnchor.tlocksInUse = 0; INIT_LIST_HEAD(&TxAnchor.anon_list); INIT_LIST_HEAD(&TxAnchor.anon_list2); LAZY_LOCK_INIT(); INIT_LIST_HEAD(&TxAnchor.unlock_queue); stattx.maxlid = 1; /* statistics */ return 0; } /* * NAME: txExit() * * FUNCTION: clean up when module is unloaded */ void txExit(void) { vfree(TxLock); TxLock = NULL; vfree(TxBlock); TxBlock = NULL; } /* * NAME: txBegin() * * FUNCTION: start a transaction. * * PARAMETER: sb - superblock * flag - force for nested tx; * * RETURN: tid - transaction id * * note: flag force allows to start tx for nested tx * to prevent deadlock on logsync barrier; */ tid_t txBegin(struct super_block *sb, int flag) { tid_t t; struct tblock *tblk; struct jfs_log *log; jfs_info("txBegin: flag = 0x%x", flag); log = JFS_SBI(sb)->log; if (!log) { jfs_error(sb, "read-only filesystem\n"); return 0; } TXN_LOCK(); INCREMENT(TxStat.txBegin); retry: if (!(flag & COMMIT_FORCE)) { /* * synchronize with logsync barrier */ if (test_bit(log_SYNCBARRIER, &log->flag) || test_bit(log_QUIESCE, &log->flag)) { INCREMENT(TxStat.txBegin_barrier); TXN_SLEEP(&log->syncwait); goto retry; } } if (flag == 0) { /* * Don't begin transaction if we're getting starved for tlocks * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately * free tlocks) */ if (TxAnchor.tlocksInUse > TxLockVHWM) { INCREMENT(TxStat.txBegin_lockslow); TXN_SLEEP(&TxAnchor.lowlockwait); goto retry; } } /* * allocate transaction id/block */ if ((t = TxAnchor.freetid) == 0) { jfs_info("txBegin: waiting for free tid"); INCREMENT(TxStat.txBegin_freetid); TXN_SLEEP(&TxAnchor.freewait); goto retry; } tblk = tid_to_tblock(t); if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) { /* Don't let a non-forced transaction take the last tblk */ jfs_info("txBegin: waiting for free tid"); INCREMENT(TxStat.txBegin_freetid); TXN_SLEEP(&TxAnchor.freewait); goto retry; } TxAnchor.freetid = tblk->next; /* * initialize transaction */ /* * We can't zero the whole thing or we screw up another thread being * awakened after sleeping on tblk->waitor * * memset(tblk, 0, sizeof(struct tblock)); */ tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0; tblk->sb = sb; ++log->logtid; tblk->logtid = log->logtid; ++log->active; HIGHWATERMARK(stattx.maxtid, t); /* statistics */ INCREMENT(stattx.ntid); /* statistics */ TXN_UNLOCK(); jfs_info("txBegin: returning tid = %d", t); return t; } /* * NAME: txBeginAnon() * * FUNCTION: start an anonymous transaction. * Blocks if logsync or available tlocks are low to prevent * anonymous tlocks from depleting supply. * * PARAMETER: sb - superblock * * RETURN: none */ void txBeginAnon(struct super_block *sb) { struct jfs_log *log; log = JFS_SBI(sb)->log; TXN_LOCK(); INCREMENT(TxStat.txBeginAnon); retry: /* * synchronize with logsync barrier */ if (test_bit(log_SYNCBARRIER, &log->flag) || test_bit(log_QUIESCE, &log->flag)) { INCREMENT(TxStat.txBeginAnon_barrier); TXN_SLEEP(&log->syncwait); goto retry; } /* * Don't begin transaction if we're getting starved for tlocks */ if (TxAnchor.tlocksInUse > TxLockVHWM) { INCREMENT(TxStat.txBeginAnon_lockslow); TXN_SLEEP(&TxAnchor.lowlockwait); goto retry; } TXN_UNLOCK(); } /* * txEnd() * * function: free specified transaction block. * * logsync barrier processing: * * serialization: */ void txEnd(tid_t tid) { struct tblock *tblk = tid_to_tblock(tid); struct jfs_log *log; jfs_info("txEnd: tid = %d", tid); TXN_LOCK(); /* * wakeup transactions waiting on the page locked * by the current transaction */ TXN_WAKEUP(&tblk->waitor); log = JFS_SBI(tblk->sb)->log; /* * Lazy commit thread can't free this guy until we mark it UNLOCKED, * otherwise, we would be left with a transaction that may have been * reused. * * Lazy commit thread will turn off tblkGC_LAZY before calling this * routine. */ if (tblk->flag & tblkGC_LAZY) { jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk); TXN_UNLOCK(); spin_lock_irq(&log->gclock); // LOGGC_LOCK tblk->flag |= tblkGC_UNLOCKED; spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK return; } jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk); assert(tblk->next == 0); /* * insert tblock back on freelist */ tblk->next = TxAnchor.freetid; TxAnchor.freetid = tid; /* * mark the tblock not active */ if (--log->active == 0) { clear_bit(log_FLUSH, &log->flag); /* * synchronize with logsync barrier */ if (test_bit(log_SYNCBARRIER, &log->flag)) { TXN_UNLOCK(); /* write dirty metadata & forward log syncpt */ jfs_syncpt(log, 1); jfs_info("log barrier off: 0x%x", log->lsn); /* enable new transactions start */ clear_bit(log_SYNCBARRIER, &log->flag); /* wakeup all waitors for logsync barrier */ TXN_WAKEUP(&log->syncwait); goto wakeup; } } TXN_UNLOCK(); wakeup: /* * wakeup all waitors for a free tblock */ TXN_WAKEUP(&TxAnchor.freewait); } /* * txLock() * * function: acquire a transaction lock on the specified <mp> * * parameter: * * return: transaction lock id * * serialization: */ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, int type) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); int dir_xtree = 0; lid_t lid; tid_t xtid; struct tlock *tlck; struct xtlock *xtlck; struct linelock *linelock; xtpage_t *p; struct tblock *tblk; TXN_LOCK(); if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) && !(mp->xflag & COMMIT_PAGE)) { /* * Directory inode is special. It can have both an xtree tlock * and a dtree tlock associated with it. */ dir_xtree = 1; lid = jfs_ip->xtlid; } else lid = mp->lid; /* is page not locked by a transaction ? */ if (lid == 0) goto allocateLock; jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid); /* is page locked by the requester transaction ? */ tlck = lid_to_tlock(lid); if ((xtid = tlck->tid) == tid) { TXN_UNLOCK(); goto grantLock; } /* * is page locked by anonymous transaction/lock ? * * (page update without transaction (i.e., file write) is * locked under anonymous transaction tid = 0: * anonymous tlocks maintained on anonymous tlock list of * the inode of the page and available to all anonymous * transactions until txCommit() time at which point * they are transferred to the transaction tlock list of * the committing transaction of the inode) */ if (xtid == 0) { tlck->tid = tid; TXN_UNLOCK(); tblk = tid_to_tblock(tid); /* * The order of the tlocks in the transaction is important * (during truncate, child xtree pages must be freed before * parent's tlocks change the working map). * Take tlock off anonymous list and add to tail of * transaction list * * Note: We really need to get rid of the tid & lid and * use list_head's. This code is getting UGLY! */ if (jfs_ip->atlhead == lid) { if (jfs_ip->atltail == lid) { /* only anonymous txn. * Remove from anon_list */ TXN_LOCK(); list_del_init(&jfs_ip->anon_inode_list); TXN_UNLOCK(); } jfs_ip->atlhead = tlck->next; } else { lid_t last; for (last = jfs_ip->atlhead; lid_to_tlock(last)->next != lid; last = lid_to_tlock(last)->next) { assert(last); } lid_to_tlock(last)->next = tlck->next; if (jfs_ip->atltail == lid) jfs_ip->atltail = last; } /* insert the tlock at tail of transaction tlock list */ if (tblk->next) lid_to_tlock(tblk->last)->next = lid; else tblk->next = lid; tlck->next = 0; tblk->last = lid; goto grantLock; } goto waitLock; /* * allocate a tlock */ allocateLock: lid = txLockAlloc(); tlck = lid_to_tlock(lid); /* * initialize tlock */ tlck->tid = tid; TXN_UNLOCK(); /* mark tlock for meta-data page */ if (mp->xflag & COMMIT_PAGE) { tlck->flag = tlckPAGELOCK; /* mark the page dirty and nohomeok */ metapage_nohomeok(mp); jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p", mp, mp->nohomeok, tid, tlck); /* if anonymous transaction, and buffer is on the group * commit synclist, mark inode to show this. This will * prevent the buffer from being marked nohomeok for too * long a time. */ if ((tid == 0) && mp->lsn) set_cflag(COMMIT_Synclist, ip); } /* mark tlock for in-memory inode */ else tlck->flag = tlckINODELOCK; if (S_ISDIR(ip->i_mode)) tlck->flag |= tlckDIRECTORY; tlck->type = 0; /* bind the tlock and the page */ tlck->ip = ip; tlck->mp = mp; if (dir_xtree) jfs_ip->xtlid = lid; else mp->lid = lid; /* * enqueue transaction lock to transaction/inode */ /* insert the tlock at tail of transaction tlock list */ if (tid) { tblk = tid_to_tblock(tid); if (tblk->next) lid_to_tlock(tblk->last)->next = lid; else tblk->next = lid; tlck->next = 0; tblk->last = lid; } /* anonymous transaction: * insert the tlock at head of inode anonymous tlock list */ else { tlck->next = jfs_ip->atlhead; jfs_ip->atlhead = lid; if (tlck->next == 0) { /* This inode's first anonymous transaction */ jfs_ip->atltail = lid; TXN_LOCK(); list_add_tail(&jfs_ip->anon_inode_list, &TxAnchor.anon_list); TXN_UNLOCK(); } } /* initialize type dependent area for linelock */ linelock = (struct linelock *) & tlck->lock; linelock->next = 0; linelock->flag = tlckLINELOCK; linelock->maxcnt = TLOCKSHORT; linelock->index = 0; switch (type & tlckTYPE) { case tlckDTREE: linelock->l2linesize = L2DTSLOTSIZE; break; case tlckXTREE: linelock->l2linesize = L2XTSLOTSIZE; xtlck = (struct xtlock *) linelock; xtlck->header.offset = 0; xtlck->header.length = 2; if (type & tlckNEW) { xtlck->lwm.offset = XTENTRYSTART; } else { if (mp->xflag & COMMIT_PAGE) p = (xtpage_t *) mp->data; else p = (xtpage_t *) &jfs_ip->i_xtroot; xtlck->lwm.offset = le16_to_cpu(p->header.nextindex); } xtlck->lwm.length = 0; /* ! */ xtlck->twm.offset = 0; xtlck->hwm.offset = 0; xtlck->index = 2; break; case tlckINODE: linelock->l2linesize = L2INODESLOTSIZE; break; case tlckDATA: linelock->l2linesize = L2DATASLOTSIZE; break; default: jfs_err("UFO tlock:0x%p", tlck); } /* * update tlock vector */ grantLock: tlck->type |= type; return tlck; /* * page is being locked by another transaction: */ waitLock: /* Only locks on ipimap or ipaimap should reach here */ /* assert(jfs_ip->fileset == AGGREGATE_I); */ if (jfs_ip->fileset != AGGREGATE_I) { printk(KERN_ERR "txLock: trying to lock locked page!"); print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4, ip, sizeof(*ip), 0); print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4, mp, sizeof(*mp), 0); print_hex_dump(KERN_ERR, "Locker's tblock: ", DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid), sizeof(struct tblock), 0); print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4, tlck, sizeof(*tlck), 0); BUG(); } INCREMENT(stattx.waitlock); /* statistics */ TXN_UNLOCK(); release_metapage(mp); TXN_LOCK(); xtid = tlck->tid; /* reacquire after dropping TXN_LOCK */ jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d", tid, xtid, lid); /* Recheck everything since dropping TXN_LOCK */ if (xtid && (tlck->mp == mp) && (mp->lid == lid)) TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor); else TXN_UNLOCK(); jfs_info("txLock: awakened tid = %d, lid = %d", tid, lid); return NULL; } /* * NAME: txRelease() * * FUNCTION: Release buffers associated with transaction locks, but don't * mark homeok yet. The allows other transactions to modify * buffers, but won't let them go to disk until commit record * actually gets written. * * PARAMETER: * tblk - * * RETURN: Errors from subroutines. */ static void txRelease(struct tblock * tblk) { struct metapage *mp; lid_t lid; struct tlock *tlck; TXN_LOCK(); for (lid = tblk->next; lid; lid = tlck->next) { tlck = lid_to_tlock(lid); if ((mp = tlck->mp) != NULL && (tlck->type & tlckBTROOT) == 0) { assert(mp->xflag & COMMIT_PAGE); mp->lid = 0; } } /* * wakeup transactions waiting on a page locked * by the current transaction */ TXN_WAKEUP(&tblk->waitor); TXN_UNLOCK(); } /* * NAME: txUnlock() * * FUNCTION: Initiates pageout of pages modified by tid in journalled * objects and frees their lockwords. */ static void txUnlock(struct tblock * tblk) { struct tlock *tlck; struct linelock *linelock; lid_t lid, next, llid, k; struct metapage *mp; struct jfs_log *log; int difft, diffp; unsigned long flags; jfs_info("txUnlock: tblk = 0x%p", tblk); log = JFS_SBI(tblk->sb)->log; /* * mark page under tlock homeok (its log has been written): */ for (lid = tblk->next; lid; lid = next) { tlck = lid_to_tlock(lid); next = tlck->next; jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck); /* unbind page from tlock */ if ((mp = tlck->mp) != NULL && (tlck->type & tlckBTROOT) == 0) { assert(mp->xflag & COMMIT_PAGE); /* hold buffer */ hold_metapage(mp); assert(mp->nohomeok > 0); _metapage_homeok(mp); /* inherit younger/larger clsn */ LOGSYNC_LOCK(log, flags); if (mp->clsn) { logdiff(difft, tblk->clsn, log); logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; } else mp->clsn = tblk->clsn; LOGSYNC_UNLOCK(log, flags); assert(!(tlck->flag & tlckFREEPAGE)); put_metapage(mp); } /* insert tlock, and linelock(s) of the tlock if any, * at head of freelist */ TXN_LOCK(); llid = ((struct linelock *) & tlck->lock)->next; while (llid) { linelock = (struct linelock *) lid_to_tlock(llid); k = linelock->next; txLockFree(llid); llid = k; } txLockFree(lid); TXN_UNLOCK(); } tblk->next = tblk->last = 0; /* * remove tblock from logsynclist * (allocation map pages inherited lsn of tblk and * has been inserted in logsync list at txUpdateMap()) */ if (tblk->lsn) { LOGSYNC_LOCK(log, flags); log->count--; list_del(&tblk->synclist); LOGSYNC_UNLOCK(log, flags); } } /* * txMaplock() * * function: allocate a transaction lock for freed page/entry; * for freed page, maplock is used as xtlock/dtlock type; */ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); lid_t lid; struct tblock *tblk; struct tlock *tlck; struct maplock *maplock; TXN_LOCK(); /* * allocate a tlock */ lid = txLockAlloc(); tlck = lid_to_tlock(lid); /* * initialize tlock */ tlck->tid = tid; /* bind the tlock and the object */ tlck->flag = tlckINODELOCK; if (S_ISDIR(ip->i_mode)) tlck->flag |= tlckDIRECTORY; tlck->ip = ip; tlck->mp = NULL; tlck->type = type; /* * enqueue transaction lock to transaction/inode */ /* insert the tlock at tail of transaction tlock list */ if (tid) { tblk = tid_to_tblock(tid); if (tblk->next) lid_to_tlock(tblk->last)->next = lid; else tblk->next = lid; tlck->next = 0; tblk->last = lid; } /* anonymous transaction: * insert the tlock at head of inode anonymous tlock list */ else { tlck->next = jfs_ip->atlhead; jfs_ip->atlhead = lid; if (tlck->next == 0) { /* This inode's first anonymous transaction */ jfs_ip->atltail = lid; list_add_tail(&jfs_ip->anon_inode_list, &TxAnchor.anon_list); } } TXN_UNLOCK(); /* initialize type dependent area for maplock */ maplock = (struct maplock *) & tlck->lock; maplock->next = 0; maplock->maxcnt = 0; maplock->index = 0; return tlck; } /* * txLinelock() * * function: allocate a transaction lock for log vector list */ struct linelock *txLinelock(struct linelock * tlock) { lid_t lid; struct tlock *tlck; struct linelock *linelock; TXN_LOCK(); /* allocate a TxLock structure */ lid = txLockAlloc(); tlck = lid_to_tlock(lid); TXN_UNLOCK(); /* initialize linelock */ linelock = (struct linelock *) tlck; linelock->next = 0; linelock->flag = tlckLINELOCK; linelock->maxcnt = TLOCKLONG; linelock->index = 0; if (tlck->flag & tlckDIRECTORY) linelock->flag |= tlckDIRECTORY; /* append linelock after tlock */ linelock->next = tlock->next; tlock->next = lid; return linelock; } /* * transaction commit management * ----------------------------- */ /* * NAME: txCommit() * * FUNCTION: commit the changes to the objects specified in * clist. For journalled segments only the * changes of the caller are committed, ie by tid. * for non-journalled segments the data are flushed to * disk and then the change to the disk inode and indirect * blocks committed (so blocks newly allocated to the * segment will be made a part of the segment atomically). * * all of the segments specified in clist must be in * one file system. no more than 6 segments are needed * to handle all unix svcs. * * if the i_nlink field (i.e. disk inode link count) * is zero, and the type of inode is a regular file or * directory, or symbolic link , the inode is truncated * to zero length. the truncation is committed but the * VM resources are unaffected until it is closed (see * iput and iclose). * * PARAMETER: * * RETURN: * * serialization: * on entry the inode lock on each segment is assumed * to be held. * * i/o error: */ int txCommit(tid_t tid, /* transaction identifier */ int nip, /* number of inodes to commit */ struct inode **iplist, /* list of inode to commit */ int flag) { int rc = 0; struct commit cd; struct jfs_log *log; struct tblock *tblk; struct lrd *lrd; struct inode *ip; struct jfs_inode_info *jfs_ip; int k, n; ino_t top; struct super_block *sb; jfs_info("txCommit, tid = %d, flag = %d", tid, flag); /* is read-only file system ? */ if (isReadOnly(iplist[0])) { rc = -EROFS; goto TheEnd; } sb = cd.sb = iplist[0]->i_sb; cd.tid = tid; if (tid == 0) tid = txBegin(sb, 0); tblk = tid_to_tblock(tid); /* * initialize commit structure */ log = JFS_SBI(sb)->log; cd.log = log; /* initialize log record descriptor in commit */ lrd = &cd.lrd; lrd->logtid = cpu_to_le32(tblk->logtid); lrd->backchain = 0; tblk->xflag |= flag; if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) tblk->xflag |= COMMIT_LAZY; /* * prepare non-journaled objects for commit * * flush data pages of non-journaled file * to prevent the file getting non-initialized disk blocks * in case of crash. * (new blocks - ) */ cd.iplist = iplist; cd.nip = nip; /* * acquire transaction lock on (on-disk) inodes * * update on-disk inode from in-memory inode * acquiring transaction locks for AFTER records * on the on-disk inode of file object * * sort the inodes array by inode number in descending order * to prevent deadlock when acquiring transaction lock * of on-disk inodes on multiple on-disk inode pages by * multiple concurrent transactions */ for (k = 0; k < cd.nip; k++) { top = (cd.iplist[k])->i_ino; for (n = k + 1; n < cd.nip; n++) { ip = cd.iplist[n]; if (ip->i_ino > top) { top = ip->i_ino; cd.iplist[n] = cd.iplist[k]; cd.iplist[k] = ip; } } ip = cd.iplist[k]; jfs_ip = JFS_IP(ip); /* * BUGBUG - This code has temporarily been removed. The * intent is to ensure that any file data is written before * the metadata is committed to the journal. This prevents * uninitialized data from appearing in a file after the * journal has been replayed. (The uninitialized data * could be sensitive data removed by another user.) * * The problem now is that we are holding the IWRITELOCK * on the inode, and calling filemap_fdatawrite on an * unmapped page will cause a deadlock in jfs_get_block. * * The long term solution is to pare down the use of * IWRITELOCK. We are currently holding it too long. * We could also be smarter about which data pages need * to be written before the transaction is committed and * when we don't need to worry about it at all. * * if ((!S_ISDIR(ip->i_mode)) * && (tblk->flag & COMMIT_DELETE) == 0) * filemap_write_and_wait(ip->i_mapping); */ /* * Mark inode as not dirty. It will still be on the dirty * inode list, but we'll know not to commit it again unless * it gets marked dirty again */ clear_cflag(COMMIT_Dirty, ip); /* inherit anonymous tlock(s) of inode */ if (jfs_ip->atlhead) { lid_to_tlock(jfs_ip->atltail)->next = tblk->next; tblk->next = jfs_ip->atlhead; if (!tblk->last) tblk->last = jfs_ip->atltail; jfs_ip->atlhead = jfs_ip->atltail = 0; TXN_LOCK(); list_del_init(&jfs_ip->anon_inode_list); TXN_UNLOCK(); } /* * acquire transaction lock on on-disk inode page * (become first tlock of the tblk's tlock list) */ if (((rc = diWrite(tid, ip)))) goto out; } /* * write log records from transaction locks * * txUpdateMap() resets XAD_NEW in XAD. */ txLog(log, tblk, &cd); /* * Ensure that inode isn't reused before * lazy commit thread finishes processing */ if (tblk->xflag & COMMIT_DELETE) { ihold(tblk->u.ip); /* * Avoid a rare deadlock * * If the inode is locked, we may be blocked in * jfs_commit_inode. If so, we don't want the * lazy_commit thread doing the last iput() on the inode * since that may block on the locked inode. Instead, * commit the transaction synchronously, so the last iput * will be done by the calling thread (or later) */ /* * I believe this code is no longer needed. Splitting I_LOCK * into two bits, I_NEW and I_SYNC should prevent this * deadlock as well. But since I don't have a JFS testload * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. * Joern */ if (tblk->u.ip->i_state & I_SYNC) tblk->xflag &= ~COMMIT_LAZY; } ASSERT((!(tblk->xflag & COMMIT_DELETE)) || ((tblk->u.ip->i_nlink == 0) && !test_cflag(COMMIT_Nolink, tblk->u.ip))); /* * write COMMIT log record */ lrd->type = cpu_to_le16(LOG_COMMIT); lrd->length = 0; lmLog(log, tblk, lrd, NULL); lmGroupCommit(log, tblk); /* * - transaction is now committed - */ /* * force pages in careful update * (imap addressing structure update) */ if (flag & COMMIT_FORCE) txForce(tblk); /* * update allocation map. * * update inode allocation map and inode: * free pager lock on memory object of inode if any. * update block allocation map. * * txUpdateMap() resets XAD_NEW in XAD. */ if (tblk->xflag & COMMIT_FORCE) txUpdateMap(tblk); /* * free transaction locks and pageout/free pages */ txRelease(tblk); if ((tblk->flag & tblkGC_LAZY) == 0) txUnlock(tblk); /* * reset in-memory object state */ for (k = 0; k < cd.nip; k++) { ip = cd.iplist[k]; jfs_ip = JFS_IP(ip); /* * reset in-memory inode state */ jfs_ip->bxflag = 0; jfs_ip->blid = 0; } out: if (rc != 0) txAbort(tid, 1); TheEnd: jfs_info("txCommit: tid = %d, returning %d", tid, rc); return rc; } /* * NAME: txLog() * * FUNCTION: Writes AFTER log records for all lines modified * by tid for segments specified by inodes in comdata. * Code assumes only WRITELOCKS are recorded in lockwords. * * PARAMETERS: * * RETURN : */ static void txLog(struct jfs_log *log, struct tblock *tblk, struct commit *cd) { struct inode *ip; lid_t lid; struct tlock *tlck; struct lrd *lrd = &cd->lrd; /* * write log record(s) for each tlock of transaction, */ for (lid = tblk->next; lid; lid = tlck->next) { tlck = lid_to_tlock(lid); tlck->flag |= tlckLOG; /* initialize lrd common */ ip = tlck->ip; lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate); lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset); lrd->log.redopage.inode = cpu_to_le32(ip->i_ino); /* write log record of page from the tlock */ switch (tlck->type & tlckTYPE) { case tlckXTREE: xtLog(log, tblk, lrd, tlck); break; case tlckDTREE: dtLog(log, tblk, lrd, tlck); break; case tlckINODE: diLog(log, tblk, lrd, tlck, cd); break; case tlckMAP: mapLog(log, tblk, lrd, tlck); break; case tlckDATA: dataLog(log, tblk, lrd, tlck); break; default: jfs_err("UFO tlock:0x%p", tlck); } } return; } /* * diLog() * * function: log inode tlock and format maplock to update bmap; */ static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, struct tlock *tlck, struct commit *cd) { struct metapage *mp; pxd_t *pxd; struct pxd_lock *pxdlock; mp = tlck->mp; /* initialize as REDOPAGE record format */ lrd->log.redopage.type = cpu_to_le16(LOG_INODE); lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE); pxd = &lrd->log.redopage.pxd; /* * inode after image */ if (tlck->type & tlckENTRY) { /* log after-image for logredo(): */ lrd->type = cpu_to_le16(LOG_REDOPAGE); PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; } else if (tlck->type & tlckFREE) { /* * free inode extent * * (pages of the freed inode extent have been invalidated and * a maplock for free of the extent has been formatted at * txLock() time); * * the tlock had been acquired on the inode allocation map page * (iag) that specifies the freed extent, even though the map * page is not itself logged, to prevent pageout of the map * page before the log; */ /* log LOG_NOREDOINOEXT of the freed inode extent for * logredo() to start NoRedoPage filters, and to update * imap and bmap for free of the extent; */ lrd->type = cpu_to_le16(LOG_NOREDOINOEXT); /* * For the LOG_NOREDOINOEXT record, we need * to pass the IAG number and inode extent * index (within that IAG) from which the * extent is being released. These have been * passed to us in the iplist[1] and iplist[2]. */ lrd->log.noredoinoext.iagnum = cpu_to_le32((u32) (size_t) cd->iplist[1]); lrd->log.noredoinoext.inoext_idx = cpu_to_le32((u32) (size_t) cd->iplist[2]); pxdlock = (struct pxd_lock *) & tlck->lock; *pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); /* update bmap */ tlck->flag |= tlckUPDATEMAP; /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; } else jfs_err("diLog: UFO type tlck:0x%p", tlck); return; } /* * dataLog() * * function: log data tlock */ static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, struct tlock *tlck) { struct metapage *mp; pxd_t *pxd; mp = tlck->mp; /* initialize as REDOPAGE record format */ lrd->log.redopage.type = cpu_to_le16(LOG_DATA); lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE); pxd = &lrd->log.redopage.pxd; /* log after-image for logredo(): */ lrd->type = cpu_to_le16(LOG_REDOPAGE); if (jfs_dirtable_inline(tlck->ip)) { /* * The table has been truncated, we've must have deleted * the last entry, so don't bother logging this */ mp->lid = 0; grab_metapage(mp); metapage_homeok(mp); discard_metapage(mp); tlck->mp = NULL; return; } PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; return; } /* * dtLog() * * function: log dtree tlock and format maplock to update bmap; */ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { struct metapage *mp; struct pxd_lock *pxdlock; pxd_t *pxd; mp = tlck->mp; /* initialize as REDOPAGE/NOREDOPAGE record format */ lrd->log.redopage.type = cpu_to_le16(LOG_DTREE); lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE); pxd = &lrd->log.redopage.pxd; if (tlck->type & tlckBTROOT) lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); /* * page extension via relocation: entry insertion; * page extension in-place: entry insertion; * new right page from page split, reinitialized in-line * root from root page split: entry insertion; */ if (tlck->type & (tlckNEW | tlckEXTEND)) { /* log after-image of the new page for logredo(): * mark log (LOG_NEW) for logredo() to initialize * freelist and update bmap for alloc of the new page; */ lrd->type = cpu_to_le16(LOG_REDOPAGE); if (tlck->type & tlckEXTEND) lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND); else lrd->log.redopage.type |= cpu_to_le16(LOG_NEW); PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* format a maplock for txUpdateMap() to update bPMAP for * alloc of the new page; */ if (tlck->type & tlckBTROOT) return; tlck->flag |= tlckUPDATEMAP; pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckALLOCPXD; pxdlock->pxd = *pxd; pxdlock->index = 1; /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; return; } /* * entry insertion/deletion, * sibling page link update (old right page before split); */ if (tlck->type & (tlckENTRY | tlckRELINK)) { /* log after-image for logredo(): */ lrd->type = cpu_to_le16(LOG_REDOPAGE); PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; return; } /* * page deletion: page has been invalidated * page relocation: source extent * * a maplock for free of the page has been formatted * at txLock() time); */ if (tlck->type & (tlckFREE | tlckRELOCATE)) { /* log LOG_NOREDOPAGE of the deleted page for logredo() * to start NoRedoPage filter and to update bmap for free * of the deletd page */ lrd->type = cpu_to_le16(LOG_NOREDOPAGE); pxdlock = (struct pxd_lock *) & tlck->lock; *pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); /* a maplock for txUpdateMap() for free of the page * has been formatted at txLock() time; */ tlck->flag |= tlckUPDATEMAP; } return; } /* * xtLog() * * function: log xtree tlock and format maplock to update bmap; */ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { struct inode *ip; struct metapage *mp; xtpage_t *p; struct xtlock *xtlck; struct maplock *maplock; struct xdlistlock *xadlock; struct pxd_lock *pxdlock; pxd_t *page_pxd; int next, lwm, hwm; ip = tlck->ip; mp = tlck->mp; /* initialize as REDOPAGE/NOREDOPAGE record format */ lrd->log.redopage.type = cpu_to_le16(LOG_XTREE); lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE); page_pxd = &lrd->log.redopage.pxd; if (tlck->type & tlckBTROOT) { lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); p = (xtpage_t *) &JFS_IP(ip)->i_xtroot; if (S_ISDIR(ip->i_mode)) lrd->log.redopage.type |= cpu_to_le16(LOG_DIR_XTREE); } else p = (xtpage_t *) mp->data; next = le16_to_cpu(p->header.nextindex); xtlck = (struct xtlock *) & tlck->lock; maplock = (struct maplock *) & tlck->lock; xadlock = (struct xdlistlock *) maplock; /* * entry insertion/extension; * sibling page link update (old right page before split); */ if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { /* log after-image for logredo(): * logredo() will update bmap for alloc of new/extended * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from * after-image of XADlist; * logredo() resets (XAD_NEW|XAD_EXTEND) flag when * applying the after-image to the meta-data page. */ lrd->type = cpu_to_le16(LOG_REDOPAGE); PXDaddress(page_pxd, mp->index); PXDlength(page_pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* format a maplock for txUpdateMap() to update bPMAP * for alloc of new/extended extents of XAD[lwm:next) * from the page itself; * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. */ lwm = xtlck->lwm.offset; if (lwm == 0) lwm = XTPAGEMAXSLOT; if (lwm == next) goto out; if (lwm > next) { jfs_err("xtLog: lwm > next"); goto out; } tlck->flag |= tlckUPDATEMAP; xadlock->flag = mlckALLOCXADLIST; xadlock->count = next - lwm; if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { int i; pxd_t *pxd; /* * Lazy commit may allow xtree to be modified before * txUpdateMap runs. Copy xad into linelock to * preserve correct data. * * We can fit twice as may pxd's as xads in the lock */ xadlock->flag = mlckALLOCPXDLIST; pxd = xadlock->xdlist = &xtlck->pxdlock; for (i = 0; i < xadlock->count; i++) { PXDaddress(pxd, addressXAD(&p->xad[lwm + i])); PXDlength(pxd, lengthXAD(&p->xad[lwm + i])); p->xad[lwm + i].flag &= ~(XAD_NEW | XAD_EXTENDED); pxd++; } } else { /* * xdlist will point to into inode's xtree, ensure * that transaction is not committed lazily. */ xadlock->flag = mlckALLOCXADLIST; xadlock->xdlist = &p->xad[lwm]; tblk->xflag &= ~COMMIT_LAZY; } jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d count:%d", tlck->ip, mp, tlck, lwm, xadlock->count); maplock->index = 1; out: /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; return; } /* * page deletion: file deletion/truncation (ref. xtTruncate()) * * (page will be invalidated after log is written and bmap * is updated from the page); */ if (tlck->type & tlckFREE) { /* LOG_NOREDOPAGE log for NoRedoPage filter: * if page free from file delete, NoRedoFile filter from * inode image of zero link count will subsume NoRedoPage * filters for each page; * if page free from file truncattion, write NoRedoPage * filter; * * upadte of block allocation map for the page itself: * if page free from deletion and truncation, LOG_UPDATEMAP * log for the page itself is generated from processing * its parent page xad entries; */ /* if page free from file truncation, log LOG_NOREDOPAGE * of the deleted page for logredo() to start NoRedoPage * filter for the page; */ if (tblk->xflag & COMMIT_TRUNCATE) { /* write NOREDOPAGE for the page */ lrd->type = cpu_to_le16(LOG_NOREDOPAGE); PXDaddress(page_pxd, mp->index); PXDlength(page_pxd, mp->logical_size >> tblk->sb-> s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); if (tlck->type & tlckBTROOT) { /* Empty xtree must be logged */ lrd->type = cpu_to_le16(LOG_REDOPAGE); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); } } /* init LOG_UPDATEMAP of the freed extents * XAD[XTENTRYSTART:hwm) from the deleted page itself * for logredo() to update bmap; */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); xtlck = (struct xtlock *) & tlck->lock; hwm = xtlck->hwm.offset; lrd->log.updatemap.nxd = cpu_to_le16(hwm - XTENTRYSTART + 1); /* reformat linelock for lmLog() */ xtlck->header.offset = XTENTRYSTART; xtlck->header.length = hwm - XTENTRYSTART + 1; xtlck->index = 1; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* format a maplock for txUpdateMap() to update bmap * to free extents of XAD[XTENTRYSTART:hwm) from the * deleted page itself; */ tlck->flag |= tlckUPDATEMAP; xadlock->count = hwm - XTENTRYSTART + 1; if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { int i; pxd_t *pxd; /* * Lazy commit may allow xtree to be modified before * txUpdateMap runs. Copy xad into linelock to * preserve correct data. * * We can fit twice as may pxd's as xads in the lock */ xadlock->flag = mlckFREEPXDLIST; pxd = xadlock->xdlist = &xtlck->pxdlock; for (i = 0; i < xadlock->count; i++) { PXDaddress(pxd, addressXAD(&p->xad[XTENTRYSTART + i])); PXDlength(pxd, lengthXAD(&p->xad[XTENTRYSTART + i])); pxd++; } } else { /* * xdlist will point to into inode's xtree, ensure * that transaction is not committed lazily. */ xadlock->flag = mlckFREEXADLIST; xadlock->xdlist = &p->xad[XTENTRYSTART]; tblk->xflag &= ~COMMIT_LAZY; } jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2", tlck->ip, mp, xadlock->count); maplock->index = 1; /* mark page as invalid */ if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode)) && !(tlck->type & tlckBTROOT)) tlck->flag |= tlckFREEPAGE; /* else (tblk->xflag & COMMIT_PMAP) ? release the page; */ return; } /* * page/entry truncation: file truncation (ref. xtTruncate()) * * |----------+------+------+---------------| * | | | * | | hwm - hwm before truncation * | next - truncation point * lwm - lwm before truncation * header ? */ if (tlck->type & tlckTRUNCATE) { pxd_t pxd; /* truncated extent of xad */ int twm; /* * For truncation the entire linelock may be used, so it would * be difficult to store xad list in linelock itself. * Therefore, we'll just force transaction to be committed * synchronously, so that xtree pages won't be changed before * txUpdateMap runs. */ tblk->xflag &= ~COMMIT_LAZY; lwm = xtlck->lwm.offset; if (lwm == 0) lwm = XTPAGEMAXSLOT; hwm = xtlck->hwm.offset; twm = xtlck->twm.offset; /* * write log records */ /* log after-image for logredo(): * * logredo() will update bmap for alloc of new/extended * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from * after-image of XADlist; * logredo() resets (XAD_NEW|XAD_EXTEND) flag when * applying the after-image to the meta-data page. */ lrd->type = cpu_to_le16(LOG_REDOPAGE); PXDaddress(page_pxd, mp->index); PXDlength(page_pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); /* * truncate entry XAD[twm == next - 1]: */ if (twm == next - 1) { /* init LOG_UPDATEMAP for logredo() to update bmap for * free of truncated delta extent of the truncated * entry XAD[next - 1]: * (xtlck->pxdlock = truncated delta extent); */ pxdlock = (struct pxd_lock *) & xtlck->pxdlock; /* assert(pxdlock->type & tlckTRUNCATE); */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); lrd->log.updatemap.nxd = cpu_to_le16(1); lrd->log.updatemap.pxd = pxdlock->pxd; pxd = pxdlock->pxd; /* save to format maplock */ lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); } /* * free entries XAD[next:hwm]: */ if (hwm >= next) { /* init LOG_UPDATEMAP of the freed extents * XAD[next:hwm] from the deleted page itself * for logredo() to update bmap; */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); xtlck = (struct xtlock *) & tlck->lock; hwm = xtlck->hwm.offset; lrd->log.updatemap.nxd = cpu_to_le16(hwm - next + 1); /* reformat linelock for lmLog() */ xtlck->header.offset = next; xtlck->header.length = hwm - next + 1; xtlck->index = 1; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); } /* * format maplock(s) for txUpdateMap() to update bmap */ maplock->index = 0; /* * allocate entries XAD[lwm:next): */ if (lwm < next) { /* format a maplock for txUpdateMap() to update bPMAP * for alloc of new/extended extents of XAD[lwm:next) * from the page itself; * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. */ tlck->flag |= tlckUPDATEMAP; xadlock->flag = mlckALLOCXADLIST; xadlock->count = next - lwm; xadlock->xdlist = &p->xad[lwm]; jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d lwm:%d next:%d", tlck->ip, mp, xadlock->count, lwm, next); maplock->index++; xadlock++; } /* * truncate entry XAD[twm == next - 1]: */ if (twm == next - 1) { /* format a maplock for txUpdateMap() to update bmap * to free truncated delta extent of the truncated * entry XAD[next - 1]; * (xtlck->pxdlock = truncated delta extent); */ tlck->flag |= tlckUPDATEMAP; pxdlock = (struct pxd_lock *) xadlock; pxdlock->flag = mlckFREEPXD; pxdlock->count = 1; pxdlock->pxd = pxd; jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d hwm:%d", ip, mp, pxdlock->count, hwm); maplock->index++; xadlock++; } /* * free entries XAD[next:hwm]: */ if (hwm >= next) { /* format a maplock for txUpdateMap() to update bmap * to free extents of XAD[next:hwm] from thedeleted * page itself; */ tlck->flag |= tlckUPDATEMAP; xadlock->flag = mlckFREEXADLIST; xadlock->count = hwm - next + 1; xadlock->xdlist = &p->xad[next]; jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d next:%d hwm:%d", tlck->ip, mp, xadlock->count, next, hwm); maplock->index++; } /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; } return; } /* * mapLog() * * function: log from maplock of freed data extents; */ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck) { struct pxd_lock *pxdlock; int i, nlock; pxd_t *pxd; /* * page relocation: free the source page extent * * a maplock for txUpdateMap() for free of the page * has been formatted at txLock() time saving the src * relocated page address; */ if (tlck->type & tlckRELOCATE) { /* log LOG_NOREDOPAGE of the old relocated page * for logredo() to start NoRedoPage filter; */ lrd->type = cpu_to_le16(LOG_NOREDOPAGE); pxdlock = (struct pxd_lock *) & tlck->lock; pxd = &lrd->log.redopage.pxd; *pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); /* (N.B. currently, logredo() does NOT update bmap * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE); * if page free from relocation, LOG_UPDATEMAP log is * specifically generated now for logredo() * to update bmap for free of src relocated page; * (new flag LOG_RELOCATE may be introduced which will * inform logredo() to start NORedoPage filter and also * update block allocation map at the same time, thus * avoiding an extra log write); */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); lrd->log.updatemap.nxd = cpu_to_le16(1); lrd->log.updatemap.pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); /* a maplock for txUpdateMap() for free of the page * has been formatted at txLock() time; */ tlck->flag |= tlckUPDATEMAP; return; } /* * Otherwise it's not a relocate request * */ else { /* log LOG_UPDATEMAP for logredo() to update bmap for * free of truncated/relocated delta extent of the data; * e.g.: external EA extent, relocated/truncated extent * from xtTailgate(); */ lrd->type = cpu_to_le16(LOG_UPDATEMAP); pxdlock = (struct pxd_lock *) & tlck->lock; nlock = pxdlock->index; for (i = 0; i < nlock; i++, pxdlock++) { if (pxdlock->flag & mlckALLOCPXD) lrd->log.updatemap.type = cpu_to_le16(LOG_ALLOCPXD); else lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); lrd->log.updatemap.nxd = cpu_to_le16(1); lrd->log.updatemap.pxd = pxdlock->pxd; lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); jfs_info("mapLog: xaddr:0x%lx xlen:0x%x", (ulong) addressPXD(&pxdlock->pxd), lengthPXD(&pxdlock->pxd)); } /* update bmap */ tlck->flag |= tlckUPDATEMAP; } } /* * txEA() * * function: acquire maplock for EA/ACL extents or * set COMMIT_INLINE flag; */ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) { struct tlock *tlck = NULL; struct pxd_lock *maplock = NULL, *pxdlock = NULL; /* * format maplock for alloc of new EA extent */ if (newea) { /* Since the newea could be a completely zeroed entry we need to * check for the two flags which indicate we should actually * commit new EA data */ if (newea->flag & DXD_EXTENT) { tlck = txMaplock(tid, ip, tlckMAP); maplock = (struct pxd_lock *) & tlck->lock; pxdlock = (struct pxd_lock *) maplock; pxdlock->flag = mlckALLOCPXD; PXDaddress(&pxdlock->pxd, addressDXD(newea)); PXDlength(&pxdlock->pxd, lengthDXD(newea)); pxdlock++; maplock->index = 1; } else if (newea->flag & DXD_INLINE) { tlck = NULL; set_cflag(COMMIT_Inlineea, ip); } } /* * format maplock for free of old EA extent */ if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) { if (tlck == NULL) { tlck = txMaplock(tid, ip, tlckMAP); maplock = (struct pxd_lock *) & tlck->lock; pxdlock = (struct pxd_lock *) maplock; maplock->index = 0; } pxdlock->flag = mlckFREEPXD; PXDaddress(&pxdlock->pxd, addressDXD(oldea)); PXDlength(&pxdlock->pxd, lengthDXD(oldea)); maplock->index++; } } /* * txForce() * * function: synchronously write pages locked by transaction * after txLog() but before txUpdateMap(); */ static void txForce(struct tblock * tblk) { struct tlock *tlck; lid_t lid, next; struct metapage *mp; /* * reverse the order of transaction tlocks in * careful update order of address index pages * (right to left, bottom up) */ tlck = lid_to_tlock(tblk->next); lid = tlck->next; tlck->next = 0; while (lid) { tlck = lid_to_tlock(lid); next = tlck->next; tlck->next = tblk->next; tblk->next = lid; lid = next; } /* * synchronously write the page, and * hold the page for txUpdateMap(); */ for (lid = tblk->next; lid; lid = next) { tlck = lid_to_tlock(lid); next = tlck->next; if ((mp = tlck->mp) != NULL && (tlck->type & tlckBTROOT) == 0) { assert(mp->xflag & COMMIT_PAGE); if (tlck->flag & tlckWRITEPAGE) { tlck->flag &= ~tlckWRITEPAGE; /* do not release page to freelist */ force_metapage(mp); #if 0 /* * The "right" thing to do here is to * synchronously write the metadata. * With the current implementation this * is hard since write_metapage requires * us to kunmap & remap the page. If we * have tlocks pointing into the metadata * pages, we don't want to do this. I think * we can get by with synchronously writing * the pages when they are released. */ assert(mp->nohomeok); set_bit(META_dirty, &mp->flag); set_bit(META_sync, &mp->flag); #endif } } } } /* * txUpdateMap() * * function: update persistent allocation map (and working map * if appropriate); * * parameter: */ static void txUpdateMap(struct tblock * tblk) { struct inode *ip; struct inode *ipimap; lid_t lid; struct tlock *tlck; struct maplock *maplock; struct pxd_lock pxdlock; int maptype; int k, nlock; struct metapage *mp = NULL; ipimap = JFS_SBI(tblk->sb)->ipimap; maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP; /* * update block allocation map * * update allocation state in pmap (and wmap) and * update lsn of the pmap page; */ /* * scan each tlock/page of transaction for block allocation/free: * * for each tlock/page of transaction, update map. * ? are there tlock for pmap and pwmap at the same time ? */ for (lid = tblk->next; lid; lid = tlck->next) { tlck = lid_to_tlock(lid); if ((tlck->flag & tlckUPDATEMAP) == 0) continue; if (tlck->flag & tlckFREEPAGE) { /* * Another thread may attempt to reuse freed space * immediately, so we want to get rid of the metapage * before anyone else has a chance to get it. * Lock metapage, update maps, then invalidate * the metapage. */ mp = tlck->mp; ASSERT(mp->xflag & COMMIT_PAGE); grab_metapage(mp); } /* * extent list: * . in-line PXD list: * . out-of-line XAD list: */ maplock = (struct maplock *) & tlck->lock; nlock = maplock->index; for (k = 0; k < nlock; k++, maplock++) { /* * allocate blocks in persistent map: * * blocks have been allocated from wmap at alloc time; */ if (maplock->flag & mlckALLOC) { txAllocPMap(ipimap, maplock, tblk); } /* * free blocks in persistent and working map: * blocks will be freed in pmap and then in wmap; * * ? tblock specifies the PMAP/PWMAP based upon * transaction * * free blocks in persistent map: * blocks will be freed from wmap at last reference * release of the object for regular files; * * Alway free blocks from both persistent & working * maps for directories */ else { /* (maplock->flag & mlckFREE) */ if (tlck->flag & tlckDIRECTORY) txFreeMap(ipimap, maplock, tblk, COMMIT_PWMAP); else txFreeMap(ipimap, maplock, tblk, maptype); } } if (tlck->flag & tlckFREEPAGE) { if (!(tblk->flag & tblkGC_LAZY)) { /* This is equivalent to txRelease */ ASSERT(mp->lid == lid); tlck->mp->lid = 0; } assert(mp->nohomeok == 1); metapage_homeok(mp); discard_metapage(mp); tlck->mp = NULL; } } /* * update inode allocation map * * update allocation state in pmap and * update lsn of the pmap page; * update in-memory inode flag/state * * unlock mapper/write lock */ if (tblk->xflag & COMMIT_CREATE) { diUpdatePMap(ipimap, tblk->ino, false, tblk); /* update persistent block allocation map * for the allocation of inode extent; */ pxdlock.flag = mlckALLOCPXD; pxdlock.pxd = tblk->u.ixpxd; pxdlock.index = 1; txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk); } else if (tblk->xflag & COMMIT_DELETE) { ip = tblk->u.ip; diUpdatePMap(ipimap, ip->i_ino, true, tblk); iput(ip); } } /* * txAllocPMap() * * function: allocate from persistent map; * * parameter: * ipbmap - * malock - * xad list: * pxd: * * maptype - * allocate from persistent map; * free from persistent map; * (e.g., tmp file - free from working map at releae * of last reference); * free from persistent and working map; * * lsn - log sequence number; */ static void txAllocPMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk) { struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; struct xdlistlock *xadlistlock; xad_t *xad; s64 xaddr; int xlen; struct pxd_lock *pxdlock; struct xdlistlock *pxdlistlock; pxd_t *pxd; int n; /* * allocate from persistent map; */ if (maplock->flag & mlckALLOCXADLIST) { xadlistlock = (struct xdlistlock *) maplock; xad = xadlistlock->xdlist; for (n = 0; n < xadlistlock->count; n++, xad++) { if (xad->flag & (XAD_NEW | XAD_EXTENDED)) { xaddr = addressXAD(xad); xlen = lengthXAD(xad); dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); xad->flag &= ~(XAD_NEW | XAD_EXTENDED); jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } else if (maplock->flag & mlckALLOCPXD) { pxdlock = (struct pxd_lock *) maplock; xaddr = addressPXD(&pxdlock->pxd); xlen = lengthPXD(&pxdlock->pxd); dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } else { /* (maplock->flag & mlckALLOCPXDLIST) */ pxdlistlock = (struct xdlistlock *) maplock; pxd = pxdlistlock->xdlist; for (n = 0; n < pxdlistlock->count; n++, pxd++) { xaddr = addressPXD(pxd); xlen = lengthPXD(pxd); dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } /* * txFreeMap() * * function: free from persistent and/or working map; * * todo: optimization */ void txFreeMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk, int maptype) { struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; struct xdlistlock *xadlistlock; xad_t *xad; s64 xaddr; int xlen; struct pxd_lock *pxdlock; struct xdlistlock *pxdlistlock; pxd_t *pxd; int n; jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x", tblk, maplock, maptype); /* * free from persistent map; */ if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) { if (maplock->flag & mlckFREEXADLIST) { xadlistlock = (struct xdlistlock *) maplock; xad = xadlistlock->xdlist; for (n = 0; n < xadlistlock->count; n++, xad++) { if (!(xad->flag & XAD_NEW)) { xaddr = addressXAD(xad); xlen = lengthXAD(xad); dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, tblk); jfs_info("freePMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } else if (maplock->flag & mlckFREEPXD) { pxdlock = (struct pxd_lock *) maplock; xaddr = addressPXD(&pxdlock->pxd); xlen = lengthPXD(&pxdlock->pxd); dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, tblk); jfs_info("freePMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } else { /* (maplock->flag & mlckALLOCPXDLIST) */ pxdlistlock = (struct xdlistlock *) maplock; pxd = pxdlistlock->xdlist; for (n = 0; n < pxdlistlock->count; n++, pxd++) { xaddr = addressPXD(pxd); xlen = lengthPXD(pxd); dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, tblk); jfs_info("freePMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } /* * free from working map; */ if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) { if (maplock->flag & mlckFREEXADLIST) { xadlistlock = (struct xdlistlock *) maplock; xad = xadlistlock->xdlist; for (n = 0; n < xadlistlock->count; n++, xad++) { xaddr = addressXAD(xad); xlen = lengthXAD(xad); dbFree(ip, xaddr, (s64) xlen); xad->flag = 0; jfs_info("freeWMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } else if (maplock->flag & mlckFREEPXD) { pxdlock = (struct pxd_lock *) maplock; xaddr = addressPXD(&pxdlock->pxd); xlen = lengthPXD(&pxdlock->pxd); dbFree(ip, xaddr, (s64) xlen); jfs_info("freeWMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } else { /* (maplock->flag & mlckFREEPXDLIST) */ pxdlistlock = (struct xdlistlock *) maplock; pxd = pxdlistlock->xdlist; for (n = 0; n < pxdlistlock->count; n++, pxd++) { xaddr = addressPXD(pxd); xlen = lengthPXD(pxd); dbFree(ip, xaddr, (s64) xlen); jfs_info("freeWMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); } } } } /* * txFreelock() * * function: remove tlock from inode anonymous locklist */ void txFreelock(struct inode *ip) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); struct tlock *xtlck, *tlck; lid_t xlid = 0, lid; if (!jfs_ip->atlhead) return; TXN_LOCK(); xtlck = (struct tlock *) &jfs_ip->atlhead; while ((lid = xtlck->next) != 0) { tlck = lid_to_tlock(lid); if (tlck->flag & tlckFREELOCK) { xtlck->next = tlck->next; txLockFree(lid); } else { xtlck = tlck; xlid = lid; } } if (jfs_ip->atlhead) jfs_ip->atltail = xlid; else { jfs_ip->atltail = 0; /* * If inode was on anon_list, remove it */ list_del_init(&jfs_ip->anon_inode_list); } TXN_UNLOCK(); } /* * txAbort() * * function: abort tx before commit; * * frees line-locks and segment locks for all * segments in comdata structure. * Optionally sets state of file-system to FM_DIRTY in super-block. * log age of page-frames in memory for which caller has * are reset to 0 (to avoid logwarap). */ void txAbort(tid_t tid, int dirty) { lid_t lid, next; struct metapage *mp; struct tblock *tblk = tid_to_tblock(tid); struct tlock *tlck; /* * free tlocks of the transaction */ for (lid = tblk->next; lid; lid = next) { tlck = lid_to_tlock(lid); next = tlck->next; mp = tlck->mp; JFS_IP(tlck->ip)->xtlid = 0; if (mp) { mp->lid = 0; /* * reset lsn of page to avoid logwarap: * * (page may have been previously committed by another * transaction(s) but has not been paged, i.e., * it may be on logsync list even though it has not * been logged for the current tx.) */ if (mp->xflag & COMMIT_PAGE && mp->lsn) LogSyncRelease(mp); } /* insert tlock at head of freelist */ TXN_LOCK(); txLockFree(lid); TXN_UNLOCK(); } /* caller will free the transaction block */ tblk->next = tblk->last = 0; /* * mark filesystem dirty */ if (dirty) jfs_error(tblk->sb, "\n"); return; } /* * txLazyCommit(void) * * All transactions except those changing ipimap (COMMIT_FORCE) are * processed by this routine. This insures that the inode and block * allocation maps are updated in order. For synchronous transactions, * let the user thread finish processing after txUpdateMap() is called. */ static void txLazyCommit(struct tblock * tblk) { struct jfs_log *log; while (((tblk->flag & tblkGC_READY) == 0) && ((tblk->flag & tblkGC_UNLOCKED) == 0)) { /* We must have gotten ahead of the user thread */ jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk); yield(); } jfs_info("txLazyCommit: processing tblk 0x%p", tblk); txUpdateMap(tblk); log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; spin_lock_irq(&log->gclock); // LOGGC_LOCK tblk->flag |= tblkGC_COMMITTED; if (tblk->flag & tblkGC_READY) log->gcrtc--; wake_up_all(&tblk->gcwait); // LOGGC_WAKEUP /* * Can't release log->gclock until we've tested tblk->flag */ if (tblk->flag & tblkGC_LAZY) { spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK txUnlock(tblk); tblk->flag &= ~tblkGC_LAZY; txEnd(tblk - TxBlock); /* Convert back to tid */ } else spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK jfs_info("txLazyCommit: done: tblk = 0x%p", tblk); } /* * jfs_lazycommit(void) * * To be run as a kernel daemon. If lbmIODone is called in an interrupt * context, or where blocking is not wanted, this routine will process * committed transactions from the unlock queue. */ int jfs_lazycommit(void *arg) { int WorkDone; struct tblock *tblk; unsigned long flags; struct jfs_sb_info *sbi; set_freezable(); do { LAZY_LOCK(flags); jfs_commit_thread_waking = 0; /* OK to wake another thread */ while (!list_empty(&TxAnchor.unlock_queue)) { WorkDone = 0; list_for_each_entry(tblk, &TxAnchor.unlock_queue, cqueue) { sbi = JFS_SBI(tblk->sb); /* * For each volume, the transactions must be * handled in order. If another commit thread * is handling a tblk for this superblock, * skip it */ if (sbi->commit_state & IN_LAZYCOMMIT) continue; sbi->commit_state |= IN_LAZYCOMMIT; WorkDone = 1; /* * Remove transaction from queue */ list_del(&tblk->cqueue); LAZY_UNLOCK(flags); txLazyCommit(tblk); LAZY_LOCK(flags); sbi->commit_state &= ~IN_LAZYCOMMIT; /* * Don't continue in the for loop. (We can't * anyway, it's unsafe!) We want to go back to * the beginning of the list. */ break; } /* If there was nothing to do, don't continue */ if (!WorkDone) break; } /* In case a wakeup came while all threads were active */ jfs_commit_thread_waking = 0; if (freezing(current)) { LAZY_UNLOCK(flags); try_to_freeze(); } else { DECLARE_WAITQUEUE(wq, current); add_wait_queue(&jfs_commit_thread_wait, &wq); set_current_state(TASK_INTERRUPTIBLE); LAZY_UNLOCK(flags); schedule(); remove_wait_queue(&jfs_commit_thread_wait, &wq); } } while (!kthread_should_stop()); if (!list_empty(&TxAnchor.unlock_queue)) jfs_err("jfs_lazycommit being killed w/pending transactions!"); else jfs_info("jfs_lazycommit being killed"); return 0; } void txLazyUnlock(struct tblock * tblk) { unsigned long flags; LAZY_LOCK(flags); list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue); /* * Don't wake up a commit thread if there is already one servicing * this superblock, or if the last one we woke up hasn't started yet. */ if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) && !jfs_commit_thread_waking) { jfs_commit_thread_waking = 1; wake_up(&jfs_commit_thread_wait); } LAZY_UNLOCK(flags); } static void LogSyncRelease(struct metapage * mp) { struct jfs_log *log = mp->log; assert(mp->nohomeok); assert(log); metapage_homeok(mp); } /* * txQuiesce * * Block all new transactions and push anonymous transactions to * completion * * This does almost the same thing as jfs_sync below. We don't * worry about deadlocking when jfs_tlocks_low is set, since we would * expect jfs_sync to get us out of that jam. */ void txQuiesce(struct super_block *sb) { struct inode *ip; struct jfs_inode_info *jfs_ip; struct jfs_log *log = JFS_SBI(sb)->log; tid_t tid; set_bit(log_QUIESCE, &log->flag); TXN_LOCK(); restart: while (!list_empty(&TxAnchor.anon_list)) { jfs_ip = list_entry(TxAnchor.anon_list.next, struct jfs_inode_info, anon_inode_list); ip = &jfs_ip->vfs_inode; /* * inode will be removed from anonymous list * when it is committed */ TXN_UNLOCK(); tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE); mutex_lock(&jfs_ip->commit_mutex); txCommit(tid, 1, &ip, 0); txEnd(tid); mutex_unlock(&jfs_ip->commit_mutex); /* * Just to be safe. I don't know how * long we can run without blocking */ cond_resched(); TXN_LOCK(); } /* * If jfs_sync is running in parallel, there could be some inodes * on anon_list2. Let's check. */ if (!list_empty(&TxAnchor.anon_list2)) { list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); goto restart; } TXN_UNLOCK(); /* * We may need to kick off the group commit */ jfs_flush_journal(log, 0); } /* * txResume() * * Allows transactions to start again following txQuiesce */ void txResume(struct super_block *sb) { struct jfs_log *log = JFS_SBI(sb)->log; clear_bit(log_QUIESCE, &log->flag); TXN_WAKEUP(&log->syncwait); } /* * jfs_sync(void) * * To be run as a kernel daemon. This is awakened when tlocks run low. * We write any inodes that have anonymous tlocks so they will become * available. */ int jfs_sync(void *arg) { struct inode *ip; struct jfs_inode_info *jfs_ip; tid_t tid; set_freezable(); do { /* * write each inode on the anonymous inode list */ TXN_LOCK(); while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) { jfs_ip = list_entry(TxAnchor.anon_list.next, struct jfs_inode_info, anon_inode_list); ip = &jfs_ip->vfs_inode; if (! igrab(ip)) { /* * Inode is being freed */ list_del_init(&jfs_ip->anon_inode_list); } else if (mutex_trylock(&jfs_ip->commit_mutex)) { /* * inode will be removed from anonymous list * when it is committed */ TXN_UNLOCK(); tid = txBegin(ip->i_sb, COMMIT_INODE); txCommit(tid, 1, &ip, 0); txEnd(tid); mutex_unlock(&jfs_ip->commit_mutex); iput(ip); /* * Just to be safe. I don't know how * long we can run without blocking */ cond_resched(); TXN_LOCK(); } else { /* We can't get the commit mutex. It may * be held by a thread waiting for tlock's * so let's not block here. Save it to * put back on the anon_list. */ /* Move from anon_list to anon_list2 */ list_move(&jfs_ip->anon_inode_list, &TxAnchor.anon_list2); TXN_UNLOCK(); iput(ip); TXN_LOCK(); } } /* Add anon_list2 back to anon_list */ list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); if (freezing(current)) { TXN_UNLOCK(); try_to_freeze(); } else { set_current_state(TASK_INTERRUPTIBLE); TXN_UNLOCK(); schedule(); } } while (!kthread_should_stop()); jfs_info("jfs_sync being killed"); return 0; } #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) int jfs_txanchor_proc_show(struct seq_file *m, void *v) { char *freewait; char *freelockwait; char *lowlockwait; freewait = waitqueue_active(&TxAnchor.freewait) ? "active" : "empty"; freelockwait = waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty"; lowlockwait = waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; seq_printf(m, "JFS TxAnchor\n" "============\n" "freetid = %d\n" "freewait = %s\n" "freelock = %d\n" "freelockwait = %s\n" "lowlockwait = %s\n" "tlocksInUse = %d\n" "jfs_tlocks_low = %d\n" "unlock_queue is %sempty\n", TxAnchor.freetid, freewait, TxAnchor.freelock, freelockwait, lowlockwait, TxAnchor.tlocksInUse, jfs_tlocks_low, list_empty(&TxAnchor.unlock_queue) ? "" : "not "); return 0; } #endif #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) int jfs_txstats_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS TxStats\n" "===========\n" "calls to txBegin = %d\n" "txBegin blocked by sync barrier = %d\n" "txBegin blocked by tlocks low = %d\n" "txBegin blocked by no free tid = %d\n" "calls to txBeginAnon = %d\n" "txBeginAnon blocked by sync barrier = %d\n" "txBeginAnon blocked by tlocks low = %d\n" "calls to txLockAlloc = %d\n" "tLockAlloc blocked by no free lock = %d\n", TxStat.txBegin, TxStat.txBegin_barrier, TxStat.txBegin_lockslow, TxStat.txBegin_freetid, TxStat.txBeginAnon, TxStat.txBeginAnon_barrier, TxStat.txBeginAnon_lockslow, TxStat.txLockAlloc, TxStat.txLockAlloc_freelock); return 0; } #endif
62 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 // SPDX-License-Identifier: GPL-2.0 #include <linux/sysctl.h> #include <net/lwtunnel.h> #include <net/netfilter/nf_hooks_lwtunnel.h> #include <linux/netfilter.h> #include "nf_internals.h" static inline int nf_hooks_lwtunnel_get(void) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return 1; else return 0; } static inline int nf_hooks_lwtunnel_set(int enable) { if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) { if (!enable) return -EBUSY; } else if (enable) { static_branch_enable(&nf_hooks_lwtunnel_enabled); } return 0; } #ifdef CONFIG_SYSCTL int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int proc_nf_hooks_lwtunnel_enabled = 0; struct ctl_table tmp = { .procname = table->procname, .data = &proc_nf_hooks_lwtunnel_enabled, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; int ret; if (!write) proc_nf_hooks_lwtunnel_enabled = nf_hooks_lwtunnel_get(); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = nf_hooks_lwtunnel_set(proc_nf_hooks_lwtunnel_enabled); return ret; } EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); static struct ctl_table nf_lwtunnel_sysctl_table[] = { { .procname = "nf_hooks_lwtunnel", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = nf_hooks_lwtunnel_sysctl_handler, }, }; static int __net_init nf_lwtunnel_net_init(struct net *net) { struct ctl_table_header *hdr; struct ctl_table *table; table = nf_lwtunnel_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(nf_lwtunnel_sysctl_table, sizeof(nf_lwtunnel_sysctl_table), GFP_KERNEL); if (!table) goto err_alloc; } hdr = register_net_sysctl_sz(net, "net/netfilter", table, ARRAY_SIZE(nf_lwtunnel_sysctl_table)); if (!hdr) goto err_reg; net->nf.nf_lwtnl_dir_header = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit nf_lwtunnel_net_exit(struct net *net) { const struct ctl_table *table; table = net->nf.nf_lwtnl_dir_header->ctl_table_arg; unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header); if (!net_eq(net, &init_net)) kfree(table); } static struct pernet_operations nf_lwtunnel_net_ops = { .init = nf_lwtunnel_net_init, .exit = nf_lwtunnel_net_exit, }; int __init netfilter_lwtunnel_init(void) { return register_pernet_subsys(&nf_lwtunnel_net_ops); } void netfilter_lwtunnel_fini(void) { unregister_pernet_subsys(&nf_lwtunnel_net_ops); } #else int __init netfilter_lwtunnel_init(void) { return 0; } void netfilter_lwtunnel_fini(void) {} #endif /* CONFIG_SYSCTL */
396 396 62 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 // SPDX-License-Identifier: GPL-2.0 #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { ASSERT_RTNL(); info->family = AF_INET; net->ipv4.fib_seq++; return call_fib_notifiers(net, event_type, info); } static unsigned int fib4_seq_read(struct net *net) { ASSERT_RTNL(); return net->ipv4.fib_seq + fib4_rules_seq_read(net); } static int fib4_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; err = fib4_rules_dump(net, nb, extack); if (err) return err; return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { .family = AF_INET, .fib_seq_read = fib4_seq_read, .fib_dump = fib4_dump, .owner = THIS_MODULE, }; int __net_init fib4_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv4.fib_seq = 0; ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv4.notifier_ops = ops; return 0; } void __net_exit fib4_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv4.notifier_ops); }
39 39 47 48 48 47 5 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". * * This is used to derive keys from the fscrypt master keys. * * Copyright 2019 Google LLC */ #include <crypto/hash.h> #include <crypto/sha2.h> #include "fscrypt_private.h" /* * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses * SHA-512 because it is well-established, secure, and reasonably efficient. * * HKDF-SHA256 was also considered, as its 256-bit security strength would be * sufficient here. A 512-bit security strength is "nice to have", though. * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the * common case of deriving an AES-256-XTS key (512 bits), that can result in * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. */ #define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE /* * HKDF consists of two steps: * * 1. HKDF-Extract: extract a pseudorandom key of length HKDF_HASHLEN bytes from * the input keying material and optional salt. * 2. HKDF-Expand: expand the pseudorandom key into output keying material of * any length, parameterized by an application-specific info string. * * HKDF-Extract can be skipped if the input is already a pseudorandom key of * length HKDF_HASHLEN bytes. However, cipher modes other than AES-256-XTS take * shorter keys, and we don't want to force users of those modes to provide * unnecessarily long master keys. Thus fscrypt still does HKDF-Extract. No * salt is used, since fscrypt master keys should already be pseudorandom and * there's no way to persist a random salt per master key from kernel mode. */ /* HKDF-Extract (RFC 5869 section 2.2), unsalted */ static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) { static const u8 default_salt[HKDF_HASHLEN]; int err; err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); if (err) return err; return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); } /* * Compute HKDF-Extract using the given master key as the input keying material, * and prepare an HMAC transform object keyed by the resulting pseudorandom key. * * Afterwards, the keyed HMAC transform object can be used for HKDF-Expand many * times without having to recompute HKDF-Extract each time. */ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size) { struct crypto_shash *hmac_tfm; u8 prk[HKDF_HASHLEN]; int err; hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0); if (IS_ERR(hmac_tfm)) { fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld", PTR_ERR(hmac_tfm)); return PTR_ERR(hmac_tfm); } if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { err = -EINVAL; goto err_free_tfm; } err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); if (err) goto err_free_tfm; err = crypto_shash_setkey(hmac_tfm, prk, sizeof(prk)); if (err) goto err_free_tfm; hkdf->hmac_tfm = hmac_tfm; goto out; err_free_tfm: crypto_free_shash(hmac_tfm); out: memzero_explicit(prk, sizeof(prk)); return err; } /* * HKDF-Expand (RFC 5869 section 2.3). This expands the pseudorandom key, which * was already keyed into 'hkdf->hmac_tfm' by fscrypt_init_hkdf(), into 'okmlen' * bytes of output keying material parameterized by the application-specific * 'info' of length 'infolen' bytes, prefixed by "fscrypt\0" and the 'context' * byte. This is thread-safe and may be called by multiple threads in parallel. * * ('context' isn't part of the HKDF specification; it's just a prefix fscrypt * adds to its application-specific info strings to guarantee that it doesn't * accidentally repeat an info string when using HKDF for different purposes.) */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen) { SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); u8 prefix[9]; unsigned int i; int err; const u8 *prev = NULL; u8 counter = 1; u8 tmp[HKDF_HASHLEN]; if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) return -EINVAL; desc->tfm = hkdf->hmac_tfm; memcpy(prefix, "fscrypt\0", 8); prefix[8] = context; for (i = 0; i < okmlen; i += HKDF_HASHLEN) { err = crypto_shash_init(desc); if (err) goto out; if (prev) { err = crypto_shash_update(desc, prev, HKDF_HASHLEN); if (err) goto out; } err = crypto_shash_update(desc, prefix, sizeof(prefix)); if (err) goto out; err = crypto_shash_update(desc, info, infolen); if (err) goto out; BUILD_BUG_ON(sizeof(counter) != 1); if (okmlen - i < HKDF_HASHLEN) { err = crypto_shash_finup(desc, &counter, 1, tmp); if (err) goto out; memcpy(&okm[i], tmp, okmlen - i); memzero_explicit(tmp, sizeof(tmp)); } else { err = crypto_shash_finup(desc, &counter, 1, &okm[i]); if (err) goto out; } counter++; prev = &okm[i]; } err = 0; out: if (unlikely(err)) memzero_explicit(okm, okmlen); /* so caller doesn't need to */ shash_desc_zero(desc); return err; } void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf) { crypto_free_shash(hkdf->hmac_tfm); }
426 429 162 171 129 414 6 87 292 94 94 259 280 533 532 151 439 14 62 403 403 3 153 440 330 330 326 330 329 328 330 121 51 6 63 34 84 6 103 102 68 8 56 7 101 103 6 96 61 61 17 185 153 44 44 44 154 104 103 100 11 2 8 101 101 101 6 101 6 36 332 332 332 330 296 36 332 295 25 1 1 2 2 1 5 87 86 87 85 61 28 28 69 22 69 33 33 465 464 168 97 89 98 266 1 3 12 1 7 5 188 536 86 96 96 54 44 227 160 476 475 474 193 294 114 114 114 22 100 33 113 111 114 111 16 382 248 35 215 424 147 295 296 297 297 297 297 296 297 297 297 295 295 297 250 104 155 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic INET transport hashtables * * Authors: Lotsa people, from code originally in tcp */ #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/memblock.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/inet6_hashtables.h> #endif #include <net/secure_seq.h> #include <net/hotdata.h> #include <net/ip.h> #include <net/tcp.h> #include <net/sock_reuseport.h> u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet_ehashfn); /* This function handles inet_sock, but also timewait and request sockets * for IPv4/IPv6. */ static u32 sk_ehashfn(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return inet6_ehashfn(sock_net(sk), &sk->sk_v6_rcv_saddr, sk->sk_num, &sk->sk_v6_daddr, sk->sk_dport); #endif return inet_ehashfn(sock_net(sk), sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); } /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. */ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) { write_pnet(&tb->ib_net, net); tb->l3mdev = l3mdev; tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->bhash2); hlist_add_head(&tb->node, &head->chain); } return tb; } /* * Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) { if (hlist_empty(&tb->bhash2)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } } bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev) { return net_eq(ib_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev; } static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk) { write_pnet(&tb2->ib_net, net); tb2->l3mdev = tb->l3mdev; tb2->port = tb->port; #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); if (sk->sk_family == AF_INET6) { tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; } else { tb2->addr_type = IPV6_ADDR_MAPPED; ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); } #else tb2->rcv_saddr = sk->sk_rcv_saddr; #endif INIT_HLIST_HEAD(&tb2->owners); hlist_add_head(&tb2->node, &head->chain); hlist_add_head(&tb2->bhash_node, &tb->bhash2); } struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk) { struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb2) inet_bind2_bucket_init(tb2, net, head, tb, sk); return tb2; } /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); } } static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); if (tb2->addr_type != IPV6_ADDR_MAPPED) return false; #endif return tb2->rcv_saddr == sk->sk_rcv_saddr; } void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { inet_sk(sk)->inet_num = port; inet_csk(sk)->icsk_bind_hash = tb; inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); } /* * Get rid of any references to a local port held by the given sock. */ static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; int bhash; bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); head = &hashinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind2_hash = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); } spin_unlock(&head2->lock); inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } void inet_put_port(struct sock *sk) { local_bh_disable(); __inet_put_port(sk); local_bh_enable(); } EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; struct net *net = sock_net(sk); bool update_fastreuse = false; struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; int bhash, l3mdev; bhash = inet_bhashfn(net, port, table->bhash_size); head = &table->bhash[bhash]; head2 = inet_bhashfn_portaddr(table, child, net, port); spin_lock(&head->lock); spin_lock(&head2->lock); tb = inet_csk(sk)->icsk_bind_hash; tb2 = inet_csk(sk)->icsk_bind2_hash; if (unlikely(!tb || !tb2)) { spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOENT; } if (tb->port != port) { l3mdev = inet_sk_bound_l3mdev(sk); /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption * that the listener socket's icsk_bind_hash is the same * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; } created_inet_bind_bucket = true; } update_fastreuse = true; goto bhash2_find; } else if (!inet_bind2_bucket_addr_match(tb2, child)) { l3mdev = inet_sk_bound_l3mdev(sk); bhash2_find: tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); if (!tb2) { tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, net, head2, tb, child); if (!tb2) goto error; } } if (update_fastreuse) inet_csk_update_fastreuse(tb, child); inet_bind_hash(child, tb, tb2, port); spin_unlock(&head2->lock); spin_unlock(&head->lock); return 0; error: if (created_inet_bind_bucket) inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; } EXPORT_SYMBOL_GPL(__inet_inherit_port); static struct inet_listen_hashbucket * inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); else #endif hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); return inet_lhash2_bucket(h, hash); } static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && !ipv6_only_sock(sk)) { if (sk->sk_rcv_saddr != daddr) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (sk->sk_family == PF_INET) score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } /** * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. * @net: network namespace. * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet_lookup_reuseport); /* * Here are some nice properties to exploit here. The BSD API * does not allow a listening sock to specify the remote port nor the * remote address for the connection. So always assume those are both * wildcarded during the search since they can never be otherwise. */ /* called with rcu_read_lock() : No refcount taken on the socket */ static struct sock *inet_lhash2_lookup(struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } struct sock *inet_lookup_run_sk_lookup(struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) && hashinfo == net->ipv4.tcp_death_row.hashinfo) { result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet_ehashfn); if (result) goto done; } hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with INADDR_ANY */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); /* All sockets share common refcount, but have different destructors */ void sock_gen_put(struct sock *sk) { if (!refcount_dec_and_test(&sk->sk_refcnt)) return; if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_free(inet_twsk(sk)); else if (sk->sk_state == TCP_NEW_SYN_RECV) reqsk_free(inet_reqsk(sk)); else sk_free(sk); } EXPORT_SYMBOL_GPL(sock_gen_put); void sock_edemux(struct sk_buff *skb) { sock_gen_put(skb->sk); } EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif) { INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; const struct hlist_nulls_node *node; /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet_match(net, sk, acookie, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } } /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, struct inet_timewait_sock **twp) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (sk->sk_protocol == IPPROTO_TCP && tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); } /* Searches for an exsiting socket in the ehash bucket list. * Returns true if found, false otherwise. */ static bool inet_ehash_lookup_by_sk(struct sock *sk, struct hlist_nulls_head *list) { const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); const int sdif = sk->sk_bound_dev_if; const int dif = sk->sk_bound_dev_if; const struct hlist_nulls_node *node; struct net *net = sock_net(sk); struct sock *esk; INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); sk_nulls_for_each_rcu(esk, node, list) { if (esk->sk_hash != sk->sk_hash) continue; if (sk->sk_family == AF_INET) { if (unlikely(inet_match(net, esk, acookie, ports, dif, sdif))) { return true; } } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (unlikely(inet6_match(net, esk, &sk->sk_v6_daddr, &sk->sk_v6_rcv_saddr, ports, dif, sdif))) { return true; } } #endif } return false; } /* Insert a socket into ehash, and eventually remove another one * (The another one can be a SYN_RECV or TIMEWAIT) * If an existing socket already exists, socket sk is not inserted, * and sets found_dup_sk parameter to true. */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list; spinlock_t *lock; bool ret = true; WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock(lock); if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); } else if (found_dup_sk) { *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk) ret = false; } if (ret) __sk_nulls_add_node_rcu(sk, list); spin_unlock(lock); return ret; } bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { this_cpu_inc(*sk->sk_prot->orphan_count); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } return ok; } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; const struct hlist_nulls_node *node; struct sock *sk2; kuid_t uid = sock_i_uid(sk); sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if && inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); } return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); inet_ehash_nolisten(sk, osk, NULL); local_bh_enable(); return 0; } WARN_ON(!sk_unhashed(sk)); ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); spin_lock(&ilb2->lock); if (sk->sk_reuseport) { err = inet_reuseport_add_sock(sk, ilb2); if (err) goto unlock; } sock_set_flag(sk, SOCK_RCU_FREE); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); return err; } EXPORT_SYMBOL(__inet_hash); int inet_hash(struct sock *sk) { int err = 0; if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); return err; } EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); /* Don't disable bottom halves while acquiring the lock to * avoid circular locking dependency on PREEMPT_RT. */ spin_lock(&ilb2->lock); if (sk_unhashed(sk)) { spin_unlock(&ilb2->lock); return; } if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&ilb2->lock); } else { spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); if (sk_unhashed(sk)) { spin_unlock_bh(lock); return; } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); } } EXPORT_SYMBOL_GPL(inet_unhash); static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { if (!net_eq(ib2_net(tb), net) || tb->port != port || tb->l3mdev != l3mdev) return false; return inet_bind2_bucket_addr_match(tb, sk); } bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { if (!net_eq(ib2_net(tb), net) || tb->port != port || tb->l3mdev != l3mdev) return false; #if IS_ENABLED(CONFIG_IPV6) if (tb->addr_type == IPV6_ADDR_ANY) return true; if (tb->addr_type != IPV6_ADDR_MAPPED) return false; if (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return false; #endif return tb->rcv_saddr == 0; } /* The socket's bhash2 hashbucket spinlock must be held when this is called */ struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk) { struct inet_bind2_bucket *bhash2 = NULL; inet_bind_bucket_for_each(bhash2, &head->chain) if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) break; return bhash2; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &in6addr_any, port); else #endif hash = ipv4_portaddr_hash(net, 0, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } static void inet_update_saddr(struct sock *sk, void *saddr, int family) { if (family == AF_INET) { inet_sk(sk)->inet_saddr = *(__be32 *)saddr; sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); } #if IS_ENABLED(CONFIG_IPV6) else { sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; } #endif } static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); int bhash; if (!inet_csk(sk)->icsk_bind2_hash) { /* Not bind()ed before. */ if (reset) inet_reset_saddr(sk); else inet_update_saddr(sk, saddr, family); return 0; } /* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket * allocation fails. */ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); if (!new_tb2) { if (reset) { /* The (INADDR_ANY, port) bucket might have already * been freed, then we cannot fixup icsk_bind2_hash, * so we give up and unlink sk from bhash/bhash2 not * to leave inconsistency in bhash2. */ inet_put_port(sk); inet_reset_saddr(sk); } return -ENOMEM; } bhash = inet_bhashfn(net, port, hinfo->bhash_size); head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); /* If we change saddr locklessly, another thread * iterating over bhash might see corrupted address. */ spin_lock_bh(&head->lock); spin_lock(&head2->lock); __sk_del_bind_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); if (reset) inet_reset_saddr(sk); else inet_update_saddr(sk, saddr, family); head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); } inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); return 0; } int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though * attacks were since demonstrated, thus we use 65536 by default instead * to really give more isolation and privacy, at the expense of 256kB * of kernel memory. */ #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; struct inet_timewait_sock *tw = NULL; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; bool tb_created = false; u32 remaining, offset; int ret, i, low, high; bool local_ports; int step, l3mdev; u32 index; if (port) { local_bh_disable(); ret = check_established(death_row, sk, port, NULL); local_bh_enable(); return ret; } l3mdev = inet_sk_bound_l3mdev(sk); local_ports = inet_sk_get_local_port_range(sk, &low, &high); step = local_ports ? 1 : 2; high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; if (!local_ports && remaining > 1) remaining &= ~1U; get_random_sleepable_once(table_perturb, INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); offset %= remaining; /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ if (!local_ports) offset &= ~1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += step, port += step) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, port, &tw)) goto ok; goto next_port; } } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; } tb_created = true; tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; next_port: spin_unlock_bh(&head->lock); cond_resched(); } if (!local_ports) { offset++; if ((offset & 1) && remaining > 1) goto other_parity_scan; } return -EADDRNOTAVAIL; ok: /* Find the corresponding tb2 bucket since we need to * add the socket to the bhash2 table as well */ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, tb, sk); if (!tb2) goto error; } /* Here we want to add a little bit of randomness to the next source * port that will be chosen. We use a max() with a random here so that * on low contention the randomness is maximal and on high contention * it may be inexistent. */ i = max_t(int, i, get_random_u32_below(8) * step); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head2->lock); spin_unlock(&head->lock); if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; error: if (sk_hashed(sk)) { spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(lock); __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); sk->sk_hash = 0; inet_sk(sk)->inet_sport = 0; inet_sk(sk)->inet_num = 0; if (tw) inet_twsk_bind_unhash(tw, hinfo); } spin_unlock(&head2->lock); if (tb_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); return -ENOMEM; } /* * Bind a port for a connect operation and hash it. */ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); return __inet_hash_connect(death_row, sk, port_offset, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { int i; for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, i + LISTENING_NULLS_BASE); } } void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit) { h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2), numentries, scale, 0, NULL, &h->lhash2_mask, low_limit, high_limit); init_hashinfo_lhash2(h); /* this one is used for source ports of outgoing connections */ table_perturb = alloc_large_system_hash("Table-perturb", sizeof(*table_perturb), INET_TABLE_PERTURB_SIZE, 0, 0, NULL, NULL, INET_TABLE_PERTURB_SIZE, INET_TABLE_PERTURB_SIZE); } int inet_hashinfo2_init_mod(struct inet_hashinfo *h) { h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); if (!h->lhash2) return -ENOMEM; h->lhash2_mask = INET_LHTABLE_SIZE - 1; /* INET_LHTABLE_SIZE must be a power of 2 */ BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); init_hashinfo_lhash2(h); return 0; } EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; if (locksz != 0) { /* allocate 2 cache lines or at least one spinlock per cpu */ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); /* no more locks than number of hash buckets */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!hashinfo->ehash_locks) return -ENOMEM; for (i = 0; i < nblocks; i++) spin_lock_init(&hashinfo->ehash_locks[i]); } hashinfo->ehash_locks_mask = nblocks - 1; return 0; } EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) { struct inet_hashinfo *new_hashinfo; int i; new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); if (!new_hashinfo) goto err; new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), GFP_KERNEL_ACCOUNT); if (!new_hashinfo->ehash) goto free_hashinfo; new_hashinfo->ehash_mask = ehash_entries - 1; if (inet_ehash_locks_alloc(new_hashinfo)) goto free_ehash; for (i = 0; i < ehash_entries; i++) INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); new_hashinfo->pernet = true; return new_hashinfo; free_ehash: vfree(new_hashinfo->ehash); free_hashinfo: kfree(new_hashinfo); err: return NULL; } EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { if (!hashinfo->pernet) return; inet_ehash_locks_free(hashinfo); vfree(hashinfo->ehash); kfree(hashinfo); } EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);
76 66 158 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ #ifndef _EXFAT_FS_H #define _EXFAT_FS_H #include <linux/fs.h> #include <linux/ratelimit.h> #include <linux/nls.h> #include <linux/blkdev.h> #define EXFAT_ROOT_INO 1 #define EXFAT_CLUSTERS_UNTRACKED (~0u) /* * exfat error flags */ enum exfat_error_mode { EXFAT_ERRORS_CONT, /* ignore error and continue */ EXFAT_ERRORS_PANIC, /* panic on error */ EXFAT_ERRORS_RO, /* remount r/o on error */ }; /* * exfat nls lossy flag */ enum { NLS_NAME_NO_LOSSY = 0, /* no lossy */ NLS_NAME_LOSSY = 1 << 0, /* just detected incorrect filename(s) */ NLS_NAME_OVERLEN = 1 << 1, /* the length is over than its limit */ }; #define EXFAT_HASH_BITS 8 #define EXFAT_HASH_SIZE (1UL << EXFAT_HASH_BITS) /* * Type Definitions */ #define ES_2_ENTRIES 2 #define ES_ALL_ENTRIES 0 #define ES_IDX_FILE 0 #define ES_IDX_STREAM 1 #define ES_IDX_FIRST_FILENAME 2 #define EXFAT_FILENAME_ENTRY_NUM(name_len) \ DIV_ROUND_UP(name_len, EXFAT_FILE_NAME_LEN) #define ES_IDX_LAST_FILENAME(name_len) \ (ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1) #define DIR_DELETED 0xFFFFFFF7 /* type values */ #define TYPE_UNUSED 0x0000 #define TYPE_DELETED 0x0001 #define TYPE_INVALID 0x0002 #define TYPE_CRITICAL_PRI 0x0100 #define TYPE_BITMAP 0x0101 #define TYPE_UPCASE 0x0102 #define TYPE_VOLUME 0x0103 #define TYPE_DIR 0x0104 #define TYPE_FILE 0x011F #define TYPE_CRITICAL_SEC 0x0200 #define TYPE_STREAM 0x0201 #define TYPE_EXTEND 0x0202 #define TYPE_ACL 0x0203 #define TYPE_BENIGN_PRI 0x0400 #define TYPE_GUID 0x0401 #define TYPE_PADDING 0x0402 #define TYPE_ACLTAB 0x0403 #define TYPE_BENIGN_SEC 0x0800 #define TYPE_VENDOR_EXT 0x0801 #define TYPE_VENDOR_ALLOC 0x0802 #define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */ #define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ #define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE) #define EXFAT_HINT_NONE -1 #define EXFAT_MIN_SUBDIR 2 /* * helpers for cluster size to byte conversion. */ #define EXFAT_CLU_TO_B(b, sbi) ((b) << (sbi)->cluster_size_bits) #define EXFAT_B_TO_CLU(b, sbi) ((b) >> (sbi)->cluster_size_bits) #define EXFAT_B_TO_CLU_ROUND_UP(b, sbi) \ (((b - 1) >> (sbi)->cluster_size_bits) + 1) #define EXFAT_CLU_OFFSET(off, sbi) ((off) & ((sbi)->cluster_size - 1)) /* * helpers for block size to byte conversion. */ #define EXFAT_BLK_TO_B(b, sb) ((b) << (sb)->s_blocksize_bits) #define EXFAT_B_TO_BLK(b, sb) ((b) >> (sb)->s_blocksize_bits) #define EXFAT_B_TO_BLK_ROUND_UP(b, sb) \ (((b - 1) >> (sb)->s_blocksize_bits) + 1) #define EXFAT_BLK_OFFSET(off, sb) ((off) & ((sb)->s_blocksize - 1)) /* * helpers for block size to dentry size conversion. */ #define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS) #define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS) /* * helpers for cluster size to dentry size conversion. */ #define EXFAT_CLU_TO_DEN(clu, sbi) \ ((clu) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) #define EXFAT_DEN_TO_CLU(dentry, sbi) \ ((dentry) >> ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) /* * helpers for fat entry. */ #define FAT_ENT_SIZE (4) #define FAT_ENT_SIZE_BITS (2) #define FAT_ENT_OFFSET_SECTOR(sb, loc) (EXFAT_SB(sb)->FAT1_start_sector + \ (((u64)loc << FAT_ENT_SIZE_BITS) >> sb->s_blocksize_bits)) #define FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc) \ ((loc << FAT_ENT_SIZE_BITS) & (sb->s_blocksize - 1)) /* * helpers for bitmap. */ #define CLUSTER_TO_BITMAP_ENT(clu) ((clu) - EXFAT_RESERVED_CLUSTERS) #define BITMAP_ENT_TO_CLUSTER(ent) ((ent) + EXFAT_RESERVED_CLUSTERS) #define BITS_PER_SECTOR(sb) ((sb)->s_blocksize * BITS_PER_BYTE) #define BITS_PER_SECTOR_MASK(sb) (BITS_PER_SECTOR(sb) - 1) #define BITMAP_OFFSET_SECTOR_INDEX(sb, ent) \ ((ent / BITS_PER_BYTE) >> (sb)->s_blocksize_bits) #define BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent) (ent & BITS_PER_SECTOR_MASK(sb)) #define BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent) \ ((ent / BITS_PER_BYTE) & ((sb)->s_blocksize - 1)) #define IGNORED_BITS_REMAINED(clu, clu_base) ((1UL << ((clu) - (clu_base))) - 1) #define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1) /* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */ #define ES_MAX_ENTRY_NUM ES_ENTRY_NUM(MAX_NAME_LENGTH) /* * 19 entries x 32 bytes/entry = 608 bytes. * The 608 bytes are in 3 sectors at most (even 512 Byte sector). */ #define DIR_CACHE_SIZE \ (DIV_ROUND_UP(EXFAT_DEN_TO_B(ES_MAX_ENTRY_NUM), SECTOR_SIZE) + 1) struct exfat_dentry_namebuf { char *lfn; int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */ }; /* unicode name structure */ struct exfat_uni_name { /* +3 for null and for converting */ unsigned short name[MAX_NAME_LENGTH + 3]; u16 name_hash; unsigned char name_len; }; /* directory structure */ struct exfat_chain { unsigned int dir; unsigned int size; unsigned char flags; }; /* first empty entry hint information */ struct exfat_hint_femp { /* entry index of a directory */ int eidx; /* count of continuous empty entry */ int count; /* the cluster that first empty slot exists in */ struct exfat_chain cur; }; /* hint structure */ struct exfat_hint { unsigned int clu; union { unsigned int off; /* cluster offset */ int eidx; /* entry index */ }; }; struct exfat_entry_set_cache { struct super_block *sb; unsigned int start_off; int num_bh; struct buffer_head *__bh[DIR_CACHE_SIZE]; struct buffer_head **bh; unsigned int num_entries; bool modified; }; #define IS_DYNAMIC_ES(es) ((es)->__bh != (es)->bh) struct exfat_dir_entry { struct exfat_chain dir; int entry; unsigned int type; unsigned int start_clu; unsigned char flags; unsigned short attr; loff_t size; loff_t valid_size; unsigned int num_subdirs; struct timespec64 atime; struct timespec64 mtime; struct timespec64 crtime; struct exfat_dentry_namebuf namebuf; }; /* * exfat mount in-memory data */ struct exfat_mount_options { kuid_t fs_uid; kgid_t fs_gid; unsigned short fs_fmask; unsigned short fs_dmask; /* permission for setting the [am]time */ unsigned short allow_utime; /* charset for filename input/display */ char *iocharset; /* on error: continue, panic, remount-ro */ enum exfat_error_mode errors; unsigned utf8:1, /* Use of UTF-8 character set */ sys_tz:1, /* Use local timezone */ discard:1, /* Issue discard requests on deletions */ keep_last_dots:1; /* Keep trailing periods in paths */ int time_offset; /* Offset of timestamps from UTC (in minutes) */ /* Support creating zero-size directory, default: false */ bool zero_size_dir; }; /* * EXFAT file system superblock in-memory data */ struct exfat_sb_info { unsigned long long num_sectors; /* num of sectors in volume */ unsigned int num_clusters; /* num of clusters in volume */ unsigned int cluster_size; /* cluster size in bytes */ unsigned int cluster_size_bits; unsigned int sect_per_clus; /* cluster size in sectors */ unsigned int sect_per_clus_bits; unsigned long long FAT1_start_sector; /* FAT1 start sector */ unsigned long long FAT2_start_sector; /* FAT2 start sector */ unsigned long long data_start_sector; /* data area start sector */ unsigned int num_FAT_sectors; /* num of FAT sectors */ unsigned int root_dir; /* root dir cluster */ unsigned int dentries_per_clu; /* num of dentries per cluster */ unsigned int vol_flags; /* volume flags */ unsigned int vol_flags_persistent; /* volume flags to retain */ struct buffer_head *boot_bh; /* buffer_head of BOOT sector */ unsigned int map_clu; /* allocation bitmap start cluster */ unsigned int map_sectors; /* num of allocation bitmap sectors */ struct buffer_head **vol_amap; /* allocation bitmap */ unsigned short *vol_utbl; /* upcase table */ unsigned int clu_srch_ptr; /* cluster search pointer */ unsigned int used_clusters; /* number of used clusters */ struct mutex s_lock; /* superblock lock */ struct mutex bitmap_lock; /* bitmap lock */ struct exfat_mount_options options; struct nls_table *nls_io; /* Charset used for input and display */ struct ratelimit_state ratelimit; spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; struct rcu_head rcu; }; #define EXFAT_CACHE_VALID 0 /* * EXFAT file system inode in-memory data */ struct exfat_inode_info { struct exfat_chain dir; int entry; unsigned int type; unsigned short attr; unsigned int start_clu; unsigned char flags; /* * the copy of low 32bit of i_version to check * the validation of hint_stat. */ unsigned int version; /* hint for cluster last accessed */ struct exfat_hint hint_bmap; /* hint for entry index we try to lookup next time */ struct exfat_hint hint_stat; /* hint for first empty entry */ struct exfat_hint_femp hint_femp; spinlock_t cache_lru_lock; struct list_head cache_lru; int nr_caches; /* for avoiding the race between alloc and free */ unsigned int cache_valid_id; /* * NOTE: i_size_ondisk is 64bits, so must hold ->inode_lock to access. * physically allocated size. */ loff_t i_size_ondisk; /* block-aligned i_size (used in cont_write_begin) */ loff_t i_size_aligned; /* on-disk position of directory entry or 0 */ loff_t i_pos; loff_t valid_size; /* hash by i_location */ struct hlist_node i_hash_fat; /* protect bmap against truncate */ struct rw_semaphore truncate_lock; struct inode vfs_inode; /* File creation time */ struct timespec64 i_crtime; }; static inline struct exfat_sb_info *EXFAT_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct exfat_inode_info *EXFAT_I(struct inode *inode) { return container_of(inode, struct exfat_inode_info, vfs_inode); } /* * If ->i_mode can't hold 0222 (i.e. ATTR_RO), we use ->i_attrs to * save ATTR_RO instead of ->i_mode. * * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only * bit, it's just used as flag for app. */ static inline int exfat_mode_can_hold_ro(struct inode *inode) { struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); if (S_ISDIR(inode->i_mode)) return 0; if ((~sbi->options.fs_fmask) & 0222) return 1; return 0; } /* Convert attribute bits and a mask to the UNIX mode. */ static inline mode_t exfat_make_mode(struct exfat_sb_info *sbi, unsigned short attr, mode_t mode) { if ((attr & EXFAT_ATTR_READONLY) && !(attr & EXFAT_ATTR_SUBDIR)) mode &= ~0222; if (attr & EXFAT_ATTR_SUBDIR) return (mode & ~sbi->options.fs_dmask) | S_IFDIR; return (mode & ~sbi->options.fs_fmask) | S_IFREG; } /* Return the FAT attribute byte for this inode */ static inline unsigned short exfat_make_attr(struct inode *inode) { unsigned short attr = EXFAT_I(inode)->attr; if (S_ISDIR(inode->i_mode)) attr |= EXFAT_ATTR_SUBDIR; if (exfat_mode_can_hold_ro(inode) && !(inode->i_mode & 0222)) attr |= EXFAT_ATTR_READONLY; return attr; } static inline void exfat_save_attr(struct inode *inode, unsigned short attr) { if (exfat_mode_can_hold_ro(inode)) EXFAT_I(inode)->attr = attr & (EXFAT_ATTR_RWMASK | EXFAT_ATTR_READONLY); else EXFAT_I(inode)->attr = attr & EXFAT_ATTR_RWMASK; } static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi, sector_t sec) { return ((sec - sbi->data_start_sector + 1) & ((1 << sbi->sect_per_clus_bits) - 1)) == 0; } static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi, unsigned int clus) { return ((sector_t)(clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) + sbi->data_start_sector; } static inline unsigned int exfat_sector_to_cluster(struct exfat_sb_info *sbi, sector_t sec) { return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) + EXFAT_RESERVED_CLUSTERS; } static inline bool is_valid_cluster(struct exfat_sb_info *sbi, unsigned int clus) { return clus >= EXFAT_FIRST_CLUSTER && clus < sbi->num_clusters; } /* super.c */ int exfat_set_volume_dirty(struct super_block *sb); int exfat_clear_volume_dirty(struct super_block *sb); /* fatent.c */ #define exfat_get_next_cluster(sb, pclu) exfat_ent_get(sb, *(pclu), pclu) int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, struct exfat_chain *p_chain, bool sync_bmap); int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain); int exfat_ent_get(struct super_block *sb, unsigned int loc, unsigned int *content); int exfat_ent_set(struct super_block *sb, unsigned int loc, unsigned int content); int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain, unsigned int len); int exfat_zeroed_cluster(struct inode *dir, unsigned int clu); int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, unsigned int *ret_clu); int exfat_count_num_clusters(struct super_block *sb, struct exfat_chain *p_chain, unsigned int *ret_count); /* balloc.c */ int exfat_load_bitmap(struct super_block *sb); void exfat_free_bitmap(struct exfat_sb_info *sbi); int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync); void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync); unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu); int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); /* file.c */ extern const struct file_operations exfat_file_operations; int __exfat_truncate(struct inode *inode); void exfat_truncate(struct inode *inode); int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags); int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long exfat_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* namei.c */ extern const struct dentry_operations exfat_dentry_ops; extern const struct dentry_operations exfat_utf8_dentry_ops; /* cache.c */ int exfat_cache_init(void); void exfat_cache_shutdown(void); void exfat_cache_inval_inode(struct inode *inode); int exfat_get_cluster(struct inode *inode, unsigned int cluster, unsigned int *fclus, unsigned int *dclus, unsigned int *last_dclus, int allow_eof); /* dir.c */ extern const struct inode_operations exfat_dir_inode_operations; extern const struct file_operations exfat_dir_operations; unsigned int exfat_get_entry_type(struct exfat_dentry *p_entry); void exfat_init_dir_entry(struct exfat_entry_set_cache *es, unsigned int type, unsigned int start_clu, unsigned long long size, struct timespec64 *ts); void exfat_init_ext_entry(struct exfat_entry_set_cache *es, int num_entries, struct exfat_uni_name *p_uniname); void exfat_remove_entries(struct inode *inode, struct exfat_entry_set_cache *es, int order); void exfat_update_dir_chksum(struct exfat_entry_set_cache *es); int exfat_calc_num_entries(struct exfat_uni_name *p_uniname); int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, struct exfat_hint *hint_opt); int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu); struct exfat_dentry *exfat_get_dentry(struct super_block *sb, struct exfat_chain *p_dir, int entry, struct buffer_head **bh); struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es, int num); int exfat_get_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries); int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es, struct super_block *sb, struct exfat_chain *p_dir, int entry, unsigned int num_entries); int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync); int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); /* inode.c */ extern const struct inode_operations exfat_file_inode_operations; void exfat_sync_inode(struct inode *inode); struct inode *exfat_build_inode(struct super_block *sb, struct exfat_dir_entry *info, loff_t i_pos); void exfat_hash_inode(struct inode *inode, loff_t i_pos); void exfat_unhash_inode(struct inode *inode); struct inode *exfat_iget(struct super_block *sb, loff_t i_pos); int __exfat_write_inode(struct inode *inode, int sync); int exfat_write_inode(struct inode *inode, struct writeback_control *wbc); void exfat_evict_inode(struct inode *inode); int exfat_block_truncate_page(struct inode *inode, loff_t from); /* exfat/nls.c */ unsigned short exfat_toupper(struct super_block *sb, unsigned short a); int exfat_uniname_ncmp(struct super_block *sb, unsigned short *a, unsigned short *b, unsigned int len); int exfat_utf16_to_nls(struct super_block *sb, struct exfat_uni_name *uniname, unsigned char *p_cstring, int len); int exfat_nls_to_utf16(struct super_block *sb, const unsigned char *p_cstring, const int len, struct exfat_uni_name *uniname, int *p_lossy); int exfat_create_upcase_table(struct super_block *sb); void exfat_free_upcase_table(struct exfat_sb_info *sbi); /* exfat/misc.c */ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) __printf(3, 4) __cold; #define exfat_fs_error(sb, fmt, args...) \ __exfat_fs_error(sb, 1, fmt, ## args) #define exfat_fs_error_ratelimit(sb, fmt, args...) \ __exfat_fs_error(sb, __ratelimit(&EXFAT_SB(sb)->ratelimit), \ fmt, ## args) /* expand to pr_*() with prefix */ #define exfat_err(sb, fmt, ...) \ pr_err("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) #define exfat_warn(sb, fmt, ...) \ pr_warn("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) #define exfat_info(sb, fmt, ...) \ pr_info("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) #define exfat_debug(sb, fmt, ...) \ pr_debug("exFAT-fs (%s): " fmt "\n", (sb)->s_id, ##__VA_ARGS__) void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 tz, __le16 time, __le16 date, u8 time_cs); void exfat_truncate_atime(struct timespec64 *ts); void exfat_truncate_inode_atime(struct inode *inode); void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 *tz, __le16 *time, __le16 *date, u8 *time_cs); u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type); u32 exfat_calc_chksum32(void *data, int len, u32 chksum, int type); void exfat_update_bh(struct buffer_head *bh, int sync); int exfat_update_bhs(struct buffer_head **bhs, int nr_bhs, int sync); void exfat_chain_set(struct exfat_chain *ec, unsigned int dir, unsigned int size, unsigned char flags); void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec); #endif /* !_EXFAT_FS_H */
10 10 2 2 3 3 9 9 56 58 56 2 2 1 1 14 14 14 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 // SPDX-License-Identifier: GPL-2.0-only /* * Media device node * * Copyright (C) 2010 Nokia Corporation * * Based on drivers/media/video/v4l2_dev.c code authored by * Mauro Carvalho Chehab <mchehab@kernel.org> (version 2) * Alan Cox, <alan@lxorguk.ukuu.org.uk> (version 1) * * Contacts: Laurent Pinchart <laurent.pinchart@ideasonboard.com> * Sakari Ailus <sakari.ailus@iki.fi> * * -- * * Generic media device node infrastructure to register and unregister * character devices using a dynamic major number and proper reference * counting. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> #include <linux/uaccess.h> #include <media/media-devnode.h> #include <media/media-device.h> #define MEDIA_NUM_DEVICES 256 #define MEDIA_NAME "media" static dev_t media_dev_t; /* * Active devices */ static DEFINE_MUTEX(media_devnode_lock); static DECLARE_BITMAP(media_devnode_nums, MEDIA_NUM_DEVICES); /* Called when the last user of the media device exits. */ static void media_devnode_release(struct device *cd) { struct media_devnode *devnode = to_media_devnode(cd); mutex_lock(&media_devnode_lock); /* Mark device node number as free */ clear_bit(devnode->minor, media_devnode_nums); mutex_unlock(&media_devnode_lock); /* Release media_devnode and perform other cleanups as needed. */ if (devnode->release) devnode->release(devnode); kfree(devnode); pr_debug("%s: Media Devnode Deallocated\n", __func__); } static const struct bus_type media_bus_type = { .name = MEDIA_NAME, }; static ssize_t media_read(struct file *filp, char __user *buf, size_t sz, loff_t *off) { struct media_devnode *devnode = media_devnode_data(filp); if (!devnode->fops->read) return -EINVAL; if (!media_devnode_is_registered(devnode)) return -EIO; return devnode->fops->read(filp, buf, sz, off); } static ssize_t media_write(struct file *filp, const char __user *buf, size_t sz, loff_t *off) { struct media_devnode *devnode = media_devnode_data(filp); if (!devnode->fops->write) return -EINVAL; if (!media_devnode_is_registered(devnode)) return -EIO; return devnode->fops->write(filp, buf, sz, off); } static __poll_t media_poll(struct file *filp, struct poll_table_struct *poll) { struct media_devnode *devnode = media_devnode_data(filp); if (!media_devnode_is_registered(devnode)) return EPOLLERR | EPOLLHUP; if (!devnode->fops->poll) return DEFAULT_POLLMASK; return devnode->fops->poll(filp, poll); } static long __media_ioctl(struct file *filp, unsigned int cmd, unsigned long arg, long (*ioctl_func)(struct file *filp, unsigned int cmd, unsigned long arg)) { struct media_devnode *devnode = media_devnode_data(filp); if (!ioctl_func) return -ENOTTY; if (!media_devnode_is_registered(devnode)) return -EIO; return ioctl_func(filp, cmd, arg); } static long media_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct media_devnode *devnode = media_devnode_data(filp); return __media_ioctl(filp, cmd, arg, devnode->fops->ioctl); } #ifdef CONFIG_COMPAT static long media_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct media_devnode *devnode = media_devnode_data(filp); return __media_ioctl(filp, cmd, arg, devnode->fops->compat_ioctl); } #endif /* CONFIG_COMPAT */ /* Override for the open function */ static int media_open(struct inode *inode, struct file *filp) { struct media_devnode *devnode; int ret; /* Check if the media device is available. This needs to be done with * the media_devnode_lock held to prevent an open/unregister race: * without the lock, the device could be unregistered and freed between * the media_devnode_is_registered() and get_device() calls, leading to * a crash. */ mutex_lock(&media_devnode_lock); devnode = container_of(inode->i_cdev, struct media_devnode, cdev); /* return ENXIO if the media device has been removed already or if it is not registered anymore. */ if (!media_devnode_is_registered(devnode)) { mutex_unlock(&media_devnode_lock); return -ENXIO; } /* and increase the device refcount */ get_device(&devnode->dev); mutex_unlock(&media_devnode_lock); filp->private_data = devnode; if (devnode->fops->open) { ret = devnode->fops->open(filp); if (ret) { put_device(&devnode->dev); filp->private_data = NULL; return ret; } } return 0; } /* Override for the release function */ static int media_release(struct inode *inode, struct file *filp) { struct media_devnode *devnode = media_devnode_data(filp); if (devnode->fops->release) devnode->fops->release(filp); filp->private_data = NULL; /* decrease the refcount unconditionally since the release() return value is ignored. */ put_device(&devnode->dev); return 0; } static const struct file_operations media_devnode_fops = { .owner = THIS_MODULE, .read = media_read, .write = media_write, .open = media_open, .unlocked_ioctl = media_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = media_compat_ioctl, #endif /* CONFIG_COMPAT */ .release = media_release, .poll = media_poll, .llseek = no_llseek, }; int __must_check media_devnode_register(struct media_device *mdev, struct media_devnode *devnode, struct module *owner) { int minor; int ret; /* Part 1: Find a free minor number */ mutex_lock(&media_devnode_lock); minor = find_first_zero_bit(media_devnode_nums, MEDIA_NUM_DEVICES); if (minor == MEDIA_NUM_DEVICES) { mutex_unlock(&media_devnode_lock); pr_err("could not get a free minor\n"); kfree(devnode); return -ENFILE; } set_bit(minor, media_devnode_nums); mutex_unlock(&media_devnode_lock); devnode->minor = minor; devnode->media_dev = mdev; /* Part 1: Initialize dev now to use dev.kobj for cdev.kobj.parent */ devnode->dev.bus = &media_bus_type; devnode->dev.devt = MKDEV(MAJOR(media_dev_t), devnode->minor); devnode->dev.release = media_devnode_release; if (devnode->parent) devnode->dev.parent = devnode->parent; dev_set_name(&devnode->dev, "media%d", devnode->minor); device_initialize(&devnode->dev); /* Part 2: Initialize the character device */ cdev_init(&devnode->cdev, &media_devnode_fops); devnode->cdev.owner = owner; kobject_set_name(&devnode->cdev.kobj, "media%d", devnode->minor); /* Part 3: Add the media and char device */ set_bit(MEDIA_FLAG_REGISTERED, &devnode->flags); ret = cdev_device_add(&devnode->cdev, &devnode->dev); if (ret < 0) { clear_bit(MEDIA_FLAG_REGISTERED, &devnode->flags); pr_err("%s: cdev_device_add failed\n", __func__); goto cdev_add_error; } return 0; cdev_add_error: mutex_lock(&media_devnode_lock); clear_bit(devnode->minor, media_devnode_nums); devnode->media_dev = NULL; mutex_unlock(&media_devnode_lock); put_device(&devnode->dev); return ret; } void media_devnode_unregister_prepare(struct media_devnode *devnode) { /* Check if devnode was ever registered at all */ if (!media_devnode_is_registered(devnode)) return; mutex_lock(&media_devnode_lock); clear_bit(MEDIA_FLAG_REGISTERED, &devnode->flags); mutex_unlock(&media_devnode_lock); } void media_devnode_unregister(struct media_devnode *devnode) { mutex_lock(&media_devnode_lock); /* Delete the cdev on this minor as well */ cdev_device_del(&devnode->cdev, &devnode->dev); devnode->media_dev = NULL; mutex_unlock(&media_devnode_lock); put_device(&devnode->dev); } /* * Initialise media for linux */ static int __init media_devnode_init(void) { int ret; pr_info("Linux media interface: v0.10\n"); ret = alloc_chrdev_region(&media_dev_t, 0, MEDIA_NUM_DEVICES, MEDIA_NAME); if (ret < 0) { pr_warn("unable to allocate major\n"); return ret; } ret = bus_register(&media_bus_type); if (ret < 0) { unregister_chrdev_region(media_dev_t, MEDIA_NUM_DEVICES); pr_warn("bus_register failed\n"); return -EIO; } return 0; } static void __exit media_devnode_exit(void) { bus_unregister(&media_bus_type); unregister_chrdev_region(media_dev_t, MEDIA_NUM_DEVICES); } subsys_initcall(media_devnode_init); module_exit(media_devnode_exit) MODULE_AUTHOR("Laurent Pinchart <laurent.pinchart@ideasonboard.com>"); MODULE_DESCRIPTION("Device node registration for media drivers"); MODULE_LICENSE("GPL");
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Overflow-Connection Scheduling module * * Authors: Raducu Deaconu <rhadoo_io@yahoo.com> * * Scheduler implements "overflow" loadbalancing according to number of active * connections , will keep all connections to the node with the highest weight * and overflow to the next node if the number of connections exceeds the node's * weight. * Note that this scheduler might not be suitable for UDP because it only uses * active connections */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> /* OVF Connection scheduling */ static struct ip_vs_dest * ip_vs_ovf_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *h = NULL; int hw = 0, w; IP_VS_DBG(6, "ip_vs_ovf_schedule(): Scheduling...\n"); /* select the node with highest weight, go to next in line if active * connections exceed weight */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { w = atomic_read(&dest->weight); if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || atomic_read(&dest->activeconns) > w || w == 0) continue; if (!h || w > hw) { h = dest; hw = w; } } if (h) { IP_VS_DBG_BUF(6, "OVF: server %s:%u active %d w %d\n", IP_VS_DBG_ADDR(h->af, &h->addr), ntohs(h->port), atomic_read(&h->activeconns), atomic_read(&h->weight)); return h; } ip_vs_scheduler_err(svc, "no destination available"); return NULL; } static struct ip_vs_scheduler ip_vs_ovf_scheduler = { .name = "ovf", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_ovf_scheduler.n_list), .schedule = ip_vs_ovf_schedule, }; static int __init ip_vs_ovf_init(void) { return register_ip_vs_scheduler(&ip_vs_ovf_scheduler); } static void __exit ip_vs_ovf_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_ovf_scheduler); synchronize_rcu(); } module_init(ip_vs_ovf_init); module_exit(ip_vs_ovf_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs overflow connection scheduler");
2 1 36 98 110 31 37 3 2 36 14 13 9 36 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH_X86_KVM_CPUID_H #define ARCH_X86_KVM_CPUID_H #include "x86.h" #include "reverse_cpuid.h" #include <asm/cpu.h> #include <asm/processor.h> #include <uapi/asm/kvm_para.h> extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries); int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { return vcpu->arch.maxphyaddr; } static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { return !(gpa & vcpu->arch.reserved_gpa_bits); } static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, gpa_t alignment) { return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa); } static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) { return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE); } static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, unsigned int leaf) { u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps)); *reg = kvm_cpu_caps[leaf]; } static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned int x86_feature) { const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; return __cpuid_entry_get_reg(entry, cpuid.reg); } static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned int x86_feature) { u32 *reg; reg = guest_cpuid_get_register(vcpu, x86_feature); if (!reg) return false; return *reg & __feature_bit(x86_feature); } static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned int x86_feature) { u32 *reg; reg = guest_cpuid_get_register(vcpu, x86_feature); if (reg) *reg &= ~__feature_bit(x86_feature); } static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) { return vcpu->arch.is_amd_compatible; } static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu) { return !guest_cpuid_is_amd_compatible(vcpu); } static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_family(best->eax); } static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_model(best->eax); } static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu) { return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); } static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_stepping(best->eax); } static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) { return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) || guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) || guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)); } static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) { return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) || guest_cpuid_has(vcpu, X86_FEATURE_SBPB)); } static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; } static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_misc_features_enables & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; } static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); } static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature) { return !!kvm_cpu_cap_get(x86_feature); } static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) { if (boot_cpu_has(x86_feature)) kvm_cpu_cap_set(x86_feature); } static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, unsigned int kvm_feature) { if (!vcpu->arch.pv_cpuid.enforce) return true; return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); } enum kvm_governed_features { #define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x, #include "governed_features.h" KVM_NR_GOVERNED_FEATURES }; static __always_inline int kvm_governed_feature_index(unsigned int x86_feature) { switch (x86_feature) { #define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x; #include "governed_features.h" default: return -1; } } static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature) { return kvm_governed_feature_index(x86_feature) >= 0; } static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu, unsigned int x86_feature) { BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); __set_bit(kvm_governed_feature_index(x86_feature), vcpu->arch.governed_features.enabled); } static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu, unsigned int x86_feature) { if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature)) kvm_governed_feature_set(vcpu, x86_feature); } static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, unsigned int x86_feature) { BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); return test_bit(kvm_governed_feature_index(x86_feature), vcpu->arch.governed_features.enabled); } static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (guest_can_use(vcpu, X86_FEATURE_LAM)) cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57); return kvm_vcpu_is_legal_gpa(vcpu, cr3); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 /* SPDX-License-Identifier: GPL-2.0 */ /* * USB PHY defines * * These APIs may be used between USB controllers. USB device drivers * (for either host or peripheral roles) don't use these calls; they * continue to use just usb_device and usb_gadget. */ #ifndef __LINUX_USB_PHY_H #define __LINUX_USB_PHY_H #include <linux/extcon.h> #include <linux/notifier.h> #include <linux/usb.h> #include <uapi/linux/usb/charger.h> enum usb_phy_interface { USBPHY_INTERFACE_MODE_UNKNOWN, USBPHY_INTERFACE_MODE_UTMI, USBPHY_INTERFACE_MODE_UTMIW, USBPHY_INTERFACE_MODE_ULPI, USBPHY_INTERFACE_MODE_SERIAL, USBPHY_INTERFACE_MODE_HSIC, }; enum usb_phy_events { USB_EVENT_NONE, /* no events or cable disconnected */ USB_EVENT_VBUS, /* vbus valid event */ USB_EVENT_ID, /* id was grounded */ USB_EVENT_CHARGER, /* usb dedicated charger */ USB_EVENT_ENUMERATED, /* gadget driver enumerated */ }; /* associate a type with PHY */ enum usb_phy_type { USB_PHY_TYPE_UNDEFINED, USB_PHY_TYPE_USB2, USB_PHY_TYPE_USB3, }; /* OTG defines lots of enumeration states before device reset */ enum usb_otg_state { OTG_STATE_UNDEFINED = 0, /* single-role peripheral, and dual-role default-b */ OTG_STATE_B_IDLE, OTG_STATE_B_SRP_INIT, OTG_STATE_B_PERIPHERAL, /* extra dual-role default-b states */ OTG_STATE_B_WAIT_ACON, OTG_STATE_B_HOST, /* dual-role default-a */ OTG_STATE_A_IDLE, OTG_STATE_A_WAIT_VRISE, OTG_STATE_A_WAIT_BCON, OTG_STATE_A_HOST, OTG_STATE_A_SUSPEND, OTG_STATE_A_PERIPHERAL, OTG_STATE_A_WAIT_VFALL, OTG_STATE_A_VBUS_ERR, }; struct usb_phy; struct usb_otg; /* for phys connected thru an ULPI interface, the user must * provide access ops */ struct usb_phy_io_ops { int (*read)(struct usb_phy *x, u32 reg); int (*write)(struct usb_phy *x, u32 val, u32 reg); }; struct usb_charger_current { unsigned int sdp_min; unsigned int sdp_max; unsigned int dcp_min; unsigned int dcp_max; unsigned int cdp_min; unsigned int cdp_max; unsigned int aca_min; unsigned int aca_max; }; struct usb_phy { struct device *dev; const char *label; unsigned int flags; enum usb_phy_type type; enum usb_phy_events last_event; struct usb_otg *otg; struct device *io_dev; struct usb_phy_io_ops *io_ops; void __iomem *io_priv; /* to support extcon device */ struct extcon_dev *edev; struct extcon_dev *id_edev; struct notifier_block vbus_nb; struct notifier_block id_nb; struct notifier_block type_nb; /* Support USB charger */ enum usb_charger_type chg_type; enum usb_charger_state chg_state; struct usb_charger_current chg_cur; struct work_struct chg_work; /* for notification of usb_phy_events */ struct atomic_notifier_head notifier; /* to pass extra port status to the root hub */ u16 port_status; u16 port_change; /* to support controllers that have multiple phys */ struct list_head head; /* initialize/shutdown the phy */ int (*init)(struct usb_phy *x); void (*shutdown)(struct usb_phy *x); /* enable/disable VBUS */ int (*set_vbus)(struct usb_phy *x, int on); /* effective for B devices, ignored for A-peripheral */ int (*set_power)(struct usb_phy *x, unsigned mA); /* Set phy into suspend mode */ int (*set_suspend)(struct usb_phy *x, int suspend); /* * Set wakeup enable for PHY, in that case, the PHY can be * woken up from suspend status due to external events, * like vbus change, dp/dm change and id. */ int (*set_wakeup)(struct usb_phy *x, bool enabled); /* notify phy connect status change */ int (*notify_connect)(struct usb_phy *x, enum usb_device_speed speed); int (*notify_disconnect)(struct usb_phy *x, enum usb_device_speed speed); /* * Charger detection method can be implemented if you need to * manually detect the charger type. */ enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; /* for board-specific init logic */ extern int usb_add_phy(struct usb_phy *, enum usb_phy_type type); extern int usb_add_phy_dev(struct usb_phy *); extern void usb_remove_phy(struct usb_phy *); /* helpers for direct access thru low-level io interface */ static inline int usb_phy_io_read(struct usb_phy *x, u32 reg) { if (x && x->io_ops && x->io_ops->read) return x->io_ops->read(x, reg); return -EINVAL; } static inline int usb_phy_io_write(struct usb_phy *x, u32 val, u32 reg) { if (x && x->io_ops && x->io_ops->write) return x->io_ops->write(x, val, reg); return -EINVAL; } static inline int usb_phy_init(struct usb_phy *x) { if (x && x->init) return x->init(x); return 0; } static inline void usb_phy_shutdown(struct usb_phy *x) { if (x && x->shutdown) x->shutdown(x); } static inline int usb_phy_vbus_on(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, true); } static inline int usb_phy_vbus_off(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, false); } /* for usb host and peripheral controller drivers */ #if IS_ENABLED(CONFIG_USB_PHY) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); extern void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max); extern void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state); #else static inline struct usb_phy *usb_get_phy(enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb) { return ERR_PTR(-ENXIO); } static inline void usb_put_phy(struct usb_phy *x) { } static inline void devm_usb_put_phy(struct device *dev, struct usb_phy *x) { } static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } static inline void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA) { } static inline void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max) { } static inline void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state) { } #endif static inline int usb_phy_set_power(struct usb_phy *x, unsigned mA) { if (!x) return 0; usb_phy_set_charger_current(x, mA); if (x->set_power) return x->set_power(x, mA); return 0; } /* Context: can sleep */ static inline int usb_phy_set_suspend(struct usb_phy *x, int suspend) { if (x && x->set_suspend != NULL) return x->set_suspend(x, suspend); else return 0; } static inline int usb_phy_set_wakeup(struct usb_phy *x, bool enabled) { if (x && x->set_wakeup) return x->set_wakeup(x, enabled); else return 0; } static inline int usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_connect) return x->notify_connect(x, speed); else return 0; } static inline int usb_phy_notify_disconnect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_disconnect) return x->notify_disconnect(x, speed); else return 0; } /* notifiers */ static inline int usb_register_notifier(struct usb_phy *x, struct notifier_block *nb) { return atomic_notifier_chain_register(&x->notifier, nb); } static inline void usb_unregister_notifier(struct usb_phy *x, struct notifier_block *nb) { atomic_notifier_chain_unregister(&x->notifier, nb); } static inline const char *usb_phy_type_string(enum usb_phy_type type) { switch (type) { case USB_PHY_TYPE_USB2: return "USB2 PHY"; case USB_PHY_TYPE_USB3: return "USB3 PHY"; default: return "UNKNOWN PHY TYPE"; } } #endif /* __LINUX_USB_PHY_H */
10 10 6 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> #include <linux/btf_ids.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> /* * bpf_arena is a sparsely populated shared memory region between bpf program and * user space process. * * For example on x86-64 the values could be: * user_vm_start 7f7d26200000 // picked by mmap() * kern_vm_start ffffc90001e69000 // picked by get_vm_area() * For user space all pointers within the arena are normal 8-byte addresses. * In this example 7f7d26200000 is the address of the first page (pgoff=0). * The bpf program will access it as: kern_vm_start + lower_32bit_of_user_ptr * (u32)7f7d26200000 -> 26200000 * hence * ffffc90001e69000 + 26200000 == ffffc90028069000 is "pgoff=0" within 4Gb * kernel memory region. * * BPF JITs generate the following code to access arena: * mov eax, eax // eax has lower 32-bit of user pointer * mov word ptr [rax + r12 + off], bx * where r12 == kern_vm_start and off is s16. * Hence allocate 4Gb + GUARD_SZ/2 on each side. * * Initially kernel vm_area and user vma are not populated. * User space can fault-in any address which will insert the page * into kernel and user vma. * bpf program can allocate a page via bpf_arena_alloc_pages() kfunc * which will insert it into kernel vm_area. * The later fault-in from user space will populate that page into user vma. */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ #define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { struct bpf_map map; u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; struct maple_tree mt; struct list_head vma_list; struct mutex lock; }; u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return arena ? (u64) (long) arena->kern_vm->addr + GUARD_SZ / 2 : 0; } u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) { return arena ? arena->user_vm_start : 0; } static long arena_map_peek_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } static long arena_map_push_elem(struct bpf_map *map, void *value, u64 flags) { return -EOPNOTSUPP; } static long arena_map_pop_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } static long arena_map_delete_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } static int arena_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -EOPNOTSUPP; } static long compute_pgoff(struct bpf_arena *arena, long uaddr) { return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT; } static struct bpf_map *arena_map_alloc(union bpf_attr *attr) { struct vm_struct *kern_vm; int numa_node = bpf_map_attr_numa_node(attr); struct bpf_arena *arena; u64 vm_range; int err = -ENOMEM; if (attr->key_size || attr->value_size || attr->max_entries == 0 || /* BPF_F_MMAPABLE must be set */ !(attr->map_flags & BPF_F_MMAPABLE) || /* No unsupported flags present */ (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV))) return ERR_PTR(-EINVAL); if (attr->map_extra & ~PAGE_MASK) /* If non-zero the map_extra is an expected user VMA start address */ return ERR_PTR(-EINVAL); vm_range = (u64)attr->max_entries * PAGE_SIZE; if (vm_range > SZ_4G) return ERR_PTR(-E2BIG); if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32)) /* user vma must not cross 32-bit boundary */ return ERR_PTR(-ERANGE); kern_vm = get_vm_area(KERN_VM_SZ, VM_SPARSE | VM_USERMAP); if (!kern_vm) return ERR_PTR(-ENOMEM); arena = bpf_map_area_alloc(sizeof(*arena), numa_node); if (!arena) goto err; arena->kern_vm = kern_vm; arena->user_vm_start = attr->map_extra; if (arena->user_vm_start) arena->user_vm_end = arena->user_vm_start + vm_range; INIT_LIST_HEAD(&arena->vma_list); bpf_map_init_from_attr(&arena->map, attr); mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE); mutex_init(&arena->lock); return &arena->map; err: free_vm_area(kern_vm); return ERR_PTR(err); } static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) { struct page *page; pte_t pte; pte = ptep_get(ptep); if (!pte_present(pte)) /* sanity check */ return 0; page = pte_page(pte); /* * We do not update pte here: * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug * 2. TLB flushing is batched or deferred. Even if we clear pte, * the TLB entries can stick around and continue to permit access to * the freed page. So it all relies on 1. */ __free_page(page); return 0; } static void arena_map_free(struct bpf_map *map) { struct bpf_arena *arena = container_of(map, struct bpf_arena, map); /* * Check that user vma-s are not around when bpf map is freed. * mmap() holds vm_file which holds bpf_map refcnt. * munmap() must have happened on vma followed by arena_vm_close() * which would clear arena->vma_list. */ if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) return; /* * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). * It unmaps everything from vmalloc area and clears pgtables. * Call apply_to_existing_page_range() first to find populated ptes and * free those pages. */ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); free_vm_area(arena->kern_vm); mtree_destroy(&arena->mt); bpf_map_area_free(arena); } static void *arena_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EINVAL); } static long arena_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return -EOPNOTSUPP; } static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { return 0; } static u64 arena_map_mem_usage(const struct bpf_map *map) { return 0; } struct vma_list { struct vm_area_struct *vma; struct list_head head; atomic_t mmap_count; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) { struct vma_list *vml; vml = kmalloc(sizeof(*vml), GFP_KERNEL); if (!vml) return -ENOMEM; atomic_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; list_add(&vml->head, &arena->vma_list); return 0; } static void arena_vm_open(struct vm_area_struct *vma) { struct vma_list *vml = vma->vm_private_data; atomic_inc(&vml->mmap_count); } static void arena_vm_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); struct vma_list *vml = vma->vm_private_data; if (!atomic_dec_and_test(&vml->mmap_count)) return; guard(mutex)(&arena->lock); /* update link list under lock */ list_del(&vml->head); vma->vm_private_data = NULL; kfree(vml); } #define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) { struct bpf_map *map = vmf->vma->vm_file->private_data; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); struct page *page; long kbase, kaddr; int ret; kbase = bpf_arena_get_kern_vm_start(arena); kaddr = kbase + (u32)(vmf->address); guard(mutex)(&arena->lock); page = vmalloc_to_page((void *)kaddr); if (page) /* already have a page vmap-ed */ goto out; if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ return VM_FAULT_SIGSEGV; ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL); if (ret) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); if (ret) { mtree_erase(&arena->mt, vmf->pgoff); return VM_FAULT_SIGSEGV; } ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); if (ret) { mtree_erase(&arena->mt, vmf->pgoff); __free_page(page); return VM_FAULT_SIGSEGV; } out: page_ref_add(page, 1); vmf->page = page; return 0; } static const struct vm_operations_struct arena_vm_ops = { .open = arena_vm_open, .close = arena_vm_close, .fault = arena_vm_fault, }; static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct bpf_map *map = filp->private_data; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); long ret; if (pgoff) return -EINVAL; if (len > SZ_4G) return -E2BIG; /* if user_vm_start was specified at arena creation time */ if (arena->user_vm_start) { if (len > arena->user_vm_end - arena->user_vm_start) return -E2BIG; if (len != arena->user_vm_end - arena->user_vm_start) return -EINVAL; if (addr != arena->user_vm_start) return -EINVAL; } ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags); if (IS_ERR_VALUE(ret)) return ret; if ((ret >> 32) == ((ret + len - 1) >> 32)) return ret; if (WARN_ON_ONCE(arena->user_vm_start)) /* checks at map creation time should prevent this */ return -EFAULT; return round_up(ret, SZ_4G); } static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_arena *arena = container_of(map, struct bpf_arena, map); guard(mutex)(&arena->lock); if (arena->user_vm_start && arena->user_vm_start != vma->vm_start) /* * If map_extra was not specified at arena creation time then * 1st user process can do mmap(NULL, ...) to pick user_vm_start * 2nd user process must pass the same addr to mmap(addr, MAP_FIXED..); * or * specify addr in map_extra and * use the same addr later with mmap(addr, MAP_FIXED..); */ return -EBUSY; if (arena->user_vm_end && arena->user_vm_end != vma->vm_end) /* all user processes must have the same size of mmap-ed region */ return -EBUSY; /* Earlier checks should prevent this */ if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff)) return -EFAULT; if (remember_vma(arena, vma)) return -ENOMEM; arena->user_vm_start = vma->vm_start; arena->user_vm_end = vma->vm_end; /* * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid * potential change of user_vm_start. */ vm_flags_set(vma, VM_DONTEXPAND); vma->vm_ops = &arena_vm_ops; return 0; } static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { struct bpf_arena *arena = container_of(map, struct bpf_arena, map); if ((u64)off > arena->user_vm_end - arena->user_vm_start) return -ERANGE; *imm = (unsigned long)arena->user_vm_start; return 0; } BTF_ID_LIST_SINGLE(bpf_arena_map_btf_ids, struct, bpf_arena) const struct bpf_map_ops arena_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = arena_map_alloc, .map_free = arena_map_free, .map_direct_value_addr = arena_map_direct_value_addr, .map_mmap = arena_map_mmap, .map_get_unmapped_area = arena_get_unmapped_area, .map_get_next_key = arena_map_get_next_key, .map_push_elem = arena_map_push_elem, .map_peek_elem = arena_map_peek_elem, .map_pop_elem = arena_map_pop_elem, .map_lookup_elem = arena_map_lookup_elem, .map_update_elem = arena_map_update_elem, .map_delete_elem = arena_map_delete_elem, .map_check_btf = arena_map_check_btf, .map_mem_usage = arena_map_mem_usage, .map_btf_id = &bpf_arena_map_btf_ids[0], }; static u64 clear_lo32(u64 val) { return val & ~(u64)~0U; } /* * Allocate pages and vmap them into kernel vmalloc area. * Later the pages will be mmaped into user space vma. */ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id) { /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); struct page **pages; long pgoff = 0; u32 uaddr32; int ret, i; if (page_cnt > page_cnt_max) return 0; if (uaddr) { if (uaddr & ~PAGE_MASK) return 0; pgoff = compute_pgoff(arena, uaddr); if (pgoff > page_cnt_max - page_cnt) /* requested address will be outside of user VMA */ return 0; } /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */ pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); if (!pages) return 0; guard(mutex)(&arena->lock); if (uaddr) ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1, MT_ENTRY, GFP_KERNEL); else ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY, page_cnt, 0, page_cnt_max - 1, GFP_KERNEL); if (ret) goto out_free_pages; ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, node_id, page_cnt, pages); if (ret) goto out; uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 * will not overflow 32-bit. Lower 32-bit need to represent * contiguous user address range. * Map these pages at kern_vm_start base. * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow * lower 32-bit and it's ok. */ ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); if (ret) { for (i = 0; i < page_cnt; i++) __free_page(pages[i]); goto out; } kvfree(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: mtree_erase(&arena->mt, pgoff); out_free_pages: kvfree(pages); return 0; } /* * If page is present in vmalloc area, unmap it from vmalloc area, * unmap it from all user space vma-s, * and free it. */ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { struct vma_list *vml; list_for_each_entry(vml, &arena->vma_list, head) zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt, NULL); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { u64 full_uaddr, uaddr_end; long kaddr, pgoff, i; struct page *page; /* only aligned lower 32-bit are relevant */ uaddr = (u32)uaddr; uaddr &= PAGE_MASK; full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); if (full_uaddr >= uaddr_end) return; page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; guard(mutex)(&arena->lock); pgoff = compute_pgoff(arena, uaddr); /* clear range */ mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL); if (page_cnt > 1) /* bulk zap if multiple pages being freed */ zap_pages(arena, full_uaddr, page_cnt); kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) { page = vmalloc_to_page((void *)kaddr); if (!page) continue; if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ /* Optimization for the common case of page_cnt==1: * If page wasn't mapped into some user vma there * is no need to call zap_pages which is slow. When * page_cnt is big it's faster to do the batched zap. */ zap_pages(arena, full_uaddr, 1); vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); __free_page(page); } } __bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt, int node_id, u64 flags) { struct bpf_map *map = p__map; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) return NULL; return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id); } __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) return; arena_free_pages(arena, (long)ptr__ign, page_cnt); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { .owner = THIS_MODULE, .set = &arena_kfuncs, }; static int __init kfunc_init(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init);
69 66 66 69 8 8 8 194 176 165 19 175 69 194 73 196 196 67 8 8 2 6 69 7 69 3 67 8 8 1 67 67 67 7 67 66 66 66 66 14 14 14 12 1 14 14 14 2 2 2 2 2 2 14 1 14 66 10 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 // SPDX-License-Identifier: GPL-2.0+ /* * NILFS dat/inode allocator * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Originally written by Koji Sato. * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. */ #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/bitops.h> #include <linux/slab.h> #include "mdt.h" #include "alloc.h" /** * nilfs_palloc_groups_per_desc_block - get the number of groups that a group * descriptor block can maintain * @inode: inode of metadata file using this allocator */ static inline unsigned long nilfs_palloc_groups_per_desc_block(const struct inode *inode) { return i_blocksize(inode) / sizeof(struct nilfs_palloc_group_desc); } /** * nilfs_palloc_groups_count - get maximum number of groups * @inode: inode of metadata file using this allocator */ static inline unsigned long nilfs_palloc_groups_count(const struct inode *inode) { return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); } /** * nilfs_palloc_init_blockgroup - initialize private variables for allocator * @inode: inode of metadata file using this allocator * @entry_size: size of the persistent object */ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS); if (!mi->mi_bgl) return -ENOMEM; bgl_lock_init(mi->mi_bgl); nilfs_mdt_set_entry_size(inode, entry_size, 0); mi->mi_blocks_per_group = DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), mi->mi_entries_per_block) + 1; /* * Number of blocks in a group including entry blocks * and a bitmap block */ mi->mi_blocks_per_desc_block = nilfs_palloc_groups_per_desc_block(inode) * mi->mi_blocks_per_group + 1; /* * Number of blocks per descriptor including the * descriptor block */ return 0; } /** * nilfs_palloc_group - get group number and offset from an entry number * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @offset: pointer to store offset number in the group */ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, unsigned long *offset) { __u64 group = nr; *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); return group; } /** * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number * * nilfs_palloc_desc_blkoff() returns block offset of the descriptor * block which contains a descriptor of the specified group. */ static unsigned long nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) { unsigned long desc_block = group / nilfs_palloc_groups_per_desc_block(inode); return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; } /** * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block * @inode: inode of metadata file using this allocator * @group: group number * * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap * block used to allocate/deallocate entries in the specified group. */ static unsigned long nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) { unsigned long desc_offset = group % nilfs_palloc_groups_per_desc_block(inode); return nilfs_palloc_desc_blkoff(inode, group) + 1 + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; } /** * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc */ static unsigned long nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, spinlock_t *lock) { unsigned long nfree; spin_lock(lock); nfree = le32_to_cpu(desc->pg_nfrees); spin_unlock(lock); return nfree; } /** * nilfs_palloc_group_desc_add_entries - adjust count of free entries * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc * @n: delta to be added */ static u32 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, spinlock_t *lock, u32 n) { u32 nfree; spin_lock(lock); le32_add_cpu(&desc->pg_nfrees, n); nfree = le32_to_cpu(desc->pg_nfrees); spin_unlock(lock); return nfree; } /** * nilfs_palloc_entry_blkoff - get block offset of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) */ static unsigned long nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) { unsigned long group, group_offset; group = nilfs_palloc_group(inode, nr, &group_offset); return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + group_offset / NILFS_MDT(inode)->mi_entries_per_block; } /** * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block * @inode: inode of metadata file * @bh: buffer head of the buffer to be initialized * @kaddr: kernel address mapped for the page including the buffer */ static void nilfs_palloc_desc_block_init(struct inode *inode, struct buffer_head *bh, void *kaddr) { struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh); unsigned long n = nilfs_palloc_groups_per_desc_block(inode); __le32 nfrees; nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode)); while (n-- > 0) { desc->pg_nfrees = nfrees; desc++; } } static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, int create, void (*init_block)(struct inode *, struct buffer_head *, void *), struct buffer_head **bhp, struct nilfs_bh_assoc *prev, spinlock_t *lock) { int ret; spin_lock(lock); if (prev->bh && blkoff == prev->blkoff && likely(buffer_uptodate(prev->bh))) { get_bh(prev->bh); *bhp = prev->bh; spin_unlock(lock); return 0; } spin_unlock(lock); ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp); if (!ret) { spin_lock(lock); /* * The following code must be safe for change of the * cache contents during the get block call. */ brelse(prev->bh); get_bh(*bhp); prev->bh = *bhp; prev->blkoff = blkoff; spin_unlock(lock); } return ret; } /** * nilfs_palloc_delete_block - delete a block on the persistent allocator file * @inode: inode of metadata file using this allocator * @blkoff: block offset * @prev: nilfs_bh_assoc struct of the last used buffer * @lock: spin lock protecting @prev */ static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, struct nilfs_bh_assoc *prev, spinlock_t *lock) { spin_lock(lock); if (prev->bh && blkoff == prev->blkoff) { brelse(prev->bh); prev->bh = NULL; } spin_unlock(lock); return nilfs_mdt_delete_block(inode, blkoff); } /** * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head */ static int nilfs_palloc_get_desc_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_get_block(inode, nilfs_palloc_desc_blkoff(inode, group), create, nilfs_palloc_desc_block_init, bhp, &cache->prev_desc, &cache->lock); } /** * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block * @inode: inode of metadata file using this allocator * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head */ static int nilfs_palloc_get_bitmap_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_get_block(inode, nilfs_palloc_bitmap_blkoff(inode, group), create, NULL, bhp, &cache->prev_bitmap, &cache->lock); } /** * nilfs_palloc_delete_bitmap_block - delete a bitmap block * @inode: inode of metadata file using this allocator * @group: group number */ static int nilfs_palloc_delete_bitmap_block(struct inode *inode, unsigned long group) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_delete_block(inode, nilfs_palloc_bitmap_blkoff(inode, group), &cache->prev_bitmap, &cache->lock); } /** * nilfs_palloc_get_entry_block - get buffer head of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @create: create flag * @bhp: pointer to store the resultant buffer head */ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, int create, struct buffer_head **bhp) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr), create, NULL, bhp, &cache->prev_entry, &cache->lock); } /** * nilfs_palloc_delete_entry_block - delete an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry */ static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_delete_block(inode, nilfs_palloc_entry_blkoff(inode, nr), &cache->prev_entry, &cache->lock); } /** * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor * @inode: inode of metadata file using this allocator * @group: group number * @bh: buffer head of the buffer storing the group descriptor block * @kaddr: kernel address mapped for the page including the buffer */ static struct nilfs_palloc_group_desc * nilfs_palloc_block_get_group_desc(const struct inode *inode, unsigned long group, const struct buffer_head *bh, void *kaddr) { return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + group % nilfs_palloc_groups_per_desc_block(inode); } /** * nilfs_palloc_block_get_entry - get kernel address of an entry * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @bh: buffer head of the buffer storing the entry block * @kaddr: kernel address mapped for the page including the buffer */ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, const struct buffer_head *bh, void *kaddr) { unsigned long entry_offset, group_offset; nilfs_palloc_group(inode, nr, &group_offset); entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; return kaddr + bh_offset(bh) + entry_offset * NILFS_MDT(inode)->mi_entry_size; } /** * nilfs_palloc_find_available_slot - find available slot in a group * @bitmap: bitmap of the group * @target: offset number of an entry in the group (start point) * @bsize: size in bits * @lock: spin lock protecting @bitmap * @wrap: whether to wrap around */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, unsigned int bsize, spinlock_t *lock, bool wrap) { int pos, end = bsize; if (likely(target < bsize)) { pos = target; do { pos = nilfs_find_next_zero_bit(bitmap, end, pos); if (pos >= end) break; if (!nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; } while (++pos < end); end = target; } if (!wrap) return -ENOSPC; /* wrap around */ for (pos = 0; pos < end; pos++) { pos = nilfs_find_next_zero_bit(bitmap, end, pos); if (pos >= end) break; if (!nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; } return -ENOSPC; } /** * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups * in a group descriptor block * @inode: inode of metadata file using this allocator * @curr: current group number * @max: maximum number of groups */ static unsigned long nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, unsigned long curr, unsigned long max) { return min_t(unsigned long, nilfs_palloc_groups_per_desc_block(inode) - curr % nilfs_palloc_groups_per_desc_block(inode), max - curr + 1); } /** * nilfs_palloc_count_desc_blocks - count descriptor blocks number * @inode: inode of metadata file using this allocator * @desc_blocks: descriptor blocks number [out] */ static int nilfs_palloc_count_desc_blocks(struct inode *inode, unsigned long *desc_blocks) { __u64 blknum; int ret; ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum); if (likely(!ret)) *desc_blocks = DIV_ROUND_UP( (unsigned long)blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block); return ret; } /** * nilfs_palloc_mdt_file_can_grow - check potential opportunity for * MDT file growing * @inode: inode of metadata file using this allocator * @desc_blocks: known current descriptor blocks count */ static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, unsigned long desc_blocks) { return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) < nilfs_palloc_groups_count(inode); } /** * nilfs_palloc_count_max_entries - count max number of entries that can be * described by descriptor blocks count * @inode: inode of metadata file using this allocator * @nused: current number of used entries * @nmaxp: max number of entries [out] */ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) { unsigned long desc_blocks = 0; u64 entries_per_desc_block, nmax; int err; err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks); if (unlikely(err)) return err; entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) * nilfs_palloc_groups_per_desc_block(inode); nmax = entries_per_desc_block * desc_blocks; if (nused == nmax && nilfs_palloc_mdt_file_can_grow(inode, desc_blocks)) nmax += entries_per_desc_block; if (nused > nmax) return -ERANGE; *nmaxp = nmax; return 0; } /** * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation * @wrap: whether to wrap around */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req, bool wrap) { struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; unsigned long group, maxgroup, ngroups; unsigned long group_offset, maxgroup_offset; unsigned long n, entries_per_group; unsigned long i, j; spinlock_t *lock; int pos, ret; ngroups = nilfs_palloc_groups_count(inode); maxgroup = ngroups - 1; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); entries_per_group = nilfs_palloc_entries_per_group(inode); for (i = 0; i < ngroups; i += n) { if (group >= ngroups && wrap) { /* wrap around */ group = 0; maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, &maxgroup_offset) - 1; } ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; desc_kaddr = kmap_local_page(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); for (j = 0; j < n; j++, desc++, group++, group_offset = 0) { lock = nilfs_mdt_bgl_lock(inode, group); if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0) continue; kunmap_local(desc_kaddr); ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); if (unlikely(ret < 0)) { brelse(desc_bh); return ret; } desc_kaddr = kmap_local_page(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); pos = nilfs_palloc_find_available_slot( bitmap, group_offset, entries_per_group, lock, wrap); /* * Since the search for a free slot in the second and * subsequent bitmap blocks always starts from the * beginning, the wrap flag only has an effect on the * first search. */ kunmap_local(bitmap_kaddr); if (pos >= 0) goto found; brelse(bitmap_bh); } kunmap_local(desc_kaddr); brelse(desc_bh); } /* no entries left */ return -ENOSPC; found: /* found a free entry */ nilfs_palloc_group_desc_add_entries(desc, lock, -1); req->pr_entry_nr = entries_per_group * group + pos; kunmap_local(desc_kaddr); req->pr_desc_bh = desc_bh; req->pr_bitmap_bh = bitmap_bh; return 0; } /** * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation */ void nilfs_palloc_commit_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { mark_buffer_dirty(req->pr_bitmap_bh); mark_buffer_dirty(req->pr_desc_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); } /** * nilfs_palloc_commit_free_entry - finish deallocating a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal */ void nilfs_palloc_commit_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { struct nilfs_palloc_group_desc *desc; unsigned long group, group_offset; unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, "%s (ino=%lu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap_local(bitmap_kaddr); kunmap_local(desc_kaddr); mark_buffer_dirty(req->pr_desc_bh); mark_buffer_dirty(req->pr_bitmap_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); } /** * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation */ void nilfs_palloc_abort_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { struct nilfs_palloc_group_desc *desc; void *desc_kaddr, *bitmap_kaddr; unsigned char *bitmap; unsigned long group, group_offset; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh, desc_kaddr); bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, "%s (ino=%lu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap_local(bitmap_kaddr); kunmap_local(desc_kaddr); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); req->pr_entry_nr = 0; req->pr_bitmap_bh = NULL; req->pr_desc_bh = NULL; } /** * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal */ int nilfs_palloc_prepare_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { struct buffer_head *desc_bh, *bitmap_bh; unsigned long group, group_offset; int ret; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); if (ret < 0) { brelse(desc_bh); return ret; } req->pr_desc_bh = desc_bh; req->pr_bitmap_bh = bitmap_bh; return 0; } /** * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal */ void nilfs_palloc_abort_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); req->pr_entry_nr = 0; req->pr_bitmap_bh = NULL; req->pr_desc_bh = NULL; } /** * nilfs_palloc_freev - deallocate a set of persistent objects * @inode: inode of metadata file using this allocator * @entry_nrs: array of entry numbers to be deallocated * @nitems: number of entries stored in @entry_nrs */ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) { struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; unsigned long group, group_offset; __u64 group_min_nr, last_nrs[8]; const unsigned long epg = nilfs_palloc_entries_per_group(inode); const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block; unsigned int entry_start, end, pos; spinlock_t *lock; int i, j, k, ret; u32 nfree; for (i = 0; i < nitems; i = j) { int change_group = false; int nempties = 0, n = 0; group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); if (ret < 0) return ret; ret = nilfs_palloc_get_bitmap_block(inode, group, 0, &bitmap_bh); if (ret < 0) { brelse(desc_bh); return ret; } /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; bitmap_kaddr = kmap_local_page(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); j = i; entry_start = rounddown(group_offset, epb); do { if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warn(inode->i_sb, "%s (ino=%lu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)entry_nrs[j]); } else { n++; } j++; if (j >= nitems || entry_nrs[j] < group_min_nr || entry_nrs[j] >= group_min_nr + epg) { change_group = true; } else { group_offset = entry_nrs[j] - group_min_nr; if (group_offset >= entry_start && group_offset < entry_start + epb) { /* This entry is in the same block */ continue; } } /* Test if the entry block is empty or not */ end = entry_start + epb; pos = nilfs_find_next_bit(bitmap, end, entry_start); if (pos >= end) { last_nrs[nempties++] = entry_nrs[j - 1]; if (nempties >= ARRAY_SIZE(last_nrs)) break; } if (change_group) break; /* Go on to the next entry block */ entry_start = rounddown(group_offset, epb); } while (true); kunmap_local(bitmap_kaddr); mark_buffer_dirty(bitmap_bh); brelse(bitmap_bh); for (k = 0; k < nempties; k++) { ret = nilfs_palloc_delete_entry_block(inode, last_nrs[k]); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, "error %d deleting block that object (entry=%llu, ino=%lu) belongs to", ret, (unsigned long long)last_nrs[k], inode->i_ino); } desc_kaddr = kmap_local_page(desc_bh->b_page); desc = nilfs_palloc_block_get_group_desc( inode, group, desc_bh, desc_kaddr); nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); kunmap_local(desc_kaddr); mark_buffer_dirty(desc_bh); nilfs_mdt_mark_dirty(inode); brelse(desc_bh); if (nfree == nilfs_palloc_entries_per_group(inode)) { ret = nilfs_palloc_delete_bitmap_block(inode, group); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, "error %d deleting bitmap block of group=%lu, ino=%lu", ret, group, inode->i_ino); } } return 0; } void nilfs_palloc_setup_cache(struct inode *inode, struct nilfs_palloc_cache *cache) { NILFS_MDT(inode)->mi_palloc_cache = cache; spin_lock_init(&cache->lock); } void nilfs_palloc_clear_cache(struct inode *inode) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; spin_lock(&cache->lock); brelse(cache->prev_desc.bh); brelse(cache->prev_bitmap.bh); brelse(cache->prev_entry.bh); cache->prev_desc.bh = NULL; cache->prev_bitmap.bh = NULL; cache->prev_entry.bh = NULL; spin_unlock(&cache->lock); } void nilfs_palloc_destroy_cache(struct inode *inode) { nilfs_palloc_clear_cache(inode); NILFS_MDT(inode)->mi_palloc_cache = NULL; }
62 62 62 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 // SPDX-License-Identifier: GPL-2.0-or-later /* 6LoWPAN fragment reassembly * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/ipv6/reassembly.c */ #define pr_fmt(fmt) "6LoWPAN: " fmt #include <linux/net.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ieee802154_netdev.h> #include <net/6lowpan.h> #include <net/ipv6_frag.h> #include <net/inet_frag.h> #include <net/ip.h> #include "6lowpan_i.h" static const char lowpan_frags_cache_name[] = "lowpan-frags"; static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev, struct net_device *ldev); static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { const struct frag_lowpan_compare_key *key = a; BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); memcpy(&q->key, key, sizeof(*key)); } static void lowpan_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; inet_frag_kill(&fq->q); out: spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); } static inline struct lowpan_frag_queue * fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); struct frag_lowpan_compare_key key = {}; struct inet_frag_queue *q; key.tag = cb->d_tag; key.d_size = cb->d_size; key.src = *src; key.dst = *dst; q = inet_frag_find(ieee802154_lowpan->fqdir, &key); if (!q) return NULL; return container_of(q, struct lowpan_frag_queue, q); } static int lowpan_frag_queue(struct lowpan_frag_queue *fq, struct sk_buff *skb, u8 frag_type) { struct sk_buff *prev_tail; struct net_device *ldev; int end, offset, err; /* inet_frag_queue_* functions use skb->cb; see struct ipfrag_skb_cb * in inet_fragment.c */ BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet_skb_parm)); BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet6_skb_parm)); if (fq->q.flags & INET_FRAG_COMPLETE) goto err; offset = lowpan_802154_cb(skb)->d_offset << 3; end = lowpan_802154_cb(skb)->d_size; /* Is this the final fragment? */ if (offset + skb->len == end) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto err; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) goto err; fq->q.len = end; } } ldev = skb->dev; if (ldev) skb->dev = NULL; barrier(); prev_tail = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) goto err; fq->q.stamp = skb->tstamp; fq->q.tstamp_type = skb->tstamp_type; if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; fq->q.meat += skb->len; add_frag_mem_limit(fq->q.fqdir, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { int res; unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; res = lowpan_frag_reasm(fq, skb, prev_tail, ldev); skb->_skb_refdst = orefdst; return res; } skb_dst_drop(skb); return -1; err: kfree_skb(skb); return -1; } /* Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *ldev) { void *reasm_data; inet_frag_kill(&fq->q); reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto out_oom; inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->dev = ldev; skb->tstamp = fq->q.stamp; fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 1; out_oom: net_dbg_ratelimited("lowpan_frag_reasm: no memory for reassembly\n"); return -1; } static int lowpan_frag_rx_handlers_result(struct sk_buff *skb, lowpan_rx_result res) { switch (res) { case RX_QUEUED: return NET_RX_SUCCESS; case RX_CONTINUE: /* nobody cared about this packet */ net_warn_ratelimited("%s: received unknown dispatch\n", __func__); fallthrough; default: /* all others failure */ return NET_RX_DROP; } } static lowpan_rx_result lowpan_frag_rx_h_iphc(struct sk_buff *skb) { int ret; if (!lowpan_is_iphc(*skb_network_header(skb))) return RX_CONTINUE; ret = lowpan_iphc_decompress(skb); if (ret < 0) return RX_DROP; return RX_QUEUED; } static int lowpan_invoke_frag_rx_handlers(struct sk_buff *skb) { lowpan_rx_result res; #define CALL_RXH(rxh) \ do { \ res = rxh(skb); \ if (res != RX_CONTINUE) \ goto rxh_next; \ } while (0) /* likely at first */ CALL_RXH(lowpan_frag_rx_h_iphc); CALL_RXH(lowpan_rx_h_ipv6); rxh_next: return lowpan_frag_rx_handlers_result(skb, res); #undef CALL_RXH } #define LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK 0x07 #define LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT 8 static int lowpan_get_cb(struct sk_buff *skb, u8 frag_type, struct lowpan_802154_cb *cb) { bool fail; u8 high = 0, low = 0; __be16 d_tag = 0; fail = lowpan_fetch_skb(skb, &high, 1); fail |= lowpan_fetch_skb(skb, &low, 1); /* remove the dispatch value and use first three bits as high value * for the datagram size */ cb->d_size = (high & LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK) << LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT | low; fail |= lowpan_fetch_skb(skb, &d_tag, 2); cb->d_tag = ntohs(d_tag); if (frag_type == LOWPAN_DISPATCH_FRAGN) { fail |= lowpan_fetch_skb(skb, &cb->d_offset, 1); } else { skb_reset_network_header(skb); cb->d_offset = 0; /* check if datagram_size has ipv6hdr on FRAG1 */ fail |= cb->d_size < sizeof(struct ipv6hdr); /* check if we can dereference the dispatch value */ fail |= !skb->len; } if (unlikely(fail)) return -EIO; return 0; } int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) { struct lowpan_frag_queue *fq; struct net *net = dev_net(skb->dev); struct lowpan_802154_cb *cb = lowpan_802154_cb(skb); struct ieee802154_hdr hdr = {}; int err; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) goto err; err = lowpan_get_cb(skb, frag_type, cb); if (err < 0) goto err; if (frag_type == LOWPAN_DISPATCH_FRAG1) { err = lowpan_invoke_frag_rx_handlers(skb); if (err == NET_RX_DROP) goto err; } if (cb->d_size > IPV6_MIN_MTU) { net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n"); goto err; } fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { int ret; spin_lock(&fq->q.lock); ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); return ret; } err: kfree_skb(skb); return -1; } #ifdef CONFIG_SYSCTL static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "6lowpanfrag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "6lowpanfrag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; /* secret interval has been deprecated */ static int lowpan_frags_secret_interval_unused; static struct ctl_table lowpan_frags_ctl_table[] = { { .procname = "6lowpanfrag_secret_interval", .data = &lowpan_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); size_t table_size = ARRAY_SIZE(lowpan_frags_ns_ctl_table); table = lowpan_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(lowpan_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) table_size = 0; } table[0].data = &ieee802154_lowpan->fqdir->high_thresh; table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh; table[1].data = &ieee802154_lowpan->fqdir->low_thresh; table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh; table[2].data = &ieee802154_lowpan->fqdir->timeout; hdr = register_net_sysctl_sz(net, "net/ieee802154/6lowpan", table, table_size); if (hdr == NULL) goto err_reg; ieee802154_lowpan->sysctl.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) { const struct ctl_table *table; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); table = ieee802154_lowpan->sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(ieee802154_lowpan->sysctl.frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } static struct ctl_table_header *lowpan_ctl_header; static int __init lowpan_frags_sysctl_register(void) { lowpan_ctl_header = register_net_sysctl(&init_net, "net/ieee802154/6lowpan", lowpan_frags_ctl_table); return lowpan_ctl_header == NULL ? -ENOMEM : 0; } static void lowpan_frags_sysctl_unregister(void) { unregister_net_sysctl_table(lowpan_ctl_header); } #else static inline int lowpan_frags_ns_sysctl_register(struct net *net) { return 0; } static inline void lowpan_frags_ns_sysctl_unregister(struct net *net) { } static inline int __init lowpan_frags_sysctl_register(void) { return 0; } static inline void lowpan_frags_sysctl_unregister(void) { } #endif static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); int res; res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net); if (res < 0) return res; ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT; res = lowpan_frags_ns_sysctl_register(net); if (res < 0) fqdir_exit(ieee802154_lowpan->fqdir); return res; } static void __net_exit lowpan_frags_pre_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); fqdir_pre_exit(ieee802154_lowpan->fqdir); } static void __net_exit lowpan_frags_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); fqdir_exit(ieee802154_lowpan->fqdir); } static struct pernet_operations lowpan_frags_ops = { .init = lowpan_frags_init_net, .pre_exit = lowpan_frags_pre_exit_net, .exit = lowpan_frags_exit_net, }; static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); } static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key, sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); } static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_lowpan_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static const struct rhashtable_params lowpan_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = lowpan_key_hashfn, .obj_hashfn = lowpan_obj_hashfn, .obj_cmpfn = lowpan_obj_cmpfn, .automatic_shrinking = true, }; int __init lowpan_net_frag_init(void) { int ret; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frags_cache_name = lowpan_frags_cache_name; lowpan_frags.rhash_params = lowpan_rhash_params; ret = inet_frags_init(&lowpan_frags); if (ret) goto out; ret = lowpan_frags_sysctl_register(); if (ret) goto err_sysctl; ret = register_pernet_subsys(&lowpan_frags_ops); if (ret) goto err_pernet; out: return ret; err_pernet: lowpan_frags_sysctl_unregister(); err_sysctl: inet_frags_fini(&lowpan_frags); return ret; } void lowpan_net_frag_exit(void) { lowpan_frags_sysctl_unregister(); unregister_pernet_subsys(&lowpan_frags_ops); inet_frags_fini(&lowpan_frags); }
1 649 66 32 282 373 1 83 204 2 1132 1187 1 28 1303 1318 1137 1135 1138 9 2 2 2217 21 1 394 186 512 266 146 3 6 379 19 5 109 173 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP module. * * Version: @(#)ip.h 1.0.2 05/07/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Changes: * Mike McLagan : Routing by source */ #ifndef _IP_H #define _IP_H #include <linux/types.h> #include <linux/ip.h> #include <linux/in.h> #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/sockptr.h> #include <linux/static_key.h> #include <net/inet_sock.h> #include <net/route.h> #include <net/snmp.h> #include <net/flow.h> #include <net/flow_dissector.h> #include <net/netns/hash.h> #include <net/lwtunnel.h> #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ extern unsigned int sysctl_fib_sync_mem; extern unsigned int sysctl_fib_sync_mem_min; extern unsigned int sysctl_fib_sync_mem_max; struct sock; struct inet_skb_parm { int iif; struct ip_options opt; /* Compiled IP options */ u16 flags; #define IPSKB_FORWARDED BIT(0) #define IPSKB_XFRM_TUNNEL_SIZE BIT(1) #define IPSKB_XFRM_TRANSFORMED BIT(2) #define IPSKB_FRAG_COMPLETE BIT(3) #define IPSKB_REROUTED BIT(4) #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) #define IPSKB_MULTIPATH BIT(9) u16 frag_max_size; }; static inline bool ipv4_l3mdev_skb(u16 flags) { return !!(flags & IPSKB_L3SLAVE); } static inline unsigned int ip_hdrlen(const struct sk_buff *skb) { return ip_hdr(skb)->ihl * 4; } struct ipcm_cookie { struct sockcm_cookie sockc; __be32 addr; int oif; struct ip_options_rcu *opt; __u8 protocol; __u8 ttl; __s16 tos; char priority; __u16 gso_size; }; static inline void ipcm_init(struct ipcm_cookie *ipcm) { *ipcm = (struct ipcm_cookie) { .tos = -1 }; } static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, const struct inet_sock *inet) { ipcm_init(ipcm); ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = inet->inet_num; } #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) #define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb)) /* return enslaved device index if relevant */ static inline int inet_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) return IPCB(skb)->iif; #endif return 0; } /* Special input handler for packets caught by router alert option. They are selected only by protocol field, and then processed likely local ones; but only if someone wants them! Otherwise, router not running rsvpd will kill RSVP. It is user level problem, what it will make with them. I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), but receiver should be enough clever f.e. to forward mtrace requests, sent to multicast group to reach destination designated router. */ struct ip_ra_chain { struct ip_ra_chain __rcu *next; struct sock *sk; union { void (*destructor)(struct sock *); struct sock *saved_sk; }; struct rcu_head rcu; }; /* IP flags. */ #define IP_CE 0x8000 /* Flag: "Congestion" */ #define IP_DF 0x4000 /* Flag: "Don't Fragment" */ #define IP_MF 0x2000 /* Flag: "More Fragments" */ #define IP_OFFSET 0x1FFF /* "Fragment Offset" part */ #define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */ struct msghdr; struct net_device; struct packet_type; struct rtable; struct sockaddr; int igmp_mc_init(void); /* * Functions provided by ip.c */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, u8 tos); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); struct ip_fraglist_iter { struct sk_buff *frag; struct iphdr *iph; int offset; unsigned int hlen; }; void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, unsigned int hlen, struct ip_fraglist_iter *iter); void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter); static inline struct sk_buff *ip_fraglist_next(struct ip_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip_frag_state { bool DF; unsigned int hlen; unsigned int ll_rs; unsigned int mtu; unsigned int left; int offset; int ptr; __be16 not_last_frag; }; void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state); struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state); void ip_send_check(struct iphdr *ip); int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int len, int protolen, struct ipcm_cookie *ipc, struct rtable **rt, unsigned int flags); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb); struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork); int ip_send_skb(struct net *net, struct sk_buff *skb); int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4); void ip_flush_pending_frames(struct sock *sk); struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags); int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) { return __ip_make_skb(sk, fl4, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } /* Get the route scope that should be used when sending a packet. */ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, const struct ipcm_cookie *ipc, const struct msghdr *msg) { if (sock_flag(&inet->sk, SOCK_LOCALROUTE) || msg->msg_flags & MSG_DONTROUTE || (ipc->opt && ipc->opt->opt.is_strictroute)) return RT_SCOPE_LINK; return RT_SCOPE_UNIVERSE; } static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) { return (ipc->tos != -1) ? RT_TOS(ipc->tos) : RT_TOS(READ_ONCE(inet->tos)); } /* datagram.c */ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void ip4_datagram_release_cb(struct sock *sk); struct ip_reply_arg { struct kvec iov[1]; int flags; __wsum csum; int csumoffset; /* u16 offset of csum in iov[0].iov_base */ /* -1 if not needed */ int bound_dev_if; u8 tos; kuid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) { return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define __IP_INC_STATS(net, field) __SNMP_INC_STATS64((net)->mib.ip_statistics, field) #define IP_ADD_STATS(net, field, val) SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val) #define __IP_ADD_STATS(net, field, val) __SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val) #define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val) #define __IP_UPD_PO_STATS(net, field, val) __SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val) #define NET_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.net_statistics, field) #define __NET_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.net_statistics, field) #define NET_ADD_STATS(net, field, adnd) SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) #define __NET_ADD_STATS(net, field, adnd) __SNMP_ADD_STATS((net)->mib.net_statistics, field, adnd) static inline u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) { return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); } unsigned long snmp_fold_field(void __percpu *mib, int offt); #if BITS_PER_LONG==32 u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, size_t syncp_offset); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off); #else static inline u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, size_t syncp_offset) { return snmp_get_cpu_field(mib, cpu, offct); } static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off) { return snmp_fold_field(mib, offt); } #endif #define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \ { \ int i, c; \ for_each_possible_cpu(c) { \ for (i = 0; stats_list[i].name; i++) \ buff64[i] += snmp_get_cpu_field64( \ mib_statistic, \ c, stats_list[i].entry, \ offset); \ } \ } #define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ { \ int i, c; \ for_each_possible_cpu(c) { \ for (i = 0; stats_list[i].name; i++) \ buff[i] += snmp_get_cpu_field( \ mib_statistic, \ c, stats_list[i].entry); \ } \ } static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) { u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); *low = range & 0xffff; *high = range >> 16; } bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) { if (!net->ipv4.sysctl_local_reserved_ports) return false; return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } static inline bool sysctl_dev_name_is_allowed(const char *name) { return strcmp(name, "default") != 0 && strcmp(name, "all") != 0; } static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { return port < READ_ONCE(net->ipv4.sysctl_ip_prot_sock); } #else static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) { return false; } static inline bool inet_port_requires_bind_service(struct net *net, unsigned short port) { return port < PROT_SOCK; } #endif __be32 inet_current_timestamp(void); /* From inetpeer.c */ extern int inet_peer_threshold; extern int inet_peer_minttl; extern int inet_peer_maxttl; void ipfrag_init(void); void ip_static_sysctl_init(void); #define IP4_REPLY_MARK(net, mark) \ (READ_ONCE((net)->ipv4.sysctl_fwmark_reflect) ? (mark) : 0) static inline bool ip_is_fragment(const struct iphdr *iph) { return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0; } #ifdef CONFIG_INET #include <net/dst.h> /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ static inline int ip_decrease_ttl(struct iphdr *iph) { u32 check = (__force u32)iph->check; check += (__force u32)htons(0x0100); iph->check = (__force __sum16)(check + (check>=0xFFFF)); return --iph->ttl; } static inline int ip_mtu_locked(const struct dst_entry *dst) { const struct rtable *rt = dst_rtable(dst); return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU); } static inline int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst) { u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); return pmtudisc == IP_PMTUDISC_DO || (pmtudisc == IP_PMTUDISC_WANT && !ip_mtu_locked(dst)); } static inline bool ip_sk_accept_pmtu(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); return pmtudisc != IP_PMTUDISC_INTERFACE && pmtudisc != IP_PMTUDISC_OMIT; } static inline bool ip_sk_use_pmtu(const struct sock *sk) { return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE; } static inline bool ip_sk_ignore_df(const struct sock *sk) { u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc); return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT; } static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); struct net *net = dev_net(dst->dev); unsigned int mtu; if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { mtu = rt->rt_pmtu; if (mtu && time_before(jiffies, rt->dst.expires)) goto out; } /* 'forwarding = true' case should always honour route mtu */ mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, const struct sk_buff *skb) { unsigned int mtu; if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); } mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); } struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack); static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics) { if (fib_metrics != &dst_default_metrics && refcount_dec_and_test(&fib_metrics->refcnt)) kfree(fib_metrics); } /* ipv4 and ipv6 both use refcounted metrics if it is not the default */ static inline void ip_dst_init_metrics(struct dst_entry *dst, struct dst_metrics *fib_metrics) { dst_init_metrics(dst, fib_metrics->metrics, true); if (fib_metrics != &dst_default_metrics) { dst->_metrics |= DST_METRICS_REFCOUNTED; refcount_inc(&fib_metrics->refcnt); } } static inline void ip_dst_metrics_put(struct dst_entry *dst) { struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) kfree(p); } void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, struct sock *sk, int segs) { struct iphdr *iph = ip_hdr(skb); /* We had many attacks based on IPID, use the private * generator as much as we can. */ if (sk && inet_sk(sk)->inet_daddr) { int val; /* avoid atomic operations for TCP, * as we hold socket lock at this point. */ if (sk_is_tcp(sk)) { sock_owned_by_me(sk); val = atomic_read(&inet_sk(sk)->inet_id); atomic_set(&inet_sk(sk)->inet_id, val + segs); } else { val = atomic_add_return(segs, &inet_sk(sk)->inet_id); } iph->id = htons(val); return; } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { iph->id = 0; } else { /* Unfortunately we need the big hammer to get a suitable IPID */ __ip_select_ident(net, iph, segs); } } static inline void ip_select_ident(struct net *net, struct sk_buff *skb, struct sock *sk) { ip_select_ident_segs(net, skb, sk, 1); } static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto) { return csum_tcpudp_nofold(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len, proto, 0); } /* copy IPv4 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v4addrs.src = iph->saddr; * flow->v4addrs.dst = iph->daddr; */ static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow, const struct iphdr *iph) { BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) != offsetof(typeof(flow->addrs), v4addrs.src) + sizeof(flow->addrs.v4addrs.src)); memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } /* * Map a multicast IP onto multicast MAC for type ethernet. */ static inline void ip_eth_mc_map(__be32 naddr, char *buf) { __u32 addr=ntohl(naddr); buf[0]=0x01; buf[1]=0x00; buf[2]=0x5e; buf[5]=addr&0xFF; addr>>=8; buf[4]=addr&0xFF; addr>>=8; buf[3]=addr&0x7F; } /* * Map a multicast IP onto multicast MAC for type IP-over-InfiniBand. * Leave P_Key as 0 to be filled in by driver. */ static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf) { __u32 addr; unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; addr = ntohl(naddr); buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 0x40; /* IPv4 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; buf[10] = 0; buf[11] = 0; buf[12] = 0; buf[13] = 0; buf[14] = 0; buf[15] = 0; buf[19] = addr & 0xff; addr >>= 8; buf[18] = addr & 0xff; addr >>= 8; buf[17] = addr & 0xff; addr >>= 8; buf[16] = addr & 0x0f; } static inline void ip_ipgre_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) memcpy(buf, broadcast, 4); else memcpy(buf, &naddr, sizeof(naddr)); } #if IS_ENABLED(CONFIG_IPV6) #include <linux/ipv6.h> #endif static __inline__ void inet_reset_saddr(struct sock *sk) { inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); memset(&np->saddr, 0, sizeof(np->saddr)); memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr)); } #endif } #endif static inline unsigned int ipv4_addr_hash(__be32 ip) { return (__force unsigned int) ip; } static inline u32 ipv4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) { return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } bool ip_call_ra_chain(struct sk_buff *skb); /* * Functions provided by ip_fragment.c */ enum ip_defrag_users { IP_DEFRAG_LOCAL_DELIVER, IP_DEFRAG_CALL_RA_CHAIN, IP_DEFRAG_CONNTRACK_IN, __IP_DEFRAG_CONNTRACK_IN_END = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX, IP_DEFRAG_CONNTRACK_OUT, __IP_DEFRAG_CONNTRACK_OUT_END = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX, IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, IP_DEFRAG_VS_IN, IP_DEFRAG_VS_OUT, IP_DEFRAG_VS_FWD, IP_DEFRAG_AF_PACKET, IP_DEFRAG_MACVLAN, }; /* Return true if the value of 'user' is between 'lower_bond' * and 'upper_bond' inclusively. */ static inline bool ip_defrag_user_in_between(u32 user, enum ip_defrag_users lower_bond, enum ip_defrag_users upper_bond) { return user >= lower_bond && user <= upper_bond; } int ip_defrag(struct net *net, struct sk_buff *skb, u32 user); #ifdef CONFIG_INET struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user); #else static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { return skb; } #endif /* * Functions provided by ip_forward.c */ int ip_forward(struct sk_buff *skb); /* * Functions provided by ip_options.c */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt); int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt); static inline int ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb) { return __ip_options_echo(net, dopt, skb, &IPCB(skb)->opt); } void ip_options_fragment(struct sk_buff *skb); int __ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb, __be32 *info); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); int ip_options_get(struct net *net, struct ip_options_rcu **optp, sockptr_t data, int optlen); void ip_options_undo(struct ip_options *opt); void ip_forward_options(struct sk_buff *skb); int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev); /* * Functions provided by ip_sockglue.c */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst); void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6); DECLARE_STATIC_KEY_FALSE(ip4_min_ttl); int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int do_ip_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)); int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, u32 info); static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) { ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0); } bool icmp_global_allow(void); extern int sysctl_icmp_msgs_per_sec; extern int sysctl_icmp_msgs_burst; #ifdef CONFIG_PROC_FS int ip_misc_proc_init(void); #endif int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack); static inline bool inetdev_valid_mtu(unsigned int mtu) { return likely(mtu >= IPV4_MIN_MTU); } void ip_sock_set_freebind(struct sock *sk); int ip_sock_set_mtu_discover(struct sock *sk, int val); void ip_sock_set_pktinfo(struct sock *sk); void ip_sock_set_recverr(struct sock *sk); void ip_sock_set_tos(struct sock *sk, int val); void __ip_sock_set_tos(struct sock *sk, int val); #endif /* _IP_H */
5 4 13 1 11 4 3 4 4 4 25 29 5 1 6 3 4 4 3 5 1 1 2 2 1 1 25 1 5 2 2 2 5 3 3 3 5 2 2 1 29 29 2 17 9 1 20 19 1 1 1 3 3 26 25 25 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org> * Copyright 2001-2006 Ian Kent <raven@themaw.net> */ #include "autofs_i.h" /* Check if a dentry can be expired */ static inline int autofs_can_expire(struct dentry *dentry, unsigned long timeout, unsigned int how) { struct autofs_info *ino = autofs_dentry_ino(dentry); /* dentry in the process of being deleted */ if (ino == NULL) return 0; if (!(how & AUTOFS_EXP_IMMEDIATE)) { /* Too young to die */ if (!timeout || time_after(ino->last_used + timeout, jiffies)) return 0; } return 1; } /* Check a mount point for busyness */ static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry, unsigned int how) { struct dentry *top = dentry; struct path path = {.mnt = mnt, .dentry = dentry}; int status = 1; pr_debug("dentry %p %pd\n", dentry, dentry); path_get(&path); if (!follow_down_one(&path)) goto done; if (is_autofs_dentry(path.dentry)) { struct autofs_sb_info *sbi = autofs_sbi(path.dentry->d_sb); /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) goto done; } /* Not a submount, has a forced expire been requested */ if (how & AUTOFS_EXP_FORCED) { status = 0; goto done; } /* Update the expiry counter if fs is busy */ if (!may_umount_tree(path.mnt)) { struct autofs_info *ino; ino = autofs_dentry_ino(top); ino->last_used = jiffies; goto done; } status = 0; done: pr_debug("returning = %d\n", status); path_put(&path); return status; } /* p->d_lock held */ static struct dentry *positive_after(struct dentry *p, struct dentry *child) { child = child ? d_next_sibling(child) : d_first_child(p); hlist_for_each_entry_from(child, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { dget_dlock(child); spin_unlock(&child->d_lock); return child; } spin_unlock(&child->d_lock); } return NULL; } /* * Calculate and dget next entry in the subdirs list under root. */ static struct dentry *get_next_positive_subdir(struct dentry *prev, struct dentry *root) { struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct dentry *q; spin_lock(&sbi->lookup_lock); spin_lock(&root->d_lock); q = positive_after(root, prev); spin_unlock(&root->d_lock); spin_unlock(&sbi->lookup_lock); dput(prev); return q; } /* * Calculate and dget next entry in top down tree traversal. */ static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *root) { struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct dentry *p = prev, *ret = NULL, *d = NULL; if (prev == NULL) return dget(root); spin_lock(&sbi->lookup_lock); spin_lock(&p->d_lock); while (1) { struct dentry *parent; ret = positive_after(p, d); if (ret || p == root) break; parent = p->d_parent; spin_unlock(&p->d_lock); spin_lock(&parent->d_lock); d = p; p = parent; } spin_unlock(&p->d_lock); spin_unlock(&sbi->lookup_lock); dput(prev); return ret; } /* * Check a direct mount point for busyness. * Direct mounts have similar expiry semantics to tree mounts. * The tree is not busy iff no mountpoints are busy and there are no * autofs submounts. */ static int autofs_direct_busy(struct vfsmount *mnt, struct dentry *top, unsigned long timeout, unsigned int how) { pr_debug("top %p %pd\n", top, top); /* Forced expire, user space handles busy mounts */ if (how & AUTOFS_EXP_FORCED) return 0; /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { struct autofs_info *ino; ino = autofs_dentry_ino(top); if (ino) ino->last_used = jiffies; return 1; } /* Timeout of a direct mount is determined by its top dentry */ if (!autofs_can_expire(top, timeout, how)) return 1; return 0; } /* * Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy */ static int autofs_tree_busy(struct vfsmount *mnt, struct dentry *top, unsigned long timeout, unsigned int how) { struct autofs_info *top_ino = autofs_dentry_ino(top); struct dentry *p; pr_debug("top %p %pd\n", top, top); /* Negative dentry - give up */ if (!simple_positive(top)) return 1; p = NULL; while ((p = get_next_positive_dentry(p, top))) { pr_debug("dentry %p %pd\n", p, p); /* * Is someone visiting anywhere in the subtree ? * If there's no mount we need to check the usage * count for the autofs dentry. * If the fs is busy update the expiry counter. */ if (d_mountpoint(p)) { if (autofs_mount_busy(mnt, p, how)) { top_ino->last_used = jiffies; dput(p); return 1; } } else { struct autofs_info *ino = autofs_dentry_ino(p); unsigned int ino_count = READ_ONCE(ino->count); /* allow for dget above and top is already dgot */ if (p == top) ino_count += 2; else ino_count++; if (d_count(p) > ino_count) { top_ino->last_used = jiffies; dput(p); return 1; } } } /* Forced expire, user space handles busy mounts */ if (how & AUTOFS_EXP_FORCED) return 0; /* Timeout of a tree mount is ultimately determined by its top dentry */ if (!autofs_can_expire(top, timeout, how)) return 1; return 0; } static struct dentry *autofs_check_leaves(struct vfsmount *mnt, struct dentry *parent, unsigned long timeout, unsigned int how) { struct dentry *p; pr_debug("parent %p %pd\n", parent, parent); p = NULL; while ((p = get_next_positive_dentry(p, parent))) { pr_debug("dentry %p %pd\n", p, p); if (d_mountpoint(p)) { /* Can we umount this guy */ if (autofs_mount_busy(mnt, p, how)) continue; /* This isn't a submount so if a forced expire * has been requested, user space handles busy * mounts */ if (how & AUTOFS_EXP_FORCED) return p; /* Can we expire this guy */ if (autofs_can_expire(p, timeout, how)) return p; } } return NULL; } /* Check if we can expire a direct mount (possibly a tree) */ static struct dentry *autofs_expire_direct(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, unsigned int how) { struct dentry *root = dget(sb->s_root); struct autofs_info *ino; unsigned long timeout; if (!root) return NULL; timeout = sbi->exp_timeout; if (!autofs_direct_busy(mnt, root, timeout, how)) { spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(root); /* No point expiring a pending mount */ if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); goto out; } ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); if (!autofs_direct_busy(mnt, root, timeout, how)) { spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); } out: dput(root); return NULL; } /* Check if 'dentry' should expire, or return a nearby * dentry that is suitable. * If returned dentry is different from arg dentry, * then a dget() reference was taken, else not. */ static struct dentry *should_expire(struct dentry *dentry, struct vfsmount *mnt, unsigned long timeout, unsigned int how) { struct autofs_info *ino = autofs_dentry_ino(dentry); unsigned int ino_count; /* No point expiring a pending mount */ if (ino->flags & AUTOFS_INF_PENDING) return NULL; /* * Case 1: (i) indirect mount or top level pseudo direct mount * (autofs-4.1). * (ii) indirect mount with offset mount, check the "/" * offset (autofs-5.0+). */ if (d_mountpoint(dentry)) { pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ if (autofs_mount_busy(mnt, dentry, how)) return NULL; /* This isn't a submount so if a forced expire * has been requested, user space handles busy * mounts */ if (how & AUTOFS_EXP_FORCED) return dentry; /* Can we expire this guy */ if (autofs_can_expire(dentry, timeout, how)) return dentry; return NULL; } if (d_is_symlink(dentry)) { pr_debug("checking symlink %p %pd\n", dentry, dentry); /* Forced expire, user space handles busy mounts */ if (how & AUTOFS_EXP_FORCED) return dentry; /* * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. */ if (autofs_can_expire(dentry, timeout, how)) return dentry; return NULL; } if (autofs_empty(ino)) return NULL; /* Case 2: tree mount, expire iff entire tree is not busy */ if (!(how & AUTOFS_EXP_LEAVES)) { /* Not a forced expire? */ if (!(how & AUTOFS_EXP_FORCED)) { /* ref-walk currently on this dentry? */ ino_count = READ_ONCE(ino->count) + 1; if (d_count(dentry) > ino_count) return NULL; } if (!autofs_tree_busy(mnt, dentry, timeout, how)) return dentry; /* * Case 3: pseudo direct mount, expire individual leaves * (autofs-4.1). */ } else { struct dentry *expired; /* Not a forced expire? */ if (!(how & AUTOFS_EXP_FORCED)) { /* ref-walk currently on this dentry? */ ino_count = READ_ONCE(ino->count) + 1; if (d_count(dentry) > ino_count) return NULL; } expired = autofs_check_leaves(mnt, dentry, timeout, how); if (expired) { if (expired == dentry) dput(dentry); return expired; } } return NULL; } /* * Find an eligible tree to time-out * A tree is eligible if :- * - it is unused by any user process * - it has been unused for exp_timeout time */ static struct dentry *autofs_expire_indirect(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, unsigned int how) { unsigned long timeout; struct dentry *root = sb->s_root; struct dentry *dentry; struct dentry *expired; struct dentry *found; struct autofs_info *ino; if (!root) return NULL; timeout = sbi->exp_timeout; dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(dentry); if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; } spin_unlock(&sbi->fs_lock); expired = should_expire(dentry, mnt, timeout, how); if (!expired) continue; spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); /* Make sure a reference is not taken on found if * things have changed. */ how &= ~AUTOFS_EXP_LEAVES; found = should_expire(expired, mnt, timeout, how); if (found != expired) { // something has changed, continue dput(found); goto next; } if (expired != dentry) dput(dentry); spin_lock(&sbi->fs_lock); goto found; next: spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); if (expired != dentry) dput(expired); } return NULL; found: pr_debug("returning %p %pd\n", expired, expired); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return expired; } int autofs_expire_wait(const struct path *path, int rcu_walk) { struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); int status; int state; /* Block on any pending expire */ if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) return 0; if (rcu_walk) return -ECHILD; retry: spin_lock(&sbi->fs_lock); state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING); if (state == AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); /* * Possibly being selected for expire, wait until * it's selected or not. */ schedule_timeout_uninterruptible(HZ/10); goto retry; } if (state & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); status = autofs_wait(sbi, path, NFY_NONE); wait_for_completion(&ino->expire_complete); pr_debug("expire done status=%d\n", status); if (d_unhashed(dentry)) return -EAGAIN; return status; } spin_unlock(&sbi->fs_lock); return 0; } /* Perform an expiry operation */ int autofs_expire_run(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; struct autofs_info *ino; struct dentry *dentry; int ret = 0; memset(&pkt, 0, sizeof(pkt)); pkt.hdr.proto_version = sbi->version; pkt.hdr.type = autofs_ptype_expire; dentry = autofs_expire_indirect(sb, mnt, sbi, 0); if (!dentry) return -EAGAIN; pkt.len = dentry->d_name.len; memcpy(pkt.name, dentry->d_name.name, pkt.len); pkt.name[pkt.len] = '\0'; if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) ret = -EFAULT; spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(dentry); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = jiffies; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); return ret; } int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, unsigned int how) { struct dentry *dentry; int ret = -EAGAIN; if (autofs_type_trigger(sbi->type)) dentry = autofs_expire_direct(sb, mnt, sbi, how); else dentry = autofs_expire_indirect(sb, mnt, sbi, how); if (dentry) { struct autofs_info *ino = autofs_dentry_ino(dentry); const struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier */ ret = autofs_wait(sbi, &path, NFY_EXPIRE); spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = jiffies; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); } return ret; } /* * Call repeatedly until it returns -EAGAIN, meaning there's nothing * more to be done. */ int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int __user *arg) { unsigned int how = 0; if (arg && get_user(how, arg)) return -EFAULT; return autofs_do_expire_multi(sb, mnt, sbi, how); }
13 13 13 4 4 7 6 1 3 3 15 11 10 15 33 33 18 15 12 33 1 1 2 14 5 4 1 1 10 4 1 5 5 5 6 1 1 3 32 2 18 12 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 // SPDX-License-Identifier: GPL-2.0-only /* * drivers/dma-buf/sync_file.c * * Copyright (C) 2012 Google, Inc. */ #include <linux/dma-fence-unwrap.h> #include <linux/export.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/anon_inodes.h> #include <linux/sync_file.h> #include <uapi/linux/sync_file.h> static const struct file_operations sync_file_fops; static struct sync_file *sync_file_alloc(void) { struct sync_file *sync_file; sync_file = kzalloc(sizeof(*sync_file), GFP_KERNEL); if (!sync_file) return NULL; sync_file->file = anon_inode_getfile("sync_file", &sync_file_fops, sync_file, 0); if (IS_ERR(sync_file->file)) goto err; init_waitqueue_head(&sync_file->wq); INIT_LIST_HEAD(&sync_file->cb.node); return sync_file; err: kfree(sync_file); return NULL; } static void fence_check_cb_func(struct dma_fence *f, struct dma_fence_cb *cb) { struct sync_file *sync_file; sync_file = container_of(cb, struct sync_file, cb); wake_up_all(&sync_file->wq); } /** * sync_file_create() - creates a sync file * @fence: fence to add to the sync_fence * * Creates a sync_file containg @fence. This function acquires and additional * reference of @fence for the newly-created &sync_file, if it succeeds. The * sync_file can be released with fput(sync_file->file). Returns the * sync_file or NULL in case of error. */ struct sync_file *sync_file_create(struct dma_fence *fence) { struct sync_file *sync_file; sync_file = sync_file_alloc(); if (!sync_file) return NULL; sync_file->fence = dma_fence_get(fence); return sync_file; } EXPORT_SYMBOL(sync_file_create); static struct sync_file *sync_file_fdget(int fd) { struct file *file = fget(fd); if (!file) return NULL; if (file->f_op != &sync_file_fops) goto err; return file->private_data; err: fput(file); return NULL; } /** * sync_file_get_fence - get the fence related to the sync_file fd * @fd: sync_file fd to get the fence from * * Ensures @fd references a valid sync_file and returns a fence that * represents all fence in the sync_file. On error NULL is returned. */ struct dma_fence *sync_file_get_fence(int fd) { struct sync_file *sync_file; struct dma_fence *fence; sync_file = sync_file_fdget(fd); if (!sync_file) return NULL; fence = dma_fence_get(sync_file->fence); fput(sync_file->file); return fence; } EXPORT_SYMBOL(sync_file_get_fence); /** * sync_file_get_name - get the name of the sync_file * @sync_file: sync_file to get the fence from * @buf: destination buffer to copy sync_file name into * @len: available size of destination buffer. * * Each sync_file may have a name assigned either by the user (when merging * sync_files together) or created from the fence it contains. In the latter * case construction of the name is deferred until use, and so requires * sync_file_get_name(). * * Returns: a string representing the name. */ char *sync_file_get_name(struct sync_file *sync_file, char *buf, int len) { if (sync_file->user_name[0]) { strscpy(buf, sync_file->user_name, len); } else { struct dma_fence *fence = sync_file->fence; snprintf(buf, len, "%s-%s%llu-%lld", fence->ops->get_driver_name(fence), fence->ops->get_timeline_name(fence), fence->context, fence->seqno); } return buf; } /** * sync_file_merge() - merge two sync_files * @name: name of new fence * @a: sync_file a * @b: sync_file b * * Creates a new sync_file which contains copies of all the fences in both * @a and @b. @a and @b remain valid, independent sync_file. Returns the * new merged sync_file or NULL in case of error. */ static struct sync_file *sync_file_merge(const char *name, struct sync_file *a, struct sync_file *b) { struct sync_file *sync_file; struct dma_fence *fence; sync_file = sync_file_alloc(); if (!sync_file) return NULL; fence = dma_fence_unwrap_merge(a->fence, b->fence); if (!fence) { fput(sync_file->file); return NULL; } sync_file->fence = fence; strscpy(sync_file->user_name, name, sizeof(sync_file->user_name)); return sync_file; } static int sync_file_release(struct inode *inode, struct file *file) { struct sync_file *sync_file = file->private_data; if (test_bit(POLL_ENABLED, &sync_file->flags)) dma_fence_remove_callback(sync_file->fence, &sync_file->cb); dma_fence_put(sync_file->fence); kfree(sync_file); return 0; } static __poll_t sync_file_poll(struct file *file, poll_table *wait) { struct sync_file *sync_file = file->private_data; poll_wait(file, &sync_file->wq, wait); if (list_empty(&sync_file->cb.node) && !test_and_set_bit(POLL_ENABLED, &sync_file->flags)) { if (dma_fence_add_callback(sync_file->fence, &sync_file->cb, fence_check_cb_func) < 0) wake_up_all(&sync_file->wq); } return dma_fence_is_signaled(sync_file->fence) ? EPOLLIN : 0; } static long sync_file_ioctl_merge(struct sync_file *sync_file, unsigned long arg) { int fd = get_unused_fd_flags(O_CLOEXEC); int err; struct sync_file *fence2, *fence3; struct sync_merge_data data; if (fd < 0) return fd; if (copy_from_user(&data, (void __user *)arg, sizeof(data))) { err = -EFAULT; goto err_put_fd; } if (data.flags || data.pad) { err = -EINVAL; goto err_put_fd; } fence2 = sync_file_fdget(data.fd2); if (!fence2) { err = -ENOENT; goto err_put_fd; } data.name[sizeof(data.name) - 1] = '\0'; fence3 = sync_file_merge(data.name, sync_file, fence2); if (!fence3) { err = -ENOMEM; goto err_put_fence2; } data.fence = fd; if (copy_to_user((void __user *)arg, &data, sizeof(data))) { err = -EFAULT; goto err_put_fence3; } fd_install(fd, fence3->file); fput(fence2->file); return 0; err_put_fence3: fput(fence3->file); err_put_fence2: fput(fence2->file); err_put_fd: put_unused_fd(fd); return err; } static int sync_fill_fence_info(struct dma_fence *fence, struct sync_fence_info *info) { strscpy(info->obj_name, fence->ops->get_timeline_name(fence), sizeof(info->obj_name)); strscpy(info->driver_name, fence->ops->get_driver_name(fence), sizeof(info->driver_name)); info->status = dma_fence_get_status(fence); info->timestamp_ns = dma_fence_is_signaled(fence) ? ktime_to_ns(dma_fence_timestamp(fence)) : ktime_set(0, 0); return info->status; } static long sync_file_ioctl_fence_info(struct sync_file *sync_file, unsigned long arg) { struct sync_fence_info *fence_info = NULL; struct dma_fence_unwrap iter; struct sync_file_info info; unsigned int num_fences; struct dma_fence *fence; int ret; __u32 size; if (copy_from_user(&info, (void __user *)arg, sizeof(info))) return -EFAULT; if (info.flags || info.pad) return -EINVAL; num_fences = 0; dma_fence_unwrap_for_each(fence, &iter, sync_file->fence) ++num_fences; /* * Passing num_fences = 0 means that userspace doesn't want to * retrieve any sync_fence_info. If num_fences = 0 we skip filling * sync_fence_info and return the actual number of fences on * info->num_fences. */ if (!info.num_fences) { info.status = dma_fence_get_status(sync_file->fence); goto no_fences; } else { info.status = 1; } if (info.num_fences < num_fences) return -EINVAL; size = num_fences * sizeof(*fence_info); fence_info = kzalloc(size, GFP_KERNEL); if (!fence_info) return -ENOMEM; num_fences = 0; dma_fence_unwrap_for_each(fence, &iter, sync_file->fence) { int status; status = sync_fill_fence_info(fence, &fence_info[num_fences++]); info.status = info.status <= 0 ? info.status : status; } if (copy_to_user(u64_to_user_ptr(info.sync_fence_info), fence_info, size)) { ret = -EFAULT; goto out; } no_fences: sync_file_get_name(sync_file, info.name, sizeof(info.name)); info.num_fences = num_fences; if (copy_to_user((void __user *)arg, &info, sizeof(info))) ret = -EFAULT; else ret = 0; out: kfree(fence_info); return ret; } static int sync_file_ioctl_set_deadline(struct sync_file *sync_file, unsigned long arg) { struct sync_set_deadline ts; if (copy_from_user(&ts, (void __user *)arg, sizeof(ts))) return -EFAULT; if (ts.pad) return -EINVAL; dma_fence_set_deadline(sync_file->fence, ns_to_ktime(ts.deadline_ns)); return 0; } static long sync_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct sync_file *sync_file = file->private_data; switch (cmd) { case SYNC_IOC_MERGE: return sync_file_ioctl_merge(sync_file, arg); case SYNC_IOC_FILE_INFO: return sync_file_ioctl_fence_info(sync_file, arg); case SYNC_IOC_SET_DEADLINE: return sync_file_ioctl_set_deadline(sync_file, arg); default: return -ENOTTY; } } static const struct file_operations sync_file_fops = { .release = sync_file_release, .poll = sync_file_poll, .unlocked_ioctl = sync_file_ioctl, .compat_ioctl = compat_ptr_ioctl, };
9 2 8 3 5 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2011, 2012 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv6/ip6t_NPT.h> #include <linux/netfilter/x_tables.h> static int ip6t_npt_checkentry(const struct xt_tgchk_param *par) { struct ip6t_npt_tginfo *npt = par->targinfo; struct in6_addr pfx; __wsum src_sum, dst_sum; if (npt->src_pfx_len > 64 || npt->dst_pfx_len > 64) return -EINVAL; /* Ensure that LSB of prefix is zero */ ipv6_addr_prefix(&pfx, &npt->src_pfx.in6, npt->src_pfx_len); if (!ipv6_addr_equal(&pfx, &npt->src_pfx.in6)) return -EINVAL; ipv6_addr_prefix(&pfx, &npt->dst_pfx.in6, npt->dst_pfx_len); if (!ipv6_addr_equal(&pfx, &npt->dst_pfx.in6)) return -EINVAL; src_sum = csum_partial(&npt->src_pfx.in6, sizeof(npt->src_pfx.in6), 0); dst_sum = csum_partial(&npt->dst_pfx.in6, sizeof(npt->dst_pfx.in6), 0); npt->adjustment = ~csum_fold(csum_sub(src_sum, dst_sum)); return 0; } static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt, struct in6_addr *addr) { unsigned int pfx_len; unsigned int i, idx; __be32 mask; __sum16 sum; pfx_len = max(npt->src_pfx_len, npt->dst_pfx_len); for (i = 0; i < pfx_len; i += 32) { if (pfx_len - i >= 32) mask = 0; else mask = htonl((1 << (i - pfx_len + 32)) - 1); idx = i / 32; addr->s6_addr32[idx] &= mask; addr->s6_addr32[idx] |= ~mask & npt->dst_pfx.in6.s6_addr32[idx]; } if (pfx_len <= 48) idx = 3; else { for (idx = 4; idx < ARRAY_SIZE(addr->s6_addr16); idx++) { if ((__force __sum16)addr->s6_addr16[idx] != CSUM_MANGLED_0) break; } if (idx == ARRAY_SIZE(addr->s6_addr16)) return false; } sum = ~csum_fold(csum_add(csum_unfold((__force __sum16)addr->s6_addr16[idx]), csum_unfold(npt->adjustment))); if (sum == CSUM_MANGLED_0) sum = 0; *(__force __sum16 *)&addr->s6_addr16[idx] = sum; return true; } static struct ipv6hdr *icmpv6_bounced_ipv6hdr(struct sk_buff *skb, struct ipv6hdr *_bounced_hdr) { if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) return NULL; if (!icmpv6_is_err(icmp6_hdr(skb)->icmp6_type)) return NULL; return skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(struct icmp6hdr), sizeof(struct ipv6hdr), _bounced_hdr); } static unsigned int ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_npt_tginfo *npt = par->targinfo; struct ipv6hdr _bounced_hdr; struct ipv6hdr *bounced_hdr; struct in6_addr bounced_pfx; if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) { icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, saddr)); return NF_DROP; } /* rewrite dst addr of bounced packet which was sent to dst range */ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr); if (bounced_hdr) { ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->daddr, npt->src_pfx_len); if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0) ip6t_npt_map_pfx(npt, &bounced_hdr->daddr); } return XT_CONTINUE; } static unsigned int ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_npt_tginfo *npt = par->targinfo; struct ipv6hdr _bounced_hdr; struct ipv6hdr *bounced_hdr; struct in6_addr bounced_pfx; if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) { icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, daddr)); return NF_DROP; } /* rewrite src addr of bounced packet which was sent from dst range */ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr); if (bounced_hdr) { ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->saddr, npt->src_pfx_len); if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0) ip6t_npt_map_pfx(npt, &bounced_hdr->saddr); } return XT_CONTINUE; } static struct xt_target ip6t_npt_target_reg[] __read_mostly = { { .name = "SNPT", .table = "mangle", .target = ip6t_snpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_POST_ROUTING), .me = THIS_MODULE, }, { .name = "DNPT", .table = "mangle", .target = ip6t_dnpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .usersize = offsetof(struct ip6t_npt_tginfo, adjustment), .checkentry = ip6t_npt_checkentry, .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, }; static int __init ip6t_npt_init(void) { return xt_register_targets(ip6t_npt_target_reg, ARRAY_SIZE(ip6t_npt_target_reg)); } static void __exit ip6t_npt_exit(void) { xt_unregister_targets(ip6t_npt_target_reg, ARRAY_SIZE(ip6t_npt_target_reg)); } module_init(ip6t_npt_init); module_exit(ip6t_npt_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6-to-IPv6 Network Prefix Translation (RFC 6296)"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS("ip6t_SNPT"); MODULE_ALIAS("ip6t_DNPT");
106 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ #include <linux/rhashtable-types.h> #include <linux/completion.h> #include <linux/in6.h> #include <linux/rbtree_types.h> #include <linux/refcount.h> #include <net/dropreason-core.h> /* Per netns frag queues directory */ struct fqdir { /* sysctls */ long high_thresh; long low_thresh; int timeout; int max_dist; struct inet_frags *f; struct net *net; bool dead; struct rhashtable rhashtable ____cacheline_aligned_in_smp; /* Keep atomic mem on separate cachelines in structs that include it */ atomic_long_t mem ____cacheline_aligned_in_smp; struct work_struct destroy_work; struct llist_node free_list; }; /** * enum: fragment queue flags * * @INET_FRAG_FIRST_IN: first fragment has arrived * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable * @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed) */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), INET_FRAG_HASH_DEAD = BIT(3), INET_FRAG_DROP = BIT(4), }; struct frag_v4_compare_key { __be32 saddr; __be32 daddr; u32 user; u32 vif; __be16 id; u16 protocol; }; struct frag_v6_compare_key { struct in6_addr saddr; struct in6_addr daddr; u32 user; __be32 id; u32 iif; }; /** * struct inet_frag_queue - fragment queue * * @node: rhash node * @key: keys identifying this frag. * @timer: queue expiration timer * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far * @tstamp_type: stamp has a mono delivery time (EDT) * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { struct rhash_head node; union { struct frag_v4_compare_key v4; struct frag_v6_compare_key v6; } key; struct timer_list timer; spinlock_t lock; refcount_t refcnt; struct rb_root rb_fragments; struct sk_buff *fragments_tail; struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; u8 tstamp_type; __u8 flags; u16 max_size; struct fqdir *fqdir; struct rcu_head rcu; }; struct inet_frags { unsigned int qsize; void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; struct rhashtable_params rhash_params; refcount_t refcnt; struct completion completion; }; int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); static inline void fqdir_pre_exit(struct fqdir *fqdir) { /* Prevent creation of new frags. * Pairs with READ_ONCE() in inet_frag_find(). */ WRITE_ONCE(fqdir->high_thresh, 0); /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire() * and ip6frag_expire_frag_queue(). */ WRITE_ONCE(fqdir->dead, true); } void fqdir_exit(struct fqdir *fqdir); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); /* Free all skbs in the queue; return the sum of their truesizes. */ unsigned int inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason); static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) inet_frag_destroy(q); } /* Memory Tracking Functions. */ static inline long frag_mem_limit(const struct fqdir *fqdir) { return atomic_long_read(&fqdir->mem); } static inline void sub_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_sub(val, &fqdir->mem); } static inline void add_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_add(val, &fqdir->mem); } /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. */ #define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ #define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ #define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ #define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ extern const u8 ip_frag_ecn_table[16]; /* Return values of inet_frag_queue_insert() */ #define IPFRAG_OK 0 #define IPFRAG_DUP 1 #define IPFRAG_OVERLAP 2 int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, int offset, int end); void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); #endif
8 24 27 41 52 59 212 65 34 1 61 7 95 145 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 /* * net/tipc/trace.h: TIPC tracepoints * * Copyright (c) 2018, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tipc #if !defined(_TIPC_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TIPC_TRACE_H #include <linux/tracepoint.h> #include "core.h" #include "link.h" #include "socket.h" #include "node.h" #define SKB_LMIN (100) #define SKB_LMAX (SKB_LMIN * 2) #define LIST_LMIN (SKB_LMIN * 3) #define LIST_LMAX (SKB_LMIN * 11) #define SK_LMIN (SKB_LMIN * 2) #define SK_LMAX (SKB_LMIN * 11) #define LINK_LMIN (SKB_LMIN) #define LINK_LMAX (SKB_LMIN * 16) #define NODE_LMIN (SKB_LMIN) #define NODE_LMAX (SKB_LMIN * 11) #ifndef __TIPC_TRACE_ENUM #define __TIPC_TRACE_ENUM enum { TIPC_DUMP_NONE = 0, TIPC_DUMP_TRANSMQ = 1, TIPC_DUMP_BACKLOGQ = (1 << 1), TIPC_DUMP_DEFERDQ = (1 << 2), TIPC_DUMP_INPUTQ = (1 << 3), TIPC_DUMP_WAKEUP = (1 << 4), TIPC_DUMP_SK_SNDQ = (1 << 8), TIPC_DUMP_SK_RCVQ = (1 << 9), TIPC_DUMP_SK_BKLGQ = (1 << 10), TIPC_DUMP_ALL = 0xffffu }; #endif /* Link & Node FSM states: */ #define state_sym(val) \ __print_symbolic(val, \ {(0xe), "ESTABLISHED" },\ {(0xe << 4), "ESTABLISHING" },\ {(0x1 << 8), "RESET" },\ {(0x2 << 12), "RESETTING" },\ {(0xd << 16), "PEER_RESET" },\ {(0xf << 20), "FAILINGOVER" },\ {(0xc << 24), "SYNCHING" },\ {(0xdd), "SELF_DOWN_PEER_DOWN" },\ {(0xaa), "SELF_UP_PEER_UP" },\ {(0xd1), "SELF_DOWN_PEER_LEAVING" },\ {(0xac), "SELF_UP_PEER_COMING" },\ {(0xca), "SELF_COMING_PEER_UP" },\ {(0x1d), "SELF_LEAVING_PEER_DOWN" },\ {(0xf0), "FAILINGOVER" },\ {(0xcc), "SYNCHING" }) /* Link & Node FSM events: */ #define evt_sym(val) \ __print_symbolic(val, \ {(0xec1ab1e), "ESTABLISH_EVT" },\ {(0x9eed0e), "PEER_RESET_EVT" },\ {(0xfa110e), "FAILURE_EVT" },\ {(0x10ca1d0e), "RESET_EVT" },\ {(0xfa110bee), "FAILOVER_BEGIN_EVT" },\ {(0xfa110ede), "FAILOVER_END_EVT" },\ {(0xc1ccbee), "SYNCH_BEGIN_EVT" },\ {(0xc1ccede), "SYNCH_END_EVT" },\ {(0xece), "SELF_ESTABL_CONTACT_EVT" },\ {(0x1ce), "SELF_LOST_CONTACT_EVT" },\ {(0x9ece), "PEER_ESTABL_CONTACT_EVT" },\ {(0x91ce), "PEER_LOST_CONTACT_EVT" },\ {(0xfbe), "FAILOVER_BEGIN_EVT" },\ {(0xfee), "FAILOVER_END_EVT" },\ {(0xcbe), "SYNCH_BEGIN_EVT" },\ {(0xcee), "SYNCH_END_EVT" }) /* Bearer, net device events: */ #define dev_evt_sym(val) \ __print_symbolic(val, \ {(NETDEV_CHANGE), "NETDEV_CHANGE" },\ {(NETDEV_GOING_DOWN), "NETDEV_GOING_DOWN" },\ {(NETDEV_UP), "NETDEV_UP" },\ {(NETDEV_CHANGEMTU), "NETDEV_CHANGEMTU" },\ {(NETDEV_CHANGEADDR), "NETDEV_CHANGEADDR" },\ {(NETDEV_UNREGISTER), "NETDEV_UNREGISTER" },\ {(NETDEV_CHANGENAME), "NETDEV_CHANGENAME" }) extern unsigned long sysctl_tipc_sk_filter[5] __read_mostly; int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf); int tipc_list_dump(struct sk_buff_head *list, bool more, char *buf); int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf); int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf); int tipc_node_dump(struct tipc_node *n, bool more, char *buf); bool tipc_sk_filtering(struct sock *sk); DECLARE_EVENT_CLASS(tipc_skb_class, TP_PROTO(struct sk_buff *skb, bool more, const char *header), TP_ARGS(skb, more, header), TP_STRUCT__entry( __string(header, header) __dynamic_array(char, buf, (more) ? SKB_LMAX : SKB_LMIN) ), TP_fast_assign( __assign_str(header); tipc_skb_dump(skb, more, __get_str(buf)); ), TP_printk("%s\n%s", __get_str(header), __get_str(buf)) ) #define DEFINE_SKB_EVENT(name) \ DEFINE_EVENT(tipc_skb_class, name, \ TP_PROTO(struct sk_buff *skb, bool more, const char *header), \ TP_ARGS(skb, more, header)) DEFINE_SKB_EVENT(tipc_skb_dump); DEFINE_SKB_EVENT(tipc_proto_build); DEFINE_SKB_EVENT(tipc_proto_rcv); DECLARE_EVENT_CLASS(tipc_list_class, TP_PROTO(struct sk_buff_head *list, bool more, const char *header), TP_ARGS(list, more, header), TP_STRUCT__entry( __string(header, header) __dynamic_array(char, buf, (more) ? LIST_LMAX : LIST_LMIN) ), TP_fast_assign( __assign_str(header); tipc_list_dump(list, more, __get_str(buf)); ), TP_printk("%s\n%s", __get_str(header), __get_str(buf)) ); #define DEFINE_LIST_EVENT(name) \ DEFINE_EVENT(tipc_list_class, name, \ TP_PROTO(struct sk_buff_head *list, bool more, const char *header), \ TP_ARGS(list, more, header)) DEFINE_LIST_EVENT(tipc_list_dump); DECLARE_EVENT_CLASS(tipc_sk_class, TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, const char *header), TP_ARGS(sk, skb, dqueues, header), TP_STRUCT__entry( __string(header, header) __field(u32, portid) __dynamic_array(char, buf, (dqueues) ? SK_LMAX : SK_LMIN) __dynamic_array(char, skb_buf, (skb) ? SKB_LMIN : 1) ), TP_fast_assign( __assign_str(header); __entry->portid = tipc_sock_get_portid(sk); tipc_sk_dump(sk, dqueues, __get_str(buf)); if (skb) tipc_skb_dump(skb, false, __get_str(skb_buf)); else *(__get_str(skb_buf)) = '\0'; ), TP_printk("<%u> %s\n%s%s", __entry->portid, __get_str(header), __get_str(skb_buf), __get_str(buf)) ); #define DEFINE_SK_EVENT_FILTER(name) \ DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ const char *header), \ TP_ARGS(sk, skb, dqueues, header), \ TP_CONDITION(tipc_sk_filtering(sk))) DEFINE_SK_EVENT_FILTER(tipc_sk_dump); DEFINE_SK_EVENT_FILTER(tipc_sk_create); DEFINE_SK_EVENT_FILTER(tipc_sk_sendmcast); DEFINE_SK_EVENT_FILTER(tipc_sk_sendmsg); DEFINE_SK_EVENT_FILTER(tipc_sk_sendstream); DEFINE_SK_EVENT_FILTER(tipc_sk_poll); DEFINE_SK_EVENT_FILTER(tipc_sk_filter_rcv); DEFINE_SK_EVENT_FILTER(tipc_sk_advance_rx); DEFINE_SK_EVENT_FILTER(tipc_sk_rej_msg); DEFINE_SK_EVENT_FILTER(tipc_sk_drop_msg); DEFINE_SK_EVENT_FILTER(tipc_sk_release); DEFINE_SK_EVENT_FILTER(tipc_sk_shutdown); #define DEFINE_SK_EVENT_FILTER_COND(name, cond) \ DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ const char *header), \ TP_ARGS(sk, skb, dqueues, header), \ TP_CONDITION(tipc_sk_filtering(sk) && (cond))) DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit1, tipc_sk_overlimit1(sk, skb)); DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit2, tipc_sk_overlimit2(sk, skb)); DECLARE_EVENT_CLASS(tipc_link_class, TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), TP_ARGS(l, dqueues, header), TP_STRUCT__entry( __string(header, header) __array(char, name, TIPC_MAX_LINK_NAME) __dynamic_array(char, buf, (dqueues) ? LINK_LMAX : LINK_LMIN) ), TP_fast_assign( __assign_str(header); memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME); tipc_link_dump(l, dqueues, __get_str(buf)); ), TP_printk("<%s> %s\n%s", __entry->name, __get_str(header), __get_str(buf)) ); #define DEFINE_LINK_EVENT(name) \ DEFINE_EVENT(tipc_link_class, name, \ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ TP_ARGS(l, dqueues, header)) DEFINE_LINK_EVENT(tipc_link_dump); DEFINE_LINK_EVENT(tipc_link_conges); DEFINE_LINK_EVENT(tipc_link_timeout); DEFINE_LINK_EVENT(tipc_link_reset); #define DEFINE_LINK_EVENT_COND(name, cond) \ DEFINE_EVENT_CONDITION(tipc_link_class, name, \ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ TP_ARGS(l, dqueues, header), \ TP_CONDITION(cond)) DEFINE_LINK_EVENT_COND(tipc_link_too_silent, tipc_link_too_silent(l)); DECLARE_EVENT_CLASS(tipc_link_transmq_class, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_STRUCT__entry( __array(char, name, TIPC_MAX_LINK_NAME) __field(u16, from) __field(u16, to) __field(u32, len) __field(u16, fseqno) __field(u16, lseqno) ), TP_fast_assign( memcpy(__entry->name, tipc_link_name(r), TIPC_MAX_LINK_NAME); __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); __entry->fseqno = __entry->len ? msg_seqno(buf_msg(skb_peek(tq))) : 0; __entry->lseqno = __entry->len ? msg_seqno(buf_msg(skb_peek_tail(tq))) : 0; ), TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_CONDITION(less_eq(f, t)) ); DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); DECLARE_EVENT_CLASS(tipc_node_class, TP_PROTO(struct tipc_node *n, bool more, const char *header), TP_ARGS(n, more, header), TP_STRUCT__entry( __string(header, header) __field(u32, addr) __dynamic_array(char, buf, (more) ? NODE_LMAX : NODE_LMIN) ), TP_fast_assign( __assign_str(header); __entry->addr = tipc_node_get_addr(n); tipc_node_dump(n, more, __get_str(buf)); ), TP_printk("<%x> %s\n%s", __entry->addr, __get_str(header), __get_str(buf)) ); #define DEFINE_NODE_EVENT(name) \ DEFINE_EVENT(tipc_node_class, name, \ TP_PROTO(struct tipc_node *n, bool more, const char *header), \ TP_ARGS(n, more, header)) DEFINE_NODE_EVENT(tipc_node_dump); DEFINE_NODE_EVENT(tipc_node_create); DEFINE_NODE_EVENT(tipc_node_delete); DEFINE_NODE_EVENT(tipc_node_lost_contact); DEFINE_NODE_EVENT(tipc_node_timeout); DEFINE_NODE_EVENT(tipc_node_link_up); DEFINE_NODE_EVENT(tipc_node_link_down); DEFINE_NODE_EVENT(tipc_node_reset_links); DEFINE_NODE_EVENT(tipc_node_check_state); DECLARE_EVENT_CLASS(tipc_fsm_class, TP_PROTO(const char *name, u32 os, u32 ns, int evt), TP_ARGS(name, os, ns, evt), TP_STRUCT__entry( __string(name, name) __field(u32, os) __field(u32, ns) __field(u32, evt) ), TP_fast_assign( __assign_str(name); __entry->os = os; __entry->ns = ns; __entry->evt = evt; ), TP_printk("<%s> %s--(%s)->%s\n", __get_str(name), state_sym(__entry->os), evt_sym(__entry->evt), state_sym(__entry->ns)) ); #define DEFINE_FSM_EVENT(fsm_name) \ DEFINE_EVENT(tipc_fsm_class, fsm_name, \ TP_PROTO(const char *name, u32 os, u32 ns, int evt), \ TP_ARGS(name, os, ns, evt)) DEFINE_FSM_EVENT(tipc_link_fsm); DEFINE_FSM_EVENT(tipc_node_fsm); TRACE_EVENT(tipc_l2_device_event, TP_PROTO(struct net_device *dev, struct tipc_bearer *b, unsigned long evt), TP_ARGS(dev, b, evt), TP_STRUCT__entry( __string(dev_name, dev->name) __string(b_name, b->name) __field(unsigned long, evt) __field(u8, b_up) __field(u8, carrier) __field(u8, oper) ), TP_fast_assign( __assign_str(dev_name); __assign_str(b_name); __entry->evt = evt; __entry->b_up = test_bit(0, &b->up); __entry->carrier = netif_carrier_ok(dev); __entry->oper = netif_oper_up(dev); ), TP_printk("%s on: <%s>/<%s> oper: %s carrier: %s bearer: %s\n", dev_evt_sym(__entry->evt), __get_str(dev_name), __get_str(b_name), (__entry->oper) ? "up" : "down", (__entry->carrier) ? "ok" : "notok", (__entry->b_up) ? "up" : "down") ); #endif /* _TIPC_TRACE_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
75 193 2 22 132 133 2 1 38 114 114 114 96 96 96 96 115 40 91 34 7 71 34 81 30 98 74 74 242 164 137 96 115 39 232 4380 4373 198 4381 686 128 3 125 126 4 128 90 90 86 17 90 100 80 39 110 44 15 37 13 2 70 2 8 4 4 69 57 4 54 46 10 112 113 15 100 137 59 113 5 113 112 3 2 108 7 5 4 1 3 101 100 1 100 94 6 98 99 12 69 7 73 32 1 16 16 15 36 4 8 12 25 32 25 3 22 1 22 22 22 5 17 16 16 16 8 8 5 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/mlock.c * * (C) Copyright 1995 Linus Torvalds * (C) Copyright 2002 Christoph Hellwig */ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/sched/user.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/pagewalk.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> #include <linux/export.h> #include <linux/rmap.h> #include <linux/mmzone.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/secretmem.h> #include "internal.h" struct mlock_fbatch { local_lock_t lock; struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = { .lock = INIT_LOCAL_LOCK(lock), }; bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) return true; if (capable(CAP_IPC_LOCK)) return true; return false; } EXPORT_SYMBOL(can_do_mlock); /* * Mlocked folios are marked with the PG_mlocked flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate * statistics. * * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it * will be ostensibly placed on the LRU "unevictable" list (actually no such * list exists), rather than the [in]active lists. PG_unevictable is set to * indicate the unevictable state. */ static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec) { /* There is nothing more we can do while it's off LRU */ if (!folio_test_clear_lru(folio)) return lruvec; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (unlikely(folio_evictable(folio))) { /* * This is a little surprising, but quite possible: PG_mlocked * must have got cleared already by another CPU. Could this * folio be unevictable? I'm not sure, but move it now if so. */ if (folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, folio_nr_pages(folio)); } goto out; } if (folio_test_unevictable(folio)) { if (folio_test_mlocked(folio)) folio->mlock_count++; goto out; } lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: folio_set_lru(folio); return lruvec; } static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); lruvec = folio_lruvec_relock_irq(folio, lruvec); /* As above, this is a little surprising, but possible */ if (unlikely(folio_evictable(folio))) goto out; folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: lruvec_add_folio(lruvec, folio); folio_set_lru(folio); return lruvec; } static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec) { int nr_pages = folio_nr_pages(folio); bool isolated = false; if (!folio_test_clear_lru(folio)) goto munlock; isolated = true; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (folio_test_unevictable(folio)) { /* Then mlock_count is maintained, but might undercount */ if (folio->mlock_count) folio->mlock_count--; if (folio->mlock_count) goto out; } /* else assume that was the last mlock: reclaim will fix it if not */ munlock: if (folio_test_clear_mlocked(folio)) { __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); if (isolated || !folio_test_unevictable(folio)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } /* folio_evictable() has to be checked *after* clearing Mlocked */ if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: if (isolated) folio_set_lru(folio); return lruvec; } /* * Flags held in the low bits of a struct folio pointer on the mlock_fbatch. */ #define LRU_FOLIO 0x1 #define NEW_FOLIO 0x2 static inline struct folio *mlock_lru(struct folio *folio) { return (struct folio *)((unsigned long)folio + LRU_FOLIO); } static inline struct folio *mlock_new(struct folio *folio) { return (struct folio *)((unsigned long)folio + NEW_FOLIO); } /* * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can * make use of such folio pointer flags in future, but for now just keep it for * mlock. We could use three separate folio batches instead, but one feels * better (munlocking a full folio batch does not need to drain mlocking folio * batches first). */ static void mlock_folio_batch(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; unsigned long mlock; struct folio *folio; int i; for (i = 0; i < folio_batch_count(fbatch); i++) { folio = fbatch->folios[i]; mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO); folio = (struct folio *)((unsigned long)folio - mlock); fbatch->folios[i] = folio; if (mlock & LRU_FOLIO) lruvec = __mlock_folio(folio, lruvec); else if (mlock & NEW_FOLIO) lruvec = __mlock_new_folio(folio, lruvec); else lruvec = __munlock_folio(folio, lruvec); } if (lruvec) unlock_page_lruvec_irq(lruvec); folios_put(fbatch); } void mlock_drain_local(void) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } void mlock_drain_remote(int cpu) { struct folio_batch *fbatch; WARN_ON_ONCE(cpu_online(cpu)); fbatch = &per_cpu(mlock_fbatch.fbatch, cpu); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); } bool need_mlock_drain(int cpu) { return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } /** * mlock_folio - mlock a folio already on (or temporarily off) LRU * @folio: folio to be mlocked. */ void mlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (!folio_test_set_mlocked(folio)) { int nr_pages = folio_nr_pages(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * mlock_new_folio - mlock a newly allocated folio not yet on LRU * @folio: folio to be mlocked, either normal or a THP head. */ void mlock_new_folio(struct folio *folio) { struct folio_batch *fbatch; int nr_pages = folio_nr_pages(folio); local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); folio_set_mlocked(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * munlock_folio - munlock a folio * @folio: folio to be munlocked, either normal or a THP head. */ void munlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); /* * folio_test_clear_mlocked(folio) must be left to __munlock_folio(), * which will check whether the folio is multiply mlocked. */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } static inline unsigned int folio_mlock_step(struct folio *folio, pte_t *pte, unsigned long addr, unsigned long end) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; unsigned int count = (end - addr) >> PAGE_SHIFT; pte_t ptent = ptep_get(pte); if (!folio_test_large(folio)) return 1; return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL, NULL, NULL); } static inline bool allow_mlock_munlock(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned int step) { /* * For unlock, allow munlock large folio which is partially * mapped to VMA. As it's possible that large folio is * mlocked and VMA is split later. * * During memory pressure, such kind of large folio can * be split. And the pages are not in VM_LOCKed VMA * can be reclaimed. */ if (!(vma->vm_flags & VM_LOCKED)) return true; /* folio_within_range() cannot take KSM, but any small folio is OK */ if (!folio_test_large(folio)) return true; /* folio not in range [start, end), skip mlock */ if (!folio_within_range(folio, vma, start, end)) return false; /* folio is not fully mapped, skip mlock */ if (step != folio_nr_pages(folio)) return false; return true; } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; pte_t ptent; struct folio *folio; unsigned int step = 1; unsigned long start = addr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (!pmd_present(*pmd)) goto out; if (is_huge_zero_pmd(*pmd)) goto out; folio = pmd_folio(*pmd); if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); goto out; } start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) { walk->action = ACTION_AGAIN; return 0; } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; step = folio_mlock_step(folio, pte, addr, end); if (!allow_mlock_munlock(folio, vma, start, end, step)) goto next_entry; if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); next_entry: pte += step - 1; addr += (step - 1) << PAGE_SHIFT; } pte_unmap(start_pte); out: spin_unlock(ptl); cond_resched(); return 0; } /* * mlock_vma_pages_range() - mlock any pages already in the range, * or munlock all pages in the range. * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range * @end - end of range in @vma * @newflags - the new set of flags for @vma. * * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ static void mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t newflags) { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, .walk_lock = PGWALK_WRLOCK_VERIFY, }; /* * There is a slight chance that concurrent page migration, * or page reclaim finding a page of this now-VM_LOCKED vma, * will call mlock_vma_folio() and raise page's mlock_count: * double counting, leaving the page unevictable indefinitely. * Communicate this danger to mlock_vma_folio() with VM_IO, * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. * mmap_lock is held in write mode here, so this weird * combination should not be visible to other mmap_lock users; * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. */ if (newflags & VM_LOCKED) newflags |= VM_IO; vma_start_write(vma); vm_flags_reset_once(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); lru_add_drain(); if (newflags & VM_IO) { newflags &= ~VM_IO; vm_flags_reset_once(vma, newflags); } } /* * mlock_fixup - handle mlock[all]/munlock[all] requests. * * Filters out "special" vmas -- VM_LOCKED never gets set for these, and * munlock is a no-op. However, for some special vmas, we go ahead and * populate the ptes. * * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; int nr_pages; int ret = 0; vm_flags_t oldflags = vma->vm_flags; if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; } /* * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; if (!(newflags & VM_LOCKED)) nr_pages = -nr_pages; else if (oldflags & VM_LOCKED) nr_pages = 0; mm->locked_vm += nr_pages; /* * vm_flags is protected by the mmap_lock held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ vma_start_write(vma); vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); } out: *prev = vma; return ret; } static int apply_vma_lock_flags(unsigned long start, size_t len, vm_flags_t flags) { unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; vma = vma_iter_load(&vmi); if (!vma) return -ENOMEM; prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; nstart = start; tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { int error; vm_flags_t newflags; if (vma->vm_start != tmp) return -ENOMEM; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) return error; tmp = vma_iter_end(&vmi); nstart = tmp; } if (tmp < end) return -ENOMEM; return 0; } /* * Go through vma areas and sum size of mlocked * vma pages, as return value. * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) * is also counted. * Return value: previously mlocked page counts */ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long start, size_t len) { struct vm_area_struct *vma; unsigned long count = 0; unsigned long end; VMA_ITERATOR(vmi, mm, start); /* Don't overflow past ULONG_MAX */ if (unlikely(ULONG_MAX - len < start)) end = ULONG_MAX; else end = start + len; for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); if (end < vma->vm_end) { count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; } } return count >> PAGE_SHIFT; } /* * convert get_user_pages() return value to posix mlock() error */ static int __mlock_posix_error_return(long retval) { if (retval == -EFAULT) retval = -ENOMEM; else if (retval == -ENOMEM) retval = -EAGAIN; return retval; } static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; int error = -ENOMEM; start = untagged_addr(start); if (!can_do_mlock()) return -EPERM; len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = len >> PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; locked += current->mm->locked_vm; if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { /* * It is possible that the regions requested intersect with * previously mlocked areas, that part area in "mm->locked_vm" * should not be counted to new mlock increment count. So check * and adjust locked count if necessary. */ locked -= count_mm_mlocked_page_nr(current->mm, start, len); } /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = apply_vma_lock_flags(start, len, flags); mmap_write_unlock(current->mm); if (error) return error; error = __mm_populate(start, len, 0); if (error) return __mlock_posix_error_return(error); return 0; } SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { return do_mlock(start, len, VM_LOCKED); } SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) { vm_flags_t vm_flags = VM_LOCKED; if (flags & ~MLOCK_ONFAULT) return -EINVAL; if (flags & MLOCK_ONFAULT) vm_flags |= VM_LOCKONFAULT; return do_mlock(start, len, vm_flags); } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; start = untagged_addr(start); len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_vma_lock_flags(start, len, 0); mmap_write_unlock(current->mm); return ret; } /* * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) * and translate into the appropriate modifications to mm->def_flags and/or the * flags for all current VMAs. * * There are a couple of subtleties with this. If mlockall() is called multiple * times with different flags, the values do not necessarily stack. If mlockall * is called once including the MCL_FUTURE flag and then a second time without * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. */ static int apply_mlockall_flags(int flags) { VMA_ITERATOR(vmi, current->mm, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; if (flags & MCL_ONFAULT) current->mm->def_flags |= VM_LOCKONFAULT; if (!(flags & MCL_CURRENT)) goto out; } if (flags & MCL_CURRENT) { to_add |= VM_LOCKED; if (flags & MCL_ONFAULT) to_add |= VM_LOCKONFAULT; } for_each_vma(vmi, vma) { vm_flags_t newflags; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; /* Ignore errors */ mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, newflags); cond_resched(); } out: return 0; } SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret; if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) || flags == MCL_ONFAULT) return -EINVAL; if (!can_do_mlock()) return -EPERM; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = -ENOMEM; if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = apply_mlockall_flags(flags); mmap_write_unlock(current->mm); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); return ret; } SYSCALL_DEFINE0(munlockall) { int ret; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_mlockall_flags(0); mmap_write_unlock(current->mm); return ret; } /* * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB * shm segments) get accounted against the user_struct instead. */ static DEFINE_SPINLOCK(shmlock_user_lock); int user_shm_lock(size_t size, struct ucounts *ucounts) { unsigned long lock_limit, locked; long memlock; int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; lock_limit = rlimit(RLIMIT_MEMLOCK); if (lock_limit != RLIM_INFINITY) lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); goto out; } if (!get_ucounts(ucounts)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); allowed = 0; goto out; } allowed = 1; out: spin_unlock(&shmlock_user_lock); return allowed; } void user_shm_unlock(size_t size, struct ucounts *ucounts) { spin_lock(&shmlock_user_lock); dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); spin_unlock(&shmlock_user_lock); put_ucounts(ucounts); }
29 29 29 29 29 29 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 // SPDX-License-Identifier: GPL-2.0-only /* * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong * because MTRRs can span up to 40 bits (36bits on most modern x86) */ #include <linux/export.h> #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/cc_platform.h> #include <asm/processor-flags.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> #include <asm/hypervisor.h> #include <asm/mshyperv.h> #include <asm/tlbflush.h> #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/memtype.h> #include "mtrr.h" struct fixed_range_block { int base_msr; /* start address of an MTRR block */ int ranges; /* number of MTRRs in this block */ }; static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; struct cache_map { u64 start; u64 end; u64 flags; u64 type:8; u64 fixed:1; }; bool mtrr_debug; static int __init mtrr_param_setup(char *str) { int rc = 0; if (!str) return -EINVAL; if (!strcmp(str, "debug")) mtrr_debug = true; else rc = -EINVAL; return rc; } early_param("mtrr", mtrr_param_setup); /* * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where * no 2 adjacent ranges have the same cache mode (those would be merged). * The number is based on the worst case: * - no two adjacent fixed MTRRs share the same cache mode * - one variable MTRR is spanning a huge area with mode WB * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2 * additional ranges each (result like "ababababa...aba" with a = WB, b = UC), * accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries * - a TOP_MEM2 area (even with overlapping an UC MTRR can't add 2 range entries * to the possible maximum, as it always starts at 4GB, thus it can't be in * the middle of that MTRR, unless that MTRR starts at 0, which would remove * the initial "a" from the "abababa" pattern above) * The map won't contain ranges with no matching MTRR (those fall back to the * default cache mode). */ #define CACHE_MAP_MAX (MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2) static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata; static struct cache_map *cache_map __refdata = init_cache_map; static unsigned int cache_map_size = CACHE_MAP_MAX; static unsigned int cache_map_n; static unsigned int cache_map_fixed; static unsigned long smp_changes_mask; static int mtrr_state_set; u64 mtrr_tom2; struct mtrr_state_type mtrr_state; EXPORT_SYMBOL_GPL(mtrr_state); /* Reserved bits in the high portion of the MTRRphysBaseN MSR. */ u32 phys_hi_rsvd; /* * BIOS is expected to clear MtrrFixDramModEn bit, see for example * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 3.30 February 2006), section * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set * to 1 during BIOS initialization of the fixed MTRRs, then cleared to * 0 for operation." */ static inline void k8_check_syscfg_dram_mod_en(void) { u32 lo, hi; if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0x0f))) return; if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return; rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi); } } /* Get the size of contiguous MTRR range */ static u64 get_mtrr_size(u64 mask) { u64 size; mask |= (u64)phys_hi_rsvd << 32; size = -mask; return size; } static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size) { struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg; if (!(mtrr->mask_lo & MTRR_PHYSMASK_V)) return MTRR_TYPE_INVALID; *start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK); *size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) + (mtrr->mask_lo & PAGE_MASK)); return mtrr->base_lo & MTRR_PHYSBASE_TYPE; } static u8 get_effective_type(u8 type1, u8 type2) { if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE) return MTRR_TYPE_UNCACHABLE; if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) || (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK)) return MTRR_TYPE_WRTHROUGH; if (type1 != type2) return MTRR_TYPE_UNCACHABLE; return type1; } static void rm_map_entry_at(int idx) { cache_map_n--; if (cache_map_n > idx) { memmove(cache_map + idx, cache_map + idx + 1, sizeof(*cache_map) * (cache_map_n - idx)); } } /* * Add an entry into cache_map at a specific index. Merges adjacent entries if * appropriate. Return the number of merges for correcting the scan index * (this is needed as merging will reduce the number of entries, which will * result in skipping entries in future iterations if the scan index isn't * corrected). * Note that the corrected index can never go below -1 (resulting in being 0 in * the next scan iteration), as "2" is returned only if the current index is * larger than zero. */ static int add_map_entry_at(u64 start, u64 end, u8 type, int idx) { bool merge_prev = false, merge_next = false; if (start >= end) return 0; if (idx > 0) { struct cache_map *prev = cache_map + idx - 1; if (!prev->fixed && start == prev->end && type == prev->type) merge_prev = true; } if (idx < cache_map_n) { struct cache_map *next = cache_map + idx; if (!next->fixed && end == next->start && type == next->type) merge_next = true; } if (merge_prev && merge_next) { cache_map[idx - 1].end = cache_map[idx].end; rm_map_entry_at(idx); return 2; } if (merge_prev) { cache_map[idx - 1].end = end; return 1; } if (merge_next) { cache_map[idx].start = start; return 1; } /* Sanity check: the array should NEVER be too small! */ if (cache_map_n == cache_map_size) { WARN(1, "MTRR cache mode memory map exhausted!\n"); cache_map_n = cache_map_fixed; return 0; } if (cache_map_n > idx) { memmove(cache_map + idx + 1, cache_map + idx, sizeof(*cache_map) * (cache_map_n - idx)); } cache_map[idx].start = start; cache_map[idx].end = end; cache_map[idx].type = type; cache_map[idx].fixed = 0; cache_map_n++; return 0; } /* Clear a part of an entry. Return 1 if start of entry is still valid. */ static int clr_map_range_at(u64 start, u64 end, int idx) { int ret = start != cache_map[idx].start; u64 tmp; if (start == cache_map[idx].start && end == cache_map[idx].end) { rm_map_entry_at(idx); } else if (start == cache_map[idx].start) { cache_map[idx].start = end; } else if (end == cache_map[idx].end) { cache_map[idx].end = start; } else { tmp = cache_map[idx].end; cache_map[idx].end = start; add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1); } return ret; } /* * Add MTRR to the map. The current map is scanned and each part of the MTRR * either overlapping with an existing entry or with a hole in the map is * handled separately. */ static void add_map_entry(u64 start, u64 end, u8 type) { u8 new_type, old_type; u64 tmp; int i; for (i = 0; i < cache_map_n && start < end; i++) { if (start >= cache_map[i].end) continue; if (start < cache_map[i].start) { /* Region start has no overlap. */ tmp = min(end, cache_map[i].start); i -= add_map_entry_at(start, tmp, type, i); start = tmp; continue; } new_type = get_effective_type(type, cache_map[i].type); old_type = cache_map[i].type; if (cache_map[i].fixed || new_type == old_type) { /* Cut off start of new entry. */ start = cache_map[i].end; continue; } /* Handle only overlapping part of region. */ tmp = min(end, cache_map[i].end); i += clr_map_range_at(start, tmp, i); i -= add_map_entry_at(start, tmp, new_type, i); start = tmp; } /* Add rest of region after last map entry (rest might be empty). */ add_map_entry_at(start, end, type, i); } /* Add variable MTRRs to cache map. */ static void map_add_var(void) { u64 start, size; unsigned int i; u8 type; /* * Add AMD TOP_MEM2 area. Can't be added in mtrr_build_map(), as it * needs to be added again when rebuilding the map due to potentially * having moved as a result of variable MTRRs for memory below 4GB. */ if (mtrr_tom2) { add_map_entry(BIT_ULL(32), mtrr_tom2, MTRR_TYPE_WRBACK); cache_map[cache_map_n - 1].fixed = 1; } for (i = 0; i < num_var_ranges; i++) { type = get_var_mtrr_state(i, &start, &size); if (type != MTRR_TYPE_INVALID) add_map_entry(start, start + size, type); } } /* * Rebuild map by replacing variable entries. Needs to be called when MTRR * registers are being changed after boot, as such changes could include * removals of registers, which are complicated to handle without rebuild of * the map. */ void generic_rebuild_map(void) { if (mtrr_if != &generic_mtrr_ops) return; cache_map_n = cache_map_fixed; map_add_var(); } static unsigned int __init get_cache_map_size(void) { return cache_map_fixed + 2 * num_var_ranges + (mtrr_tom2 != 0); } /* Build the cache_map containing the cache modes per memory range. */ void __init mtrr_build_map(void) { u64 start, end, size; unsigned int i; u8 type; /* Add fixed MTRRs, optimize for adjacent entries with same type. */ if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) { /* * Start with 64k size fixed entries, preset 1st one (hence the * loop below is starting with index 1). */ start = 0; end = size = 0x10000; type = mtrr_state.fixed_ranges[0]; for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) { /* 8 64k entries, then 16 16k ones, rest 4k. */ if (i == 8 || i == 24) size >>= 2; if (mtrr_state.fixed_ranges[i] != type) { add_map_entry(start, end, type); start = end; type = mtrr_state.fixed_ranges[i]; } end += size; } add_map_entry(start, end, type); } /* Mark fixed, they take precedence. */ for (i = 0; i < cache_map_n; i++) cache_map[i].fixed = 1; cache_map_fixed = cache_map_n; map_add_var(); pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n", cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed, get_cache_map_size(), num_var_ranges + (mtrr_tom2 != 0)); if (mtrr_debug) { for (i = 0; i < cache_map_n; i++) { pr_info("%3u: %016llx-%016llx %s\n", i, cache_map[i].start, cache_map[i].end - 1, mtrr_attrib_to_str(cache_map[i].type)); } } } /* Copy the cache_map from __initdata memory to dynamically allocated one. */ void __init mtrr_copy_map(void) { unsigned int new_size = get_cache_map_size(); if (!mtrr_state.enabled || !new_size) { cache_map = NULL; return; } mutex_lock(&mtrr_mutex); cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL); if (cache_map) { memmove(cache_map, init_cache_map, cache_map_n * sizeof(*cache_map)); cache_map_size = new_size; } else { mtrr_state.enabled = 0; pr_err("MTRRs disabled due to allocation failure for lookup map.\n"); } mutex_unlock(&mtrr_mutex); } /** * mtrr_overwrite_state - set static MTRR state * * Used to set MTRR state via different means (e.g. with data obtained from * a hypervisor). * Is allowed only for special cases when running virtualized. Must be called * from the x86_init.hyper.init_platform() hook. It can be called only once. * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR * is cleared. * * @var: MTRR variable range array to use * @num_var: length of the @var array * @def_type: default caching type */ void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, mtrr_type def_type) { unsigned int i; /* Only allowed to be called once before mtrr_bp_init(). */ if (WARN_ON_ONCE(mtrr_state_set)) return; /* Only allowed when running virtualized. */ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) return; /* * Only allowed for special virtualization cases: * - when running as Hyper-V, SEV-SNP guest using vTOM * - when running as Xen PV guest * - when running as SEV-SNP or TDX guest to avoid unnecessary * VMM communication/Virtualization exceptions (#VC, #VE) */ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !hv_is_isolation_supported() && !cpu_feature_enabled(X86_FEATURE_XENPV) && !cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) return; /* Disable MTRR in order to disable MTRR modifications. */ setup_clear_cpu_cap(X86_FEATURE_MTRR); if (var) { if (num_var > MTRR_MAX_VAR_RANGES) { pr_warn("Trying to overwrite MTRR state with %u variable entries\n", num_var); num_var = MTRR_MAX_VAR_RANGES; } for (i = 0; i < num_var; i++) mtrr_state.var_ranges[i] = var[i]; num_var_ranges = num_var; } mtrr_state.def_type = def_type; mtrr_state.enabled |= MTRR_STATE_MTRR_ENABLED; mtrr_state_set = 1; } static u8 type_merge(u8 type, u8 new_type, u8 *uniform) { u8 effective_type; if (type == MTRR_TYPE_INVALID) return new_type; effective_type = get_effective_type(type, new_type); if (type != effective_type) *uniform = 0; return effective_type; } /** * mtrr_type_lookup - look up memory type in MTRR * * @start: Begin of the physical address range * @end: End of the physical address range * @uniform: output argument: * - 1: the returned MTRR type is valid for the whole region * - 0: otherwise * * Return Values: * MTRR_TYPE_(type) - The effective MTRR type for the region * MTRR_TYPE_INVALID - MTRR is disabled */ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { u8 type = MTRR_TYPE_INVALID; unsigned int i; if (!mtrr_state_set) { /* Uniformity is unknown. */ *uniform = 0; return MTRR_TYPE_UNCACHABLE; } *uniform = 1; if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) return MTRR_TYPE_UNCACHABLE; for (i = 0; i < cache_map_n && start < end; i++) { /* Region after current map entry? -> continue with next one. */ if (start >= cache_map[i].end) continue; /* Start of region not covered by current map entry? */ if (start < cache_map[i].start) { /* At least some part of region has default type. */ type = type_merge(type, mtrr_state.def_type, uniform); /* End of region not covered, too? -> lookup done. */ if (end <= cache_map[i].start) return type; } /* At least part of region covered by map entry. */ type = type_merge(type, cache_map[i].type, uniform); start = cache_map[i].end; } /* End of region past last entry in map? -> use default type. */ if (start < end) type = type_merge(type, mtrr_state.def_type, uniform); return type; } /* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) { rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } /* Fill the MSR pair relating to a var range */ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) { struct mtrr_var_range *vr; vr = mtrr_state.var_ranges; vr[index].base_lo = base_lo; vr[index].base_hi = base_hi; vr[index].mask_lo = mask_lo; vr[index].mask_hi = mask_hi; } static void get_fixed_ranges(mtrr_type *frs) { unsigned int *p = (unsigned int *)frs; int i; k8_check_syscfg_dram_mod_en(); rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } void mtrr_save_fixed_ranges(void *info) { if (boot_cpu_has(X86_FEATURE_MTRR)) get_fixed_ranges(mtrr_state.fixed_ranges); } static unsigned __initdata last_fixed_start; static unsigned __initdata last_fixed_end; static mtrr_type __initdata last_fixed_type; static void __init print_fixed_last(void) { if (!last_fixed_end) return; pr_info(" %05X-%05X %s\n", last_fixed_start, last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); last_fixed_end = 0; } static void __init update_fixed_last(unsigned base, unsigned end, mtrr_type type) { last_fixed_start = base; last_fixed_end = end; last_fixed_type = type; } static void __init print_fixed(unsigned base, unsigned step, const mtrr_type *types) { unsigned i; for (i = 0; i < 8; ++i, ++types, base += step) { if (last_fixed_end == 0) { update_fixed_last(base, base + step, *types); continue; } if (last_fixed_end == base && last_fixed_type == *types) { last_fixed_end = base + step; continue; } /* new segments: gap or different type */ print_fixed_last(); update_fixed_last(base, base + step, *types); } } static void __init print_mtrr_state(void) { unsigned int i; int high_width; pr_info("MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { pr_info("MTRR fixed ranges %sabled:\n", ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); for (i = 0; i < 8; ++i) print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); /* tail */ print_fixed_last(); } pr_info("MTRR variable ranges %sabled:\n", mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & MTRR_PHYSMASK_V) pr_info(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", i, high_width, mtrr_state.var_ranges[i].base_hi, mtrr_state.var_ranges[i].base_lo >> 12, high_width, mtrr_state.var_ranges[i].mask_hi, mtrr_state.var_ranges[i].mask_lo >> 12, mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & MTRR_PHYSBASE_TYPE)); else pr_info(" %u disabled\n", i); } if (mtrr_tom2) pr_info("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; unsigned lo, dummy; unsigned int i; vrs = mtrr_state.var_ranges; rdmsr(MSR_MTRRcap, lo, dummy); mtrr_state.have_fixed = lo & MTRR_CAP_FIX; for (i = 0; i < num_var_ranges; i++) get_mtrr_var_range(i, &vrs[i]); if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); rdmsr(MSR_MTRRdefType, lo, dummy); mtrr_state.def_type = lo & MTRR_DEF_TYPE_TYPE; mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT; if (amd_special_default_mtrr()) { unsigned low, high; /* TOP_MEM2 */ rdmsr(MSR_K8_TOP_MEM2, low, high); mtrr_tom2 = high; mtrr_tom2 <<= 32; mtrr_tom2 |= low; mtrr_tom2 &= 0xffffff800000ULL; } if (mtrr_debug) print_mtrr_state(); mtrr_state_set = 1; return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } /* Some BIOS's are messed up and don't set all MTRRs the same! */ void __init mtrr_state_warn(void) { unsigned long mask = smp_changes_mask; if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); pr_info("mtrr: probably your BIOS does not setup all CPUs.\n"); pr_info("mtrr: corrected configuration.\n"); } /* * Doesn't attempt to pass an error out to MTRR users * because it's quite complicated in some cases and probably not * worth it because the best error handling is to ignore it. */ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { if (wrmsr_safe(msr, a, b) < 0) { pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); } } /** * set_fixed_range - checks & updates a fixed-range MTRR if it * differs from the value it should have * @msr: MSR address of the MTTR which should be checked and updated * @changed: pointer which indicates whether the MTRR needed to be changed * @msrwords: pointer to the MSR values which the MSR should have */ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { unsigned lo, hi; rdmsr(msr, lo, hi); if (lo != msrwords[0] || hi != msrwords[1]) { mtrr_wrmsr(msr, msrwords[0], msrwords[1]); *changed = true; } } /** * generic_get_free_region - Get a free MTRR. * @base: The starting (base) address of the region. * @size: The size (in bytes) of the region. * @replace_reg: mtrr index to be replaced; set to invalid value if none. * * Returns: The index of the region on success, else negative on error. */ int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) { unsigned long lbase, lsize; mtrr_type ltype; int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, &ltype); if (lsize == 0) return i; } return -ENOSPC; } static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { u32 mask_lo, mask_hi, base_lo, base_hi; unsigned int hi; u64 tmp, mask; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ get_cpu(); rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if (!(mask_lo & MTRR_PHYSMASK_V)) { /* Invalid (i.e. free) range */ *base = 0; *size = 0; *type = 0; goto out_put_cpu; } rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask: */ tmp = (u64)mask_hi << 32 | (mask_lo & PAGE_MASK); mask = (u64)phys_hi_rsvd << 32 | tmp; /* Expand tmp with high bits to all 1s: */ hi = fls64(tmp); if (hi > 0) { tmp |= ~((1ULL<<(hi - 1)) - 1); if (tmp != mask) { pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); mask = tmp; } } /* * This works correctly if size is a power of two, i.e. a * contiguous range: */ *size = -mask >> PAGE_SHIFT; *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *type = base_lo & MTRR_PHYSBASE_TYPE; out_put_cpu: put_cpu(); } /** * set_fixed_ranges - checks & updates the fixed-range MTRRs if they * differ from the saved set * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ static int set_fixed_ranges(mtrr_type *frs) { unsigned long long *saved = (unsigned long long *)frs; bool changed = false; int block = -1, range; k8_check_syscfg_dram_mod_en(); while (fixed_range_blocks[++block].ranges) { for (range = 0; range < fixed_range_blocks[block].ranges; range++) set_fixed_range(fixed_range_blocks[block].base_msr + range, &changed, (unsigned int *)saved++); } return changed; } /* * Set the MSR pair relating to a var range. * Returns true if changes are made. */ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; bool changed = false; rdmsr(MTRRphysBase_MSR(index), lo, hi); if ((vr->base_lo & ~MTRR_PHYSBASE_RSVD) != (lo & ~MTRR_PHYSBASE_RSVD) || (vr->base_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); changed = true; } rdmsr(MTRRphysMask_MSR(index), lo, hi); if ((vr->mask_lo & ~MTRR_PHYSMASK_RSVD) != (lo & ~MTRR_PHYSMASK_RSVD) || (vr->mask_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) { mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); changed = true; } return changed; } static u32 deftype_lo, deftype_hi; /** * set_mtrr_state - Set the MTRR state for this CPU. * * NOTE: The CPU must already be in a safe state for MTRR changes, including * measures that only a single CPU can be active in set_mtrr_state() in * order to not be subject to races for usage of deftype_lo. This is * accomplished by taking cache_disable_lock. * RETURNS: 0 if no changes made, else a mask indicating what was changed. */ static unsigned long set_mtrr_state(void) { unsigned long change_mask = 0; unsigned int i; for (i = 0; i < num_var_ranges; i++) { if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; } if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; /* * Set_mtrr_restore restores the old value of MTRRdefType, * so to set it we fiddle with the saved value: */ if ((deftype_lo & MTRR_DEF_TYPE_TYPE) != mtrr_state.def_type || ((deftype_lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT) != mtrr_state.enabled) { deftype_lo = (deftype_lo & MTRR_DEF_TYPE_DISABLE) | mtrr_state.def_type | (mtrr_state.enabled << MTRR_STATE_SHIFT); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } return change_mask; } void mtrr_disable(void) { /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & MTRR_DEF_TYPE_DISABLE, deftype_hi); } void mtrr_enable(void) { /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); } void mtrr_generic_set_state(void) { unsigned long mask, count; /* Actually set the state */ mask = set_mtrr_state(); /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof(mask) * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } } /** * generic_set_mtrr - set variable MTRR register on the local CPU. * * @reg: The register to set. * @base: The base address of the region. * @size: The size of the region. If this is 0 the region is disabled. * @type: The type of the region. * * Returns nothing. */ static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { unsigned long flags; struct mtrr_var_range *vr; vr = &mtrr_state.var_ranges[reg]; local_irq_save(flags); cache_disable(); if (size == 0) { /* * The invalid bit is kept in the mask, so we simply * clear the relevant mask register to disable a range. */ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); memset(vr, 0, sizeof(struct mtrr_var_range)); } else { vr->base_lo = base << PAGE_SHIFT | type; vr->base_hi = (base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; vr->mask_lo = -size << PAGE_SHIFT | MTRR_PHYSMASK_V; vr->mask_hi = (-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd; mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi); mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); } cache_enable(); local_irq_restore(flags); } int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { unsigned long lbase, last; /* * For Intel PPro stepping <= 7 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF */ if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } /* * Check upper bits of base and last are equal and lower bits are 0 * for base and 1 for last */ last = base + size - 1; for (lbase = base; !(lbase & 1) && (last & 1); lbase = lbase >> 1, last = last >> 1) ; if (lbase != last) { pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; } static int generic_have_wrcomb(void) { unsigned long config, dummy; rdmsr(MSR_MTRRcap, config, dummy); return config & MTRR_CAP_WC; } int positive_have_wrcomb(void) { return 1; } /* * Generic structure... */ const struct mtrr_ops generic_mtrr_ops = { .get = generic_get_mtrr, .get_free_region = generic_get_free_region, .set = generic_set_mtrr, .validate_add_page = generic_validate_add_page, .have_wrcomb = generic_have_wrcomb, };
29 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * SR-IPv6 implementation * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #ifndef _NET_SEG6_H #define _NET_SEG6_H #include <linux/net.h> #include <linux/ipv6.h> #include <linux/seg6.h> #include <linux/rhashtable-types.h> static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) { __be32 diff[] = { ~from, to }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from, __be32 *to) { __be32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], to[0], to[1], to[2], to[3], }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } struct seg6_pernet_data { struct mutex lock; struct in6_addr __rcu *tun_src; #ifdef CONFIG_IPV6_SEG6_HMAC struct rhashtable hmac_infos; #endif }; static inline struct seg6_pernet_data *seg6_pernet(struct net *net) { #if IS_ENABLED(CONFIG_IPV6) return net->ipv6.seg6_data; #else return NULL; #endif } extern int seg6_init(void); extern void seg6_exit(void); #ifdef CONFIG_IPV6_SEG6_LWTUNNEL extern int seg6_iptunnel_init(void); extern void seg6_iptunnel_exit(void); extern int seg6_local_init(void); extern void seg6_local_exit(void); #else static inline int seg6_iptunnel_init(void) { return 0; } static inline void seg6_iptunnel_exit(void) {} static inline int seg6_local_init(void) { return 0; } static inline void seg6_local_exit(void) {} #endif extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced); extern struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags); extern void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); /* If the packet which invoked an ICMP error contains an SRH return * the true destination address from within the SRH, otherwise use the * destination address in the IP header. */ static inline const struct in6_addr *seg6_get_daddr(struct sk_buff *skb, struct inet6_skb_parm *opt) { struct ipv6_sr_hdr *srh; if (opt->flags & IP6SKB_SEG6) { srh = (struct ipv6_sr_hdr *)(skb->data + opt->srhoff); return &srh->segments[0]; } return NULL; } #endif
620 294 9 614 620 620 618 294 9 615 830 483 361 800 797 28 1 452 360 795 620 617 300 617 620 300 825 2 825 828 2 828 750 751 823 827 826 443 401 18 64 604 122 177 412 104 104 312 62 1 1 1 1 1 1 1 1 1 86 87 3 84 87 14 14 64 2 62 1 58 3 56 8 8 993 936 81 80 989 496 498 497 3 941 939 776 278 940 938 282 274 7 687 539 234 688 686 12 11 2 489 6 327 725 14 14 354 515 44 313 2 11 344 363 10 353 352 1 354 3 3 3 3 3 715 790 436 363 73 718 707 12 718 57 798 824 827 80 756 826 826 1071 1071 829 816 15 829 336 826 821 15 827 829 342 359 359 21 47 44 4 4 4 2 2 4 2 2 28 12 17 27 21 2 12 16 1112 1111 1109 511 512 510 2 511 511 10 36 36 38 38 2 2 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_generic.c Generic packet scheduler routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Jamal Hadi Salim, <hadi@cyberus.ca> 990601 * - Ingress support */ #include <linux/bitops.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/if_macvlan.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> #include <net/hotdata.h> #include <trace/events/qdisc.h> #include <trace/events/net.h> #include <net/xfrm.h> /* Qdisc to use by default */ const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; EXPORT_SYMBOL(default_qdisc_ops); static void qdisc_maybe_clear_missed(struct Qdisc *q, const struct netdev_queue *txq) { clear_bit(__QDISC_STATE_MISSED, &q->state); /* Make sure the below netif_xmit_frozen_or_stopped() * checking happens after clearing STATE_MISSED. */ smp_mb__after_atomic(); /* Checking netif_xmit_frozen_or_stopped() again to * make sure STATE_MISSED is set if the STATE_MISSED * set by netif_tx_wake_queue()'s rescheduling of * net_tx_action() is cleared by the above clear_bit(). */ if (!netif_xmit_frozen_or_stopped(txq)) set_bit(__QDISC_STATE_MISSED, &q->state); else set_bit(__QDISC_STATE_DRAINING, &q->state); } /* Main transmission queue. */ /* Modifications to data participating in scheduling must be protected with * qdisc_lock(qdisc) spinlock. * * The idea is the following: * - enqueue, dequeue are serialized via qdisc root lock * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. */ #define SKB_XOFF_MAGIC ((struct sk_buff *)1UL) static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) { const struct netdev_queue *txq = q->dev_queue; spinlock_t *lock = NULL; struct sk_buff *skb; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->skb_bad_txq); if (skb) { /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->skb_bad_txq); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = SKB_XOFF_MAGIC; qdisc_maybe_clear_missed(q, txq); } } if (lock) spin_unlock(lock); return skb; } static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q) { struct sk_buff *skb = skb_peek(&q->skb_bad_txq); if (unlikely(skb)) skb = __skb_dequeue_bad_txq(q); return skb; } static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, struct sk_buff *skb) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } __skb_queue_tail(&q->skb_bad_txq, skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } if (lock) spin_unlock(lock); } static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } while (skb) { struct sk_buff *next = skb->next; __skb_queue_tail(&q->gso_skb, skb); /* it's still part of the queue */ if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_requeues_inc(q); qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { q->qstats.requeues++; qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } skb = next; } if (lock) { spin_unlock(lock); set_bit(__QDISC_STATE_MISSED, &q->state); } else { __netif_schedule(q); } } static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, int *packets) { int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; while (bytelimit > 0) { struct sk_buff *nskb = q->dequeue(q); if (!nskb) break; bytelimit -= nskb->len; /* covers GSO len */ skb->next = nskb; skb = nskb; (*packets)++; /* GSO counts as one pkt */ } skb_mark_not_on_list(skb); } /* This variant of try_bulk_dequeue_skb() makes sure * all skbs in the chain are for the same txq */ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, struct sk_buff *skb, int *packets) { int mapping = skb_get_queue_mapping(skb); struct sk_buff *nskb; int cnt = 0; do { nskb = q->dequeue(q); if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { qdisc_enqueue_skb_bad_txq(q, nskb); break; } skb->next = nskb; skb = nskb; } while (++cnt < 8); (*packets) += cnt; skb_mark_not_on_list(skb); } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets) { const struct netdev_queue *txq = q->dev_queue; struct sk_buff *skb = NULL; *packets = 1; if (unlikely(!skb_queue_empty(&q->gso_skb))) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->gso_skb); /* skb may be null if another cpu pulls gso_skb off in between * empty check and lock. */ if (!skb) { if (lock) spin_unlock(lock); goto validate; } /* skb in gso_skb were already validated */ *validate = false; if (xfrm_offload(skb)) *validate = true; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->gso_skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = NULL; qdisc_maybe_clear_missed(q, txq); } if (lock) spin_unlock(lock); goto trace; } validate: *validate = true; if ((q->flags & TCQ_F_ONETXQUEUE) && netif_xmit_frozen_or_stopped(txq)) { qdisc_maybe_clear_missed(q, txq); return skb; } skb = qdisc_dequeue_skb_bad_txq(q); if (unlikely(skb)) { if (skb == SKB_XOFF_MAGIC) return NULL; goto bulk; } skb = q->dequeue(q); if (skb) { bulk: if (qdisc_may_bulk(q)) try_bulk_dequeue_skb(q, skb, txq, packets); else try_bulk_dequeue_skb_slow(q, skb, packets); } trace: trace_qdisc_dequeue(q, txq, *packets, skb); return skb; } /* * Transmit possibly several skbs, and handle the return status as * required. Owning qdisc running bit guarantees that only one CPU * can execute this function. * * Returns to the caller: * false - hardware queue frozen backoff * true - feel free to send more pkts */ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; bool again = false; /* And release qdisc */ if (root_lock) spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) skb = validate_xmit_skb_list(skb, dev, &again); #ifdef CONFIG_XFRM_OFFLOAD if (unlikely(again)) { if (root_lock) spin_lock(root_lock); dev_requeue_skb(skb, q); return false; } #endif if (likely(skb)) { HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); else qdisc_maybe_clear_missed(q, txq); HARD_TX_UNLOCK(dev, txq); } else { if (root_lock) spin_lock(root_lock); return true; } if (root_lock) spin_lock(root_lock); if (!dev_xmit_complete(ret)) { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) net_warn_ratelimited("BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); dev_requeue_skb(skb, q); return false; } return true; } /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * * running seqcount guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * * netif_tx_lock serializes accesses to device driver. * * qdisc_lock(q) and netif_tx_lock are mutually exclusive, * if one is grabbed, another must be free. * * Note, that this procedure can be called by a watchdog timer * * Returns to the caller: * 0 - queue is empty or throttled. * >0 - queue is not empty. * */ static inline bool qdisc_restart(struct Qdisc *q, int *packets) { spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; struct sk_buff *skb; bool validate; /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) return false; if (!(q->flags & TCQ_F_NOLOCK)) root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) { int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { quota -= packets; if (quota <= 0) { if (q->flags & TCQ_F_NOLOCK) set_bit(__QDISC_STATE_MISSED, &q->state); else __netif_schedule(q); break; } } } unsigned long dev_trans_start(struct net_device *dev) { unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); unsigned long val; unsigned int i; for (i = 1; i < dev->num_tx_queues; i++) { val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start); if (val && time_after(val, res)) res = val; } return res; } EXPORT_SYMBOL(dev_trans_start); static void netif_freeze_queues(struct net_device *dev) { unsigned int i; int cpu; cpu = smp_processor_id(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* We are the only thread of execution doing a * freeze, but we have to grab the _xmit_lock in * order to synchronize with threads which are in * the ->hard_start_xmit() handler and already * checked the frozen bit. */ __netif_tx_lock(txq, cpu); set_bit(__QUEUE_STATE_FROZEN, &txq->state); __netif_tx_unlock(txq); } } void netif_tx_lock(struct net_device *dev) { spin_lock(&dev->tx_global_lock); netif_freeze_queues(dev); } EXPORT_SYMBOL(netif_tx_lock); static void netif_unfreeze_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* No need to grab the _xmit_lock here. If the * queue is not stopped for another reason, we * force a schedule. */ clear_bit(__QUEUE_STATE_FROZEN, &txq->state); netif_schedule_queue(txq); } } void netif_tx_unlock(struct net_device *dev) { netif_unfreeze_queues(dev); spin_unlock(&dev->tx_global_lock); } EXPORT_SYMBOL(netif_tx_unlock); static void dev_watchdog(struct timer_list *t) { struct net_device *dev = from_timer(dev, t, watchdog_timer); bool release = true; spin_lock(&dev->tx_global_lock); if (!qdisc_tx_is_noop(dev)) { if (netif_device_present(dev) && netif_running(dev) && netif_carrier_ok(dev)) { unsigned int timedout_ms = 0; unsigned int i; unsigned long trans_start; unsigned long oldest_start = jiffies; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); trans_start = READ_ONCE(txq->trans_start); if (!netif_xmit_stopped(txq)) continue; if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); break; } if (time_after(oldest_start, trans_start)) oldest_start = trans_start; } if (unlikely(timedout_ms)) { trace_net_dev_xmit_timeout(dev, i); netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n", raw_smp_processor_id(), i, timedout_ms); netif_freeze_queues(dev); dev->netdev_ops->ndo_tx_timeout(dev, i); netif_unfreeze_queues(dev); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(oldest_start + dev->watchdog_timeo))) release = false; } } spin_unlock(&dev->tx_global_lock); if (release) netdev_put(dev, &dev->watchdog_dev_tracker); } void __netdev_watchdog_up(struct net_device *dev) { if (dev->netdev_ops->ndo_tx_timeout) { if (dev->watchdog_timeo <= 0) dev->watchdog_timeo = 5*HZ; if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo))) netdev_hold(dev, &dev->watchdog_dev_tracker, GFP_ATOMIC); } } EXPORT_SYMBOL_GPL(__netdev_watchdog_up); static void dev_watchdog_up(struct net_device *dev) { __netdev_watchdog_up(dev); } static void dev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); if (del_timer(&dev->watchdog_timer)) netdev_put(dev, &dev->watchdog_dev_tracker); netif_tx_unlock_bh(dev); } /** * netif_carrier_on - set carrier * @dev: network device * * Device has detected acquisition of carrier. */ void netif_carrier_on(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) __netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_carrier_on); /** * netif_carrier_off - clear carrier * @dev: network device * * Device has detected loss of carrier. */ void netif_carrier_off(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } } EXPORT_SYMBOL(netif_carrier_off); /** * netif_carrier_event - report carrier state event * @dev: network device * * Device has detected a carrier event but the carrier state wasn't changed. * Use in drivers when querying carrier state asynchronously, to avoid missing * events (link flaps) if link recovers before it's queried. */ void netif_carrier_event(struct net_device *dev) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } EXPORT_SYMBOL_GPL(netif_carrier_event); /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. */ static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { dev_core_stats_tx_dropped_inc(skb->dev); __qdisc_drop(skb, to_free); return NET_XMIT_CN; } static struct sk_buff *noop_dequeue(struct Qdisc *qdisc) { return NULL; } struct Qdisc_ops noop_qdisc_ops __read_mostly = { .id = "noop", .priv_size = 0, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; static struct netdev_queue noop_netdev_queue = { RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc), RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc), }; struct Qdisc noop_qdisc = { .enqueue = noop_enqueue, .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, .prev = (struct sk_buff *)&noop_qdisc.gso_skb, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock), }, .skb_bad_txq = { .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, .owner = -1, }; EXPORT_SYMBOL(noop_qdisc); static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { /* register_qdisc() assigns a default of noop_enqueue if unset, * but __dev_queue_xmit() treats noqueue only as such * if this is NULL - so clear it here. */ qdisc->enqueue = NULL; return 0; } struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, .init = noqueue_init, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; EXPORT_SYMBOL(sch_default_prio2band); /* 3-band FIFO queue: old style, but should be a bit faster than generic prio+fifo combination. */ #define PFIFO_FAST_BANDS 3 /* * Private data for a pfifo_fast scheduler containing: * - rings for priority bands */ struct pfifo_fast_priv { struct skb_array q[PFIFO_FAST_BANDS]; }; static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, int band) { return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct skb_array *q = band2list(priv, band); unsigned int pkt_len = qdisc_pkt_len(skb); int err; err = skb_array_produce(q, skb); if (unlikely(err)) { if (qdisc_is_percpu_stats(qdisc)) return qdisc_drop_cpu(skb, qdisc, to_free); else return qdisc_drop(skb, qdisc, to_free); } qdisc_update_stats_at_enqueue(qdisc, pkt_len); return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; bool need_retry = true; int band; retry: for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); if (__skb_array_empty(q)) continue; skb = __skb_array_consume(q); } if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else if (need_retry && READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) { /* Delay clearing the STATE_MISSED here to reduce * the overhead of the second spin_trylock() in * qdisc_run_begin() and __netif_schedule() calling * in qdisc_run_end(). */ clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); /* Make sure dequeuing happens after clearing * STATE_MISSED. */ smp_mb__after_atomic(); need_retry = false; goto retry; } return skb; } static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; int band; for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); skb = __skb_array_peek(q); } return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (band = 0; band < PFIFO_FAST_BANDS; band++) { struct skb_array *q = band2list(priv, band); struct sk_buff *skb; /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; while ((skb = __skb_array_consume(q)) != NULL) kfree_skb(skb); } if (qdisc_is_percpu_stats(qdisc)) { for_each_possible_cpu(i) { struct gnet_stats_queue *q; q = per_cpu_ptr(qdisc->cpu_qstats, i); q->backlog = 0; q->qlen = 0; } } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) { struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: return -1; } static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); int prio; /* guard against zero length rings */ if (!qlen) return -EINVAL; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); int err; err = skb_array_init(q, qlen, GFP_KERNEL); if (err) return -ENOMEM; } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } static void pfifo_fast_destroy(struct Qdisc *sch) { struct pfifo_fast_priv *priv = qdisc_priv(sch); int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; /* Destroy ring but no need to kfree_skb because a call to * pfifo_fast_reset() has already done that work. */ ptr_ring_cleanup(&q->ring, NULL); } } static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, unsigned int new_len) { struct pfifo_fast_priv *priv = qdisc_priv(sch); struct skb_array *bands[PFIFO_FAST_BANDS]; int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); bands[prio] = q; } return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, GFP_KERNEL); } struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), .enqueue = pfifo_fast_enqueue, .dequeue = pfifo_fast_dequeue, .peek = pfifo_fast_peek, .init = pfifo_fast_init, .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .change_tx_queue_len = pfifo_fast_change_tx_queue_len, .owner = THIS_MODULE, .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, }; EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) { struct Qdisc *sch; unsigned int size = sizeof(*sch) + ops->priv_size; int err = -ENOBUFS; struct net_device *dev; if (!dev_queue) { NL_SET_ERR_MSG(extack, "No device queue given"); err = -EINVAL; goto errout; } dev = dev_queue->dev; sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); if (!sch) goto errout; __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); lockdep_register_key(&sch->root_lock_key); spin_lock_init(&sch->q.lock); lockdep_set_class(&sch->q.lock, &sch->root_lock_key); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!sch->cpu_bstats) goto errout1; sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!sch->cpu_qstats) { free_percpu(sch->cpu_bstats); goto errout1; } } spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); sch->ops = ops; sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); return sch; errout1: lockdep_unregister_key(&sch->root_lock_key); kfree(sch); errout: return ERR_PTR(err); } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, unsigned int parentid, struct netlink_ext_ack *extack) { struct Qdisc *sch; if (!try_module_get(ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { module_put(ops->owner); return NULL; } sch->parent = parentid; if (!ops->init || ops->init(sch, NULL, extack) == 0) { trace_qdisc_create(ops, dev_queue->dev, parentid); return sch; } qdisc_put(sch); return NULL; } EXPORT_SYMBOL(qdisc_create_dflt); /* Under qdisc_lock(qdisc) and BH! */ void qdisc_reset(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; trace_qdisc_reset(qdisc); if (ops->reset) ops->reset(qdisc); __skb_queue_purge(&qdisc->gso_skb); __skb_queue_purge(&qdisc->skb_bad_txq); qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); void qdisc_free(struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); } kfree(qdisc); } static void qdisc_free_cb(struct rcu_head *head) { struct Qdisc *q = container_of(head, struct Qdisc, rcu); qdisc_free(q); } static void __qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; struct net_device *dev = qdisc_dev(qdisc); #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->rate_est); qdisc_reset(qdisc); if (ops->destroy) ops->destroy(qdisc); lockdep_unregister_key(&qdisc->root_lock_key); module_put(ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); call_rcu(&qdisc->rcu, qdisc_free_cb); } void qdisc_destroy(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; __qdisc_destroy(qdisc); } void qdisc_put(struct Qdisc *qdisc) { if (!qdisc) return; if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); } EXPORT_SYMBOL(qdisc_put); /* Version of qdisc_put() that is called with rtnl mutex unlocked. * Intended to be used as optimization, this function only takes rtnl lock if * qdisc reference counter reached zero. */ void qdisc_put_unlocked(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_rtnl_lock(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); rtnl_unlock(); } EXPORT_SYMBOL(qdisc_put_unlocked); /* Attach toplevel qdisc to device queue. */ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc) { struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping); spinlock_t *root_lock; root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); /* ... and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); spin_unlock_bh(root_lock); return oqdisc; } EXPORT_SYMBOL(dev_graft_qdisc); static void shutdown_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); struct Qdisc *qdisc_default = _qdisc_default; if (qdisc) { rcu_assign_pointer(dev_queue->qdisc, qdisc_default); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default); qdisc_put(qdisc); } } static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; const struct Qdisc_ops *ops = default_qdisc_ops; if (dev->priv_flags & IFF_NO_QUEUE) ops = &noqueue_qdisc_ops; else if(dev->type == ARPHRD_CAN) ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); if (!qdisc) return; if (!netif_is_multiqueue(dev)) qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } static void attach_default_qdiscs(struct net_device *dev) { struct netdev_queue *txq; struct Qdisc *qdisc; txq = netdev_get_tx_queue(dev, 0); if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } qdisc = rtnl_dereference(dev->qdisc); /* Detect default qdisc setup/init failed and fallback to "noqueue" */ if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED if (qdisc != &noop_qdisc) qdisc_hash_add(qdisc, false); #endif } static void transition_one_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_need_watchdog) { struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); int *need_watchdog_p = _need_watchdog; if (!(new_qdisc->flags & TCQ_F_BUILTIN)) clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); if (need_watchdog_p) { WRITE_ONCE(dev_queue->trans_start, 0); *need_watchdog_p = 1; } } void dev_activate(struct net_device *dev) { int need_watchdog; /* No queueing discipline is attached to device; * create default one for devices, which need queueing * and noqueue_qdisc for virtual interfaces */ if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) /* Delay activation until next carrier-on event */ return; need_watchdog = 0; netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); if (dev_ingress_queue(dev)) transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); if (need_watchdog) { netif_trans_update(dev); dev_watchdog_up(dev); } } EXPORT_SYMBOL(dev_activate); static void qdisc_deactivate(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); } static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) { struct Qdisc *qdisc_default = _qdisc_default; struct Qdisc *qdisc; qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, qdisc_default); } } static void dev_reset_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; bool nolock; qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); if (!qdisc) return; nolock = qdisc->flags & TCQ_F_NOLOCK; if (nolock) spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); if (nolock) { clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); } } static bool some_qdisc_is_busy(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; spinlock_t *root_lock; struct Qdisc *q; int val; dev_queue = netdev_get_tx_queue(dev, i); q = rtnl_dereference(dev_queue->qdisc_sleeping); root_lock = qdisc_lock(q); spin_lock_bh(root_lock); val = (qdisc_is_running(q) || test_bit(__QDISC_STATE_SCHED, &q->state)); spin_unlock_bh(root_lock); if (val) return true; } return false; } /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate * * This function returns only when all outstanding transmissions * have completed, unless all devices are in dismantle phase. */ void dev_deactivate_many(struct list_head *head) { struct net_device *dev; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc); dev_watchdog_down(dev); } /* Wait for outstanding qdisc-less dev_queue_xmit calls or * outstanding qdisc enqueuing calls. * This is avoided if all devices are in dismantle phase : * Caller will call synchronize_net() for us */ synchronize_net(); list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); if (dev_ingress_queue(dev)) dev_reset_queue(dev, dev_ingress_queue(dev), NULL); } /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) { /* wait_event() would avoid this sleep-loop but would * require expensive checks in the fast paths of packet * processing which isn't worth it. */ schedule_timeout_uninterruptible(1); } } } void dev_deactivate(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_deactivate_many(&single); list_del(&single); } EXPORT_SYMBOL(dev_deactivate); static int qdisc_change_tx_queue_len(struct net_device *dev, struct netdev_queue *dev_queue) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); const struct Qdisc_ops *ops = qdisc->ops; if (ops->change_tx_queue_len) return ops->change_tx_queue_len(qdisc, dev->tx_queue_len); return 0; } void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); } void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx) { #ifdef CONFIG_NET_SCHED struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int i; for (i = new_real_tx; i < dev->real_num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); /* Only update the default qdiscs we created, * qdiscs with handles are always hashed. */ if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_del(qdisc); } for (i = dev->real_num_tx_queues; i < new_real_tx; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_add(qdisc, false); } #endif } EXPORT_SYMBOL(mq_change_real_num_tx); int dev_qdisc_change_tx_queue_len(struct net_device *dev) { bool up = dev->flags & IFF_UP; unsigned int i; int ret = 0; if (up) dev_deactivate(dev); for (i = 0; i < dev->num_tx_queues; i++) { ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); /* TODO: revert changes on a partial failure */ if (ret) break; } if (up) dev_activate(dev); return ret; } static void dev_init_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc) { struct Qdisc *qdisc = _qdisc; rcu_assign_pointer(dev_queue->qdisc, qdisc); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } void dev_init_scheduler(struct net_device *dev) { rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); qdisc_put(rtnl_dereference(dev->qdisc)); rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } /** * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division * @rate: Rate to compute reciprocal division values of * @mult: Multiplier for reciprocal division * @shift: Shift for reciprocal division * * The multiplier and shift for reciprocal division by rate are stored * in mult and shift. * * The deal here is to replace a divide by a reciprocal one * in fast path (a reciprocal divide is a multiply and a shift) * * Normal formula would be : * time_in_ns = (NSEC_PER_SEC * len) / rate_bps * * We compute mult/shift to use instead : * time_in_ns = (len * mult) >> shift; * * We try to get the highest possible mult value for accuracy, * but have to make sure no overflows will ever happen. * * reciprocal_value() is not used here it doesn't handle 64-bit values. */ static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift) { u64 factor = NSEC_PER_SEC; *mult = 1; *shift = 0; if (rate <= 0) return; for (;;) { *mult = div64_u64(factor, rate); if (*mult & (1U << 31) || factor & (1ULL << 63)) break; factor <<= 1; (*shift)++; } } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64) { memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->mpu = conf->mpu; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ratecfg_precompute); void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) { r->rate_pkts_ps = pktrate64; psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ppscfg_precompute); void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head) { /* Protected with chain0->filter_chain_lock. * Can't access chain directly because tp_head can be NULL. */ struct mini_Qdisc *miniq_old = rcu_dereference_protected(*miniqp->p_miniq, 1); struct mini_Qdisc *miniq; if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); } else { miniq = miniq_old != &miniqp->miniq1 ? &miniqp->miniq1 : &miniqp->miniq2; /* We need to make sure that readers won't see the miniq * we are about to modify. So ensure that at least one RCU * grace period has elapsed since the miniq was made * inactive. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) cond_synchronize_rcu(miniq->rcu_state); else if (!poll_state_synchronize_rcu(miniq->rcu_state)) synchronize_rcu_expedited(); miniq->filter_list = tp_head; rcu_assign_pointer(*miniqp->p_miniq, miniq); } if (miniq_old) /* This is counterpart of the rcu sync above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ miniq_old->rcu_state = start_poll_synchronize_rcu(); } EXPORT_SYMBOL(mini_qdisc_pair_swap); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block) { miniqp->miniq1.block = block; miniqp->miniq2.block = block; } EXPORT_SYMBOL(mini_qdisc_pair_block_init); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq) { miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq1.rcu_state = get_state_synchronize_rcu(); miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state; miniqp->p_miniq = p_miniq; } EXPORT_SYMBOL(mini_qdisc_pair_init);
19 19 18 18 9 9 18 18 21 21 21 21 21 10 1 4 5 2 1 2 2 2 9 1 1 10 10 23 23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 // SPDX-License-Identifier: GPL-2.0-only /* * VHT handling * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH * Copyright (C) 2018 - 2024 Intel Corporation */ #include <linux/ieee80211.h> #include <linux/export.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "rate.h" static void __check_vhtcap_disable(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap, u32 flag) { __le32 le_flag = cpu_to_le32(flag); if (sdata->u.mgd.vht_capa_mask.vht_cap_info & le_flag && !(sdata->u.mgd.vht_capa.vht_cap_info & le_flag)) vht_cap->cap &= ~flag; } void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap) { int i; u16 rxmcs_mask, rxmcs_cap, rxmcs_n, txmcs_mask, txmcs_cap, txmcs_n; if (!vht_cap->vht_supported) return; if (sdata->vif.type != NL80211_IFTYPE_STATION) return; __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_RXLDPC); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SHORT_GI_80); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SHORT_GI_160); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_TXSTBC); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN); /* Allow user to decrease AMPDU length exponent */ if (sdata->u.mgd.vht_capa_mask.vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK)) { u32 cap, n; n = le32_to_cpu(sdata->u.mgd.vht_capa.vht_cap_info) & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; n >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; cap = vht_cap->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; cap >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; if (n < cap) { vht_cap->cap &= ~IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; vht_cap->cap |= n << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; } } /* Allow the user to decrease MCSes */ rxmcs_mask = le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.rx_mcs_map); rxmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.rx_mcs_map); rxmcs_n &= rxmcs_mask; rxmcs_cap = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); txmcs_mask = le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.tx_mcs_map); txmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.tx_mcs_map); txmcs_n &= txmcs_mask; txmcs_cap = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); for (i = 0; i < 8; i++) { u8 m, n, c; m = (rxmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; n = (rxmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; c = (rxmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { rxmcs_cap &= ~(3 << 2*i); rxmcs_cap |= (rxmcs_n & (3 << 2*i)); } m = (txmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; n = (txmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; c = (txmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { txmcs_cap &= ~(3 << 2*i); txmcs_cap |= (txmcs_n & (3 << 2*i)); } } vht_cap->vht_mcs.rx_mcs_map = cpu_to_le16(rxmcs_cap); vht_cap->vht_mcs.tx_mcs_map = cpu_to_le16(txmcs_cap); } void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, const struct ieee80211_vht_cap *vht_cap_ie2, struct link_sta_info *link_sta) { struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; bool have_80mhz; u32 mpdu_len; memset(vht_cap, 0, sizeof(*vht_cap)); if (!link_sta->pub->ht_cap.ht_supported) return; if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; /* Allow VHT if at least one channel on the sband supports 80 MHz */ have_80mhz = false; for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (!have_80mhz) return; /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. Netgear R6300v2 * and others based on the BCM4360 chipset) will unset this * capability bit when operating in 20 MHz. */ vht_cap->vht_supported = true; own_cap = sband->vht_cap; /* * If user has specified capability overrides, take care * of that if the station we're setting up is the AP that * we advertised a restricted capability set to. Override * our own capabilities and then use those below. */ if (sdata->vif.type == NL80211_IFTYPE_STATION && !test_sta_flag(link_sta->sta, WLAN_STA_TDLS_PEER)) ieee80211_apply_vhtcap_overrides(sdata, &own_cap); /* take some capabilities as-is */ cap_info = le32_to_cpu(vht_cap_ie->vht_cap_info); vht_cap->cap = cap_info; vht_cap->cap &= IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_VHT_TXOP_PS | IEEE80211_VHT_CAP_HTC_VHT | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK | IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB | IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB | IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN | IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN; vht_cap->cap |= min_t(u32, cap_info & IEEE80211_VHT_CAP_MAX_MPDU_MASK, own_cap.cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK); /* and some based on our own capabilities */ switch (own_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ; break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; break; default: /* nothing */ break; } /* symmetric capabilities */ vht_cap->cap |= cap_info & own_cap.cap & (IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160); /* remaining ones */ if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE) vht_cap->cap |= cap_info & (IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE | IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK); if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE) vht_cap->cap |= cap_info & (IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK); if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE; if (own_cap.cap & IEEE80211_VHT_CAP_TXSTBC) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_RXSTBC_MASK; if (own_cap.cap & IEEE80211_VHT_CAP_RXSTBC_MASK) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_TXSTBC; /* Copy peer MCS info, the driver might need them. */ memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs, sizeof(struct ieee80211_vht_mcs_info)); /* copy EXT_NSS_BW Support value or remove the capability */ if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_VHT_EXT_NSS_BW)) vht_cap->cap |= (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); else vht_cap->vht_mcs.tx_highest &= ~cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE); /* but also restrict MCSes */ for (i = 0; i < 8; i++) { u16 own_rx, own_tx, peer_rx, peer_tx; own_rx = le16_to_cpu(own_cap.vht_mcs.rx_mcs_map); own_rx = (own_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; own_tx = le16_to_cpu(own_cap.vht_mcs.tx_mcs_map); own_tx = (own_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; peer_rx = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); peer_rx = (peer_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; peer_tx = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); peer_tx = (peer_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (peer_tx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { if (own_rx == IEEE80211_VHT_MCS_NOT_SUPPORTED) peer_tx = IEEE80211_VHT_MCS_NOT_SUPPORTED; else if (own_rx < peer_tx) peer_tx = own_rx; } if (peer_rx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { if (own_tx == IEEE80211_VHT_MCS_NOT_SUPPORTED) peer_rx = IEEE80211_VHT_MCS_NOT_SUPPORTED; else if (own_tx < peer_rx) peer_rx = own_tx; } vht_cap->vht_mcs.rx_mcs_map &= ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); vht_cap->vht_mcs.rx_mcs_map |= cpu_to_le16(peer_rx << i * 2); vht_cap->vht_mcs.tx_mcs_map &= ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2); } /* * This is a workaround for VHT-enabled STAs which break the spec * and have the VHT-MCS Rx map filled in with value 3 for all eight * spacial streams, an example is AR9462. * * As per spec, in section 22.1.1 Introduction to the VHT PHY * A VHT STA shall support at least single spactial stream VHT-MCSs * 0 to 7 (transmit and receive) in all supported channel widths. */ if (vht_cap->vht_mcs.rx_mcs_map == cpu_to_le16(0xFFFF)) { vht_cap->vht_supported = false; sdata_info(sdata, "Ignoring VHT IE from %pM (link:%pM) due to invalid rx_mcs_map\n", link_sta->sta->addr, link_sta->addr); return; } /* finally set up the bandwidth */ switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; default: link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; if (!(vht_cap->vht_mcs.tx_highest & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) break; /* * If this is non-zero, then it does support 160 MHz after all, * in one form or the other. We don't distinguish here (or even * above) between 160 and 80+80 yet. */ if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; } link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta); /* * Work around the Cisco 9115 FW 17.3 bug by taking the min of * both reported MPDU lengths. */ mpdu_len = vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK; if (vht_cap_ie2) mpdu_len = min_t(u32, mpdu_len, le32_get_bits(vht_cap_ie2->vht_cap_info, IEEE80211_VHT_CAP_MAX_MPDU_MASK)); /* * FIXME - should the amsdu len be per link? store per link * and maintain a minimum? */ switch (mpdu_len) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: default: link_sta->pub->agg.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; break; } ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); } /* FIXME: move this to some better location - parses HE/EHT now */ enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef) { unsigned int link_id = link_sta->link_id; struct ieee80211_sub_if_data *sdata = link_sta->sta->sdata; struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap; struct ieee80211_sta_eht_cap *eht_cap = &link_sta->pub->eht_cap; u32 cap_width; if (he_cap->has_he) { enum nl80211_band band; u8 info; if (chandef) { band = chandef->chan->band; } else { struct ieee80211_bss_conf *link_conf; rcu_read_lock(); link_conf = rcu_dereference(sdata->vif.link_conf[link_id]); band = link_conf->chanreq.oper.chan->band; rcu_read_unlock(); } if (eht_cap->has_eht && band == NL80211_BAND_6GHZ) { info = eht_cap->eht_cap_elem.phy_cap_info[0]; if (info & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) return IEEE80211_STA_RX_BW_320; } info = he_cap->he_cap_elem.phy_cap_info[0]; if (band == NL80211_BAND_2GHZ) { if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) return IEEE80211_STA_RX_BW_40; return IEEE80211_STA_RX_BW_20; } if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G || info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) return IEEE80211_STA_RX_BW_160; if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) return IEEE80211_STA_RX_BW_80; return IEEE80211_STA_RX_BW_20; } if (!vht_cap->vht_supported) return link_sta->pub->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ || cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return IEEE80211_STA_RX_BW_160; /* * If this is non-zero, then it does support 160 MHz after all, * in one form or the other. We don't distinguish here (or even * above) between 160 and 80+80 yet. */ if (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) return IEEE80211_STA_RX_BW_160; return IEEE80211_STA_RX_BW_80; } enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta) { struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; u32 cap_width; if (!vht_cap->vht_supported) { if (!link_sta->pub->ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; return link_sta->pub->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20; } cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) return NL80211_CHAN_WIDTH_160; else if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return NL80211_CHAN_WIDTH_80P80; return NL80211_CHAN_WIDTH_80; } enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *link_sta) { enum ieee80211_sta_rx_bandwidth cur_bw = link_sta->pub->bandwidth; struct ieee80211_sta_vht_cap *vht_cap = &link_sta->pub->vht_cap; u32 cap_width; switch (cur_bw) { case IEEE80211_STA_RX_BW_20: if (!link_sta->pub->ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; else return NL80211_CHAN_WIDTH_20; case IEEE80211_STA_RX_BW_40: return NL80211_CHAN_WIDTH_40; case IEEE80211_STA_RX_BW_80: return NL80211_CHAN_WIDTH_80; case IEEE80211_STA_RX_BW_160: cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) return NL80211_CHAN_WIDTH_160; return NL80211_CHAN_WIDTH_80P80; default: return NL80211_CHAN_WIDTH_20; } } enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) { switch (width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: return IEEE80211_STA_RX_BW_20; case NL80211_CHAN_WIDTH_40: return IEEE80211_STA_RX_BW_40; case NL80211_CHAN_WIDTH_80: return IEEE80211_STA_RX_BW_80; case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_80P80: return IEEE80211_STA_RX_BW_160; case NL80211_CHAN_WIDTH_320: return IEEE80211_STA_RX_BW_320; default: WARN_ON_ONCE(1); return IEEE80211_STA_RX_BW_20; } } /* FIXME: rename/move - this deals with everything not just VHT */ enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta, struct cfg80211_chan_def *chandef) { struct sta_info *sta = link_sta->sta; enum nl80211_chan_width bss_width; enum ieee80211_sta_rx_bandwidth bw; if (chandef) { bss_width = chandef->width; } else { struct ieee80211_bss_conf *link_conf; rcu_read_lock(); link_conf = rcu_dereference(sta->sdata->vif.link_conf[link_sta->link_id]); if (WARN_ON_ONCE(!link_conf)) { rcu_read_unlock(); return IEEE80211_STA_RX_BW_20; } bss_width = link_conf->chanreq.oper.width; rcu_read_unlock(); } bw = _ieee80211_sta_cap_rx_bw(link_sta, chandef); bw = min(bw, link_sta->cur_max_bandwidth); /* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of * IEEE80211-2016 specification makes higher bandwidth operation * possible on the TDLS link if the peers have wider bandwidth * capability. * * However, in this case, and only if the TDLS peer is authorized, * limit to the tdls_chandef so that the configuration here isn't * wider than what's actually requested on the channel context. */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) && test_sta_flag(sta, WLAN_STA_AUTHORIZED) && sta->tdls_chandef.chan) bw = min(bw, ieee80211_chan_width_to_rx_bw(sta->tdls_chandef.width)); else bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); return bw; } void ieee80211_sta_init_nss(struct link_sta_info *link_sta) { u8 ht_rx_nss = 0, vht_rx_nss = 0, he_rx_nss = 0, eht_rx_nss = 0, rx_nss; bool support_160; if (link_sta->pub->eht_cap.has_eht) { int i; const u8 *rx_nss_mcs = (void *)&link_sta->pub->eht_cap.eht_mcs_nss_supp; /* get the max nss for EHT over all possible bandwidths and mcs */ for (i = 0; i < sizeof(struct ieee80211_eht_mcs_nss_supp); i++) eht_rx_nss = max_t(u8, eht_rx_nss, u8_get_bits(rx_nss_mcs[i], IEEE80211_EHT_MCS_NSS_RX)); } if (link_sta->pub->he_cap.has_he) { int i; u8 rx_mcs_80 = 0, rx_mcs_160 = 0; const struct ieee80211_sta_he_cap *he_cap = &link_sta->pub->he_cap; u16 mcs_160_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_160); u16 mcs_80_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_80); for (i = 7; i >= 0; i--) { u8 mcs_160 = (mcs_160_map >> (2 * i)) & 3; if (mcs_160 != IEEE80211_HE_MCS_NOT_SUPPORTED) { rx_mcs_160 = i + 1; break; } } for (i = 7; i >= 0; i--) { u8 mcs_80 = (mcs_80_map >> (2 * i)) & 3; if (mcs_80 != IEEE80211_HE_MCS_NOT_SUPPORTED) { rx_mcs_80 = i + 1; break; } } support_160 = he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; if (support_160) he_rx_nss = min(rx_mcs_80, rx_mcs_160); else he_rx_nss = rx_mcs_80; } if (link_sta->pub->ht_cap.ht_supported) { if (link_sta->pub->ht_cap.mcs.rx_mask[0]) ht_rx_nss++; if (link_sta->pub->ht_cap.mcs.rx_mask[1]) ht_rx_nss++; if (link_sta->pub->ht_cap.mcs.rx_mask[2]) ht_rx_nss++; if (link_sta->pub->ht_cap.mcs.rx_mask[3]) ht_rx_nss++; /* FIXME: consider rx_highest? */ } if (link_sta->pub->vht_cap.vht_supported) { int i; u16 rx_mcs_map; rx_mcs_map = le16_to_cpu(link_sta->pub->vht_cap.vht_mcs.rx_mcs_map); for (i = 7; i >= 0; i--) { u8 mcs = (rx_mcs_map >> (2 * i)) & 3; if (mcs != IEEE80211_VHT_MCS_NOT_SUPPORTED) { vht_rx_nss = i + 1; break; } } /* FIXME: consider rx_highest? */ } rx_nss = max(vht_rx_nss, ht_rx_nss); rx_nss = max(he_rx_nss, rx_nss); rx_nss = max(eht_rx_nss, rx_nss); rx_nss = max_t(u8, 1, rx_nss); link_sta->capa_nss = rx_nss; /* that shouldn't be set yet, but we can handle it anyway */ if (link_sta->op_mode_nss) link_sta->pub->rx_nss = min_t(u8, rx_nss, link_sta->op_mode_nss); else link_sta->pub->rx_nss = rx_nss; } u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *link_sta, u8 opmode, enum nl80211_band band) { enum ieee80211_sta_rx_bandwidth new_bw; struct sta_opmode_info sta_opmode = {}; u32 changed = 0; u8 nss; /* ignore - no support for BF yet */ if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) return 0; nss = opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; if (link_sta->op_mode_nss != nss) { if (nss <= link_sta->capa_nss) { link_sta->op_mode_nss = nss; if (nss != link_sta->pub->rx_nss) { link_sta->pub->rx_nss = nss; changed |= IEEE80211_RC_NSS_CHANGED; sta_opmode.rx_nss = link_sta->pub->rx_nss; sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; } } else { pr_warn_ratelimited("Ignoring NSS change in VHT Operating Mode Notification from %pM with invalid nss %d", link_sta->pub->addr, nss); } } switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; else link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: /* legacy only, no longer used by newer spec */ link_sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } new_bw = ieee80211_sta_cur_vht_bw(link_sta); if (new_bw != link_sta->pub->bandwidth) { link_sta->pub->bandwidth = new_bw; sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(link_sta); changed |= IEEE80211_RC_BW_CHANGED; sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED; } if (sta_opmode.changed) cfg80211_sta_opmode_change_notify(sdata->dev, link_sta->addr, &sta_opmode, GFP_KERNEL); return changed; } void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ieee80211_mgmt *mgmt) { struct ieee80211_bss_conf *link_conf = link->conf; if (!link_conf->mu_mimo_owner) return; if (!memcmp(mgmt->u.action.u.vht_group_notif.position, link_conf->mu_group.position, WLAN_USER_POSITION_LEN) && !memcmp(mgmt->u.action.u.vht_group_notif.membership, link_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN)) return; memcpy(link_conf->mu_group.membership, mgmt->u.action.u.vht_group_notif.membership, WLAN_MEMBERSHIP_LEN); memcpy(link_conf->mu_group.position, mgmt->u.action.u.vht_group_notif.position, WLAN_USER_POSITION_LEN); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_MU_GROUPS); } void ieee80211_update_mu_groups(struct ieee80211_vif *vif, unsigned int link_id, const u8 *membership, const u8 *position) { struct ieee80211_bss_conf *link_conf; rcu_read_lock(); link_conf = rcu_dereference(vif->link_conf[link_id]); if (!WARN_ON_ONCE(!link_conf || !link_conf->mu_mimo_owner)) { memcpy(link_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN); memcpy(link_conf->mu_group.position, position, WLAN_USER_POSITION_LEN); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct link_sta_info *link_sta, u8 opmode, enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; u32 changed = __ieee80211_vht_handle_opmode(sdata, link_sta, opmode, band); if (changed > 0) { ieee80211_recalc_min_chandef(sdata, link_sta->link_id); rate_control_rate_update(local, sband, link_sta->sta, link_sta->link_id, changed); } } void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, u16 vht_mask[NL80211_VHT_NSS_MAX]) { int i; u16 mask, cap = le16_to_cpu(vht_cap); for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { mask = (cap >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; switch (mask) { case IEEE80211_VHT_MCS_SUPPORT_0_7: vht_mask[i] = 0x00FF; break; case IEEE80211_VHT_MCS_SUPPORT_0_8: vht_mask[i] = 0x01FF; break; case IEEE80211_VHT_MCS_SUPPORT_0_9: vht_mask[i] = 0x03FF; break; case IEEE80211_VHT_MCS_NOT_SUPPORTED: default: vht_mask[i] = 0; break; } } }
1 11 11 148 124 222 11 67 2 609 31 609 19619 1421 38 625 607 609 16 429 10 10 10 291 291 291 772 753 20 752 20 20 563 22 25 26 483 988 34 49 23292 62 96 96 51 5 96 96 1061 1064 1063 49 129 788 385 91 43 40 415 149 54 295 188 137 276 19 5 2 32 404 362 478 908 98 908 1055 379 389 17806 354 12862 909 24 223 20444 5 107 203 18152 207 498 125 2 9 125 508 39 13 11 1 51 2 263 3 2 4 4 4 290 429 2090 143 430 567 288 288 283 186 359 505 4037 944 9 2030 2028 31 31 18 18 18 3418 55 423 50 569 43 199 464 25 18050 83 2 947 251 1803 15933 305 9 1246 5 4 107 21 99 225 66 24 132 300 1179 42 11 12 1235 19 849 4 434 18 1101 5 5 23 14131 14143 24 20 91 82 13 257 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the AF_INET socket handler. * * Version: @(#)sock.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche <flla@stud.uni-sb.de> * * Fixes: * Alan Cox : Volatiles in skbuff pointers. See * skbuff comments. May be overdone, * better to prove they can be removed * than the reverse. * Alan Cox : Added a zapped field for tcp to note * a socket is reset and must stay shut up * Alan Cox : New fields for options * Pauline Middelink : identd support * Alan Cox : Eliminate low level recv/recvfrom * David S. Miller : New socket lookup architecture. * Steve Whitehouse: Default routines for sock_ops * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made * protinfo be just a void pointer, as the * protocol specific parts were moved to * respective headers and ipv4/v6, etc now * use private slabcaches for its socks * Pedro Hortas : New flags field for socket options */ #ifndef _SOCK_H #define _SOCK_H #include <linux/hardirq.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/timer.h> #include <linux/cache.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* struct sk_buff */ #include <linux/mm.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/static_key.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cgroup-defs.h> #include <linux/rbtree.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> #include <linux/sockptr.h> #include <linux/indirect_call_wrapper.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/llist.h> #include <net/dst.h> #include <net/checksum.h> #include <net/tcp_states.h> #include <linux/net_tstamp.h> #include <net/l3mdev.h> #include <uapi/linux/socket.h> /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of * the other protocols. */ /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. */ typedef struct { spinlock_t slock; int owned; wait_queue_head_t wq; /* * We express the mutex-alike socket_lock semantics * to the lock validator by explicitly managing * the slock as a lock variant (in addition to * the slock itself): */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } socket_lock_t; struct sock; struct proto; struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr * @skc_rcv_saddr: Bound local IPv4 addr * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr * @skc_hash: hash value used with various protocol lookup tables * @skc_u16hashes: two u16 hash values used by UDP lookup tables * @skc_dport: placeholder for inet_dport/tw_dport * @skc_num: placeholder for inet_num/tw_num * @skc_portpair: __u32 union of @skc_dport & @skc_num * @skc_family: network address family * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket * @skc_v6_daddr: IPV6 destination address * @skc_v6_rcv_saddr: IPV6 source address * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings * @skc_listener: connection request listener socket (aka rsk_listener) * [union with @skc_flags] * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row * [union with @skc_flags] * @skc_incoming_cpu: record/match cpu processing incoming packets * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled) * [union with @skc_incoming_cpu] * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number * [union with @skc_incoming_cpu] * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header * for struct sock and struct inet_timewait_sock. */ struct sock_common { union { __addrpair skc_addrpair; struct { __be32 skc_daddr; __be32 skc_rcv_saddr; }; }; union { unsigned int skc_hash; __u16 skc_u16hashes[2]; }; /* skc_dport && skc_num must be grouped as well */ union { __portpair skc_portpair; struct { __be16 skc_dport; __u16 skc_num; }; }; unsigned short skc_family; volatile unsigned char skc_state; unsigned char skc_reuse:4; unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; struct hlist_node skc_portaddr_node; }; struct proto *skc_prot; possible_net_t skc_net; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr skc_v6_daddr; struct in6_addr skc_v6_rcv_saddr; #endif atomic64_t skc_cookie; /* following fields are padding to force * offset(struct sock, sk_refcnt) == 128 on 64bit arches * assuming IPV6 is enabled. We use this padding differently * for different kind of 'sockets' */ union { unsigned long skc_flags; struct sock *skc_listener; /* request_sock */ struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ }; /* * fields between dontcopy_begin/dontcopy_end * are not copied in sock_copy() */ /* private: */ int skc_dontcopy_begin[0]; /* public: */ union { struct hlist_node skc_node; struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING unsigned short skc_rx_queue_mapping; #endif union { int skc_incoming_cpu; u32 skc_rcv_wnd; u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */ }; refcount_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; union { u32 skc_rxhash; u32 skc_window_clamp; u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */ }; /* public: */ }; struct bpf_local_storage; struct sk_filter; /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_kern_sock: True if sock is using kernel lock classes * @sk_rcvbuf: size of receive buffer in bytes * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_rx_dst_ifindex: ifindex for @sk_rx_dst * @sk_rx_dst_cookie: cookie for @sk_rx_dst * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags * @sk_write_queue: Packet sending queue * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward * @sk_reserved_mem: space reserved and non-reclaimable for the socket * @sk_napi_id: id of the last napi context to receive data for sk * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden. * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, * IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family * @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred * @sk_peer_pid: &struct pid for this socket's peer * @sk_peer_cred: %SO_PEERCRED setting * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start * @sk_disconnects: number of disconnect operations performed on this sock * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog * @sk_validate_xmit_skb: ptr to an optional validate function * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference */ struct sock { /* * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; #define sk_node __sk_common.skc_node #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING #define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping #endif #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end #define sk_hash __sk_common.skc_hash #define sk_portpair __sk_common.skc_portpair #define sk_num __sk_common.skc_num #define sk_dport __sk_common.skc_dport #define sk_addrpair __sk_common.skc_addrpair #define sk_daddr __sk_common.skc_daddr #define sk_rcv_saddr __sk_common.skc_rcv_saddr #define sk_family __sk_common.skc_family #define sk_state __sk_common.skc_state #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_cookie __sk_common.skc_cookie #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash __cacheline_group_begin(sock_write_rx); atomic_t sk_drops; __s32 sk_peek_off; struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with * the per-socket spinlock held and requires low latency * access. Therefore we special case it's implementation. * Note : rmem_alloc is in this structure to fill a hole * on 64bit arches, not because its logically part of * backlog. */ struct { atomic_t rmem_alloc; int len; struct sk_buff *head; struct sk_buff *tail; } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc __cacheline_group_end(sock_write_rx); __cacheline_group_begin(sock_read_rx); /* early demux fields */ struct dst_entry __rcu *sk_rx_dst; int sk_rx_dst_ifindex; u32 sk_rx_dst_cookie; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_ll_usec; unsigned int sk_napi_id; u16 sk_busy_poll_budget; u8 sk_prefer_busy_poll; #endif u8 sk_userlocks; int sk_rcvbuf; struct sk_filter __rcu *sk_filter; union { struct socket_wq __rcu *sk_wq; /* private: */ struct socket_wq *sk_wq_raw; /* public: */ }; void (*sk_data_ready)(struct sock *sk); long sk_rcvtimeo; int sk_rcvlowat; __cacheline_group_end(sock_read_rx); __cacheline_group_begin(sock_read_rxtx); int sk_err; struct socket *sk_socket; struct mem_cgroup *sk_memcg; #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); socket_lock_t sk_lock; u32 sk_reserved_mem; int sk_forward_alloc; u32 sk_tsflags; __cacheline_group_end(sock_write_rxtx); __cacheline_group_begin(sock_write_tx); int sk_write_pending; atomic_t sk_omem_alloc; int sk_sndbuf; int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; union { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; u32 sk_dst_pending_confirm; u32 sk_pacing_status; /* see enum sk_pacing */ struct page_frag sk_frag; struct timer_list sk_timer; unsigned long sk_pacing_rate; /* bytes per second */ atomic_t sk_zckey; atomic_t sk_tskey; __cacheline_group_end(sock_write_tx); __cacheline_group_begin(sock_read_tx); unsigned long sk_max_pacing_rate; long sk_sndtimeo; u32 sk_priority; u32 sk_mark; struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, struct net_device *dev, struct sk_buff *skb); #endif u16 sk_gso_type; u16 sk_gso_max_segs; unsigned int sk_gso_max_size; gfp_t sk_allocation; u32 sk_txhash; u8 sk_pacing_shift; bool sk_use_task_frag; __cacheline_group_end(sock_read_tx); /* * Because of non atomicity rules, all * changes are protected by socket lock. */ u8 sk_gso_disabled : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1; u8 sk_shutdown; u16 sk_type; u16 sk_protocol; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; ktime_t sk_stamp; #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif int sk_disconnects; u8 sk_txrehash; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; void *sk_user_data; #ifdef CONFIG_SECURITY void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; void (*sk_state_change)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; }; struct sock_bh_locked { struct sock *sock; local_lock_t bh_lock; }; enum sk_pacing { SK_PACING_NONE = 0, SK_PACING_NEEDED = 1, SK_PACING_FQ = 2, }; /* flag bits in sk_user_data * * - SK_USER_DATA_NOCOPY: Pointer stored in sk_user_data might * not be suitable for copying when cloning the socket. For instance, * it can point to a reference counted object. sk_user_data bottom * bit is set if pointer must not be copied. * * - SK_USER_DATA_BPF: Mark whether sk_user_data field is * managed/owned by a BPF reuseport array. This bit should be set * when sk_user_data's sk is added to the bpf's reuseport_array. * * - SK_USER_DATA_PSOCK: Mark whether pointer stored in * sk_user_data points to psock type. This bit should be set * when sk_user_data is assigned to a psock object. */ #define SK_USER_DATA_NOCOPY 1UL #define SK_USER_DATA_BPF 2UL #define SK_USER_DATA_PSOCK 4UL #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\ SK_USER_DATA_PSOCK) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied * @sk: socket */ static inline bool sk_user_data_is_nocopy(const struct sock *sk) { return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); } #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) /** * __locked_read_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits * * The caller must be holding sk->sk_callback_lock. */ static inline void * __locked_read_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference_check(__sk_user_data(sk), lockdep_is_held(&sk->sk_callback_lock)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } /** * __rcu_dereference_sk_user_data_with_flags - return the pointer * only if argument flags all has been set in sk_user_data. Otherwise * return NULL * * @sk: socket * @flags: flag bits */ static inline void * __rcu_dereference_sk_user_data_with_flags(const struct sock *sk, uintptr_t flags) { uintptr_t sk_user_data = (uintptr_t)rcu_dereference(__sk_user_data(sk)); WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK); if ((sk_user_data & flags) == flags) return (void *)(sk_user_data & SK_USER_DATA_PTRMASK); return NULL; } #define rcu_dereference_sk_user_data(sk) \ __rcu_dereference_sk_user_data_with_flags(sk, 0) #define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags) \ ({ \ uintptr_t __tmp1 = (uintptr_t)(ptr), \ __tmp2 = (uintptr_t)(flags); \ WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK); \ WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), \ __tmp1 | __tmp2); \ }) #define rcu_assign_sk_user_data(sk, ptr) \ __rcu_assign_sk_user_data_with_flags(sk, ptr, 0) static inline struct net *sock_net(const struct sock *sk) { return read_pnet(&sk->sk_net); } static inline void sock_net_set(struct sock *sk, struct net *net) { write_pnet(&sk->sk_net, net); } /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE * on a socket means that the socket will reuse everybody else's port * without looking at the other's sk_reuse value. */ #define SK_NO_REUSE 0 #define SK_CAN_REUSE 1 #define SK_FORCE_REUSE 2 int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(const struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { return READ_ONCE(sk->sk_peek_off); } return 0; } static inline void sk_peek_offset_bwd(struct sock *sk, int val) { s32 off = READ_ONCE(sk->sk_peek_off); if (unlikely(off >= 0)) { off = max_t(s32, off - val, 0); WRITE_ONCE(sk->sk_peek_off, off); } } static inline void sk_peek_offset_fwd(struct sock *sk, int val) { sk_peek_offset_bwd(sk, -val); } /* * Hashed lists helper routines */ static inline struct sock *sk_entry(const struct hlist_node *node) { return hlist_entry(node, struct sock, sk_node); } static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); } static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); } static inline struct sock *sk_next(const struct sock *sk) { return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node); } static inline struct sock *sk_nulls_next(const struct sock *sk) { return (!is_a_nulls(sk->sk_nulls_node.next)) ? hlist_nulls_entry(sk->sk_nulls_node.next, struct sock, sk_nulls_node) : NULL; } static inline bool sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } static inline bool sk_hashed(const struct sock *sk) { return !sk_unhashed(sk); } static inline void sk_node_init(struct hlist_node *node) { node->pprev = NULL; } static inline void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); } /* NB: equivalent to hlist_del_init_rcu */ static inline bool __sk_del_node_init(struct sock *sk) { if (sk_hashed(sk)) { __sk_del_node(sk); sk_node_init(&sk->sk_node); return true; } return false; } /* Grab socket reference count. This operation is valid only when sk is ALREADY grabbed f.e. it is found in hash table or a list and the lookup is made under lock preventing hash table modifications. */ static __always_inline void sock_hold(struct sock *sk) { refcount_inc(&sk->sk_refcnt); } /* Ungrab socket in the context, which assumes that socket refcnt cannot hit zero, f.e. it is true in context of any socketcall. */ static __always_inline void __sock_put(struct sock *sk) { refcount_dec(&sk->sk_refcnt); } static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk) { if (sk_hashed(sk)) { hlist_nulls_del_init_rcu(&sk->sk_nulls_node); return true; } return false; } static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); } static inline void sk_add_node(struct sock *sk, struct hlist_head *list) { sock_hold(sk); __sk_add_node(sk, list); } static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&sk->sk_node, list); else hlist_add_head_rcu(&sk->sk_node, list); } static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); hlist_add_tail_rcu(&sk->sk_node, list); } static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); __sk_nulls_add_node_rcu(sk, list); } static inline void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); } static inline void sk_add_bind_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_bind_node, list); } #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, sk_node) #define sk_nulls_for_each(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) #define sk_nulls_for_each_rcu(__sk, node, list) \ hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk) \ hlist_for_each_entry_from(__sk, sk_node) #define sk_nulls_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @offset: offset of hlist_node within the struct. * */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(const struct sock *sk) { /* Careful only use this in a context where these parameters * can not change and must all be valid, such as recvmsg from * userspace. */ return sk->sk_socket->file->f_cred->user_ns; } /* Sock flags */ enum sock_flags { SOCK_DEAD, SOCK_DONE, SOCK_URGINLINE, SOCK_KEEPOPEN, SOCK_LINGER, SOCK_DESTROY, SOCK_BROADCAST, SOCK_TIMESTAMP, SOCK_ZAPPED, SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ SOCK_DBG, /* %SO_DEBUG setting */ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ SOCK_FASYNC, /* fasync() active */ SOCK_RXQ_OVFL, SOCK_ZEROCOPY, /* buffers from userspace */ SOCK_WIFI_STATUS, /* push wifi status to userspace */ SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS. * Will use last 4 bytes of packet sent from * user-space instead. */ SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { nsk->sk_flags = osk->sk_flags; } static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) { __set_bit(flag, &sk->sk_flags); } static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) { __clear_bit(flag, &sk->sk_flags); } static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, int valbool) { if (valbool) sock_set_flag(sk, bit); else sock_reset_flag(sk, bit); } static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) { return test_bit(flag, &sk->sk_flags); } #ifdef CONFIG_NET DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); static inline int sk_memalloc_socks(void) { return static_branch_unlikely(&memalloc_socks_key); } void __receive_sock(struct file *file); #else static inline int sk_memalloc_socks(void) { return 0; } static inline void __receive_sock(struct file *file) { } #endif static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) { return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } /* Note: If you think the test should be: * return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog); * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.") */ static inline bool sk_acceptq_is_full(const struct sock *sk) { return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* * Compute minimal free write space needed to queue new packets. */ static inline int sk_stream_min_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_wmem_queued) >> 1; } static inline int sk_stream_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); } static inline void sk_wmem_queued_add(struct sock *sk, int val) { WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } static inline void sk_forward_alloc_add(struct sock *sk, int val) { /* Paired with lockless reads of sk->sk_forward_alloc */ WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val); } void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ skb_dst_force(skb); if (!sk->sk_backlog.tail) WRITE_ONCE(sk->sk_backlog.head, skb); else sk->sk_backlog.tail->next = skb; WRITE_ONCE(sk->sk_backlog.tail, skb); skb->next = NULL; } /* * Take into account size of receive queue and backlog queue * Do not take into account this skb truesize, * to allow even a single big packet to come. */ static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit) { unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); return qsize > limit; } /* The per-socket spinlock must be held here. */ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb, unsigned int limit) { if (sk_rcvqueues_full(sk, limit)) return -ENOBUFS; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; __sk_add_backlog(sk, skb); sk->sk_backlog.len += skb->truesize; return 0; } int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)); static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb); return INDIRECT_CALL_INET(sk->sk_backlog_rcv, tcp_v6_do_rcv, tcp_v4_do_rcv, sk, skb); } static inline void sk_incoming_cpu_update(struct sock *sk) { int cpu = raw_smp_processor_id(); if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu)) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_RPS /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in sock_rps_record_flow(). */ if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash)) WRITE_ONCE(sk->sk_rxhash, skb->hash); #endif } static inline void sock_rps_reset_rxhash(struct sock *sk) { #ifdef CONFIG_RPS /* Paired with READ_ONCE() in sock_rps_record_flow() */ WRITE_ONCE(sk->sk_rxhash, 0); #endif } #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc, __dis = __sk->sk_disconnects; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ *(__timeo) = wait_woken(__wait, \ TASK_INTERRUPTIBLE, \ *(__timeo)); \ } \ sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __dis == __sk->sk_disconnects ? __condition : -EPIPE; \ __rc; \ }) int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); int sk_stream_error(struct sock *sk, int flags, int err); void sk_stream_kill_queues(struct sock *sk); void sk_set_memalloc(struct sock *sk); void sk_clear_memalloc(struct sock *sk); void __sk_flush_backlog(struct sock *sk); static inline bool sk_flush_backlog(struct sock *sk) { if (unlikely(READ_ONCE(sk->sk_backlog.tail))) { __sk_flush_backlog(sk); return true; } return false; } int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb); struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) { if (offsetof(struct sock, sk_node.next) != 0) memset(sk, 0, offsetof(struct sock, sk_node.next)); memset(&sk->sk_node.pprev, 0, size - offsetof(struct sock, sk_node.pprev)); } struct proto_accept_arg { int flags; int err; int is_empty; bool kern; }; /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface */ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); struct sock * (*accept)(struct sock *sk, struct proto_accept_arg *arg); int (*ioctl)(struct sock *sk, int cmd, int *karg); int (*init)(struct sock *sk); void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); void (*keepalive)(struct sock *sk, int valbool); #ifdef CONFIG_COMPAT int (*compat_ioctl)(struct sock *sk, unsigned int cmd, unsigned long arg); #endif int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*bind_add)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); bool (*bpf_bypass_getsockopt)(int level, int optname); void (*release_cb)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); void (*put_port)(struct sock *sk); #ifdef CONFIG_BPF_SYSCALL int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); #endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS unsigned int inuse_idx; #endif #if IS_ENABLED(CONFIG_MPTCP) int (*forward_alloc_get)(const struct sock *sk); #endif bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ int __percpu *per_cpu_fw_alloc; struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ unsigned long *memory_pressure; long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; u32 sysctl_wmem_offset; u32 sysctl_rmem_offset; int max_header; bool no_autobind; struct kmem_cache *slab; unsigned int obj_size; unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ unsigned int __percpu *orphan_count; struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; union { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; struct smc_hashinfo *smc_hash; } h; struct module *owner; char name[32]; struct list_head node; int (*diag_destroy)(struct sock *sk, int err); } __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); int sock_load_diag_module(int family, int protocol); INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); static inline int sk_forward_alloc_get(const struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP) if (sk->sk_prot->forward_alloc_get) return sk->sk_prot->forward_alloc_get(sk); #endif return READ_ONCE(sk->sk_forward_alloc); } static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free, tcp_stream_memory_free, sk, wake) : true; } static inline bool sk_stream_memory_free(const struct sock *sk) { return __sk_stream_memory_free(sk, 0); } static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) { return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && __sk_stream_memory_free(sk, wake); } static inline bool sk_stream_is_writeable(const struct sock *sk) { return __sk_stream_is_writeable(sk, 0); } static inline int sk_under_cgroup_hierarchy(struct sock *sk, struct cgroup *ancestor) { #ifdef CONFIG_SOCK_CGROUP_DATA return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), ancestor); #else return -ENOTSUPP; #endif } #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline void sk_sockets_allocated_inc(struct sock *sk) { percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1, SK_ALLOC_PERCPU_COUNTER_BATCH); } static inline u64 sk_sockets_allocated_read_positive(struct sock *sk) { return percpu_counter_read_positive(sk->sk_prot->sockets_allocated); } static inline int proto_sockets_allocated_sum_positive(struct proto *prot) { return percpu_counter_sum_positive(prot->sockets_allocated); } #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { int all; int val[PROTO_INUSE_NR]; }; static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } static inline void sock_inuse_add(const struct net *net, int val) { this_cpu_add(net->core.prot_inuse->all, val); } int sock_prot_inuse_get(struct net *net, struct proto *proto); int sock_inuse_get(struct net *net); #else static inline void sock_prot_inuse_add(const struct net *net, const struct proto *prot, int val) { } static inline void sock_inuse_add(const struct net *net, int val) { } #endif /* With per-bucket locks this operation is not-atomic, so that * this version is not worse. */ static inline int __sk_prot_rehash(struct sock *sk) { sk->sk_prot->unhash(sk); return sk->sk_prot->hash(sk); } /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) /* Sockets 0-1023 can't be bound to unless you are superuser */ #define PROT_SOCK 1024 #define SHUTDOWN_MASK 3 #define RCV_SHUTDOWN 1 #define SEND_SHUTDOWN 2 #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 struct socket_alloc { struct socket socket; struct inode vfs_inode; }; static inline struct socket *SOCKET_I(struct inode *inode) { return &container_of(inode, struct socket_alloc, vfs_inode)->socket; } static inline struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; } /* * Functions for memory accounting */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind); int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); #define SK_MEM_SEND 0 #define SK_MEM_RECV 1 /* sysctl_mem values are in pages */ static inline long sk_prot_mem_limits(const struct sock *sk, int index) { return READ_ONCE(sk->sk_prot->sysctl_mem[index]); } static inline int sk_mem_pages(int amt) { return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT; } static inline bool sk_has_account(struct sock *sk) { /* return true if protocol supports memory accounting */ return !!sk->sk_prot->memory_allocated; } static inline bool sk_wmem_schedule(struct sock *sk, int size) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND); } static inline bool sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) { int delta; if (!sk_has_account(sk)) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || skb_pfmemalloc(skb); } static inline int sk_unused_reserved_mem(const struct sock *sk) { int unused_mem; if (likely(!sk->sk_reserved_mem)) return 0; unused_mem = sk->sk_reserved_mem - sk->sk_wmem_queued - atomic_read(&sk->sk_rmem_alloc); return unused_mem > 0 ? unused_mem : 0; } static inline void sk_mem_reclaim(struct sock *sk) { int reclaimable; if (!sk_has_account(sk)) return; reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk); if (reclaimable >= (int)PAGE_SIZE) __sk_mem_reclaim(sk, reclaimable); } static inline void sk_mem_reclaim_final(struct sock *sk) { sk->sk_reserved_mem = 0; sk_mem_reclaim(sk); } static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, -size); } static inline void sk_mem_uncharge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk_forward_alloc_add(sk, size); sk_mem_reclaim(sk); } /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. * * Mark both the sk_lock and the sk_lock.slock as a * per-address-family lock class. */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) static inline bool lockdep_sock_is_held(const struct sock *sk) { return lockdep_is_held(&sk->sk_lock) || lockdep_is_held(&sk->sk_lock.slock); } void lock_sock_nested(struct sock *sk, int subclass); static inline void lock_sock(struct sock *sk) { lock_sock_nested(sk, 0); } void __lock_sock(struct sock *sk); void __release_sock(struct sock *sk); void release_sock(struct sock *sk); /* BH context may only use the following locking interface. */ #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) #define bh_lock_sock_nested(__sk) \ spin_lock_nested(&((__sk)->sk_lock.slock), \ SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock); /** * lock_sock_fast - fast version of lock_sock * @sk: socket * * This version should be used for very small section, where process wont block * return false if fast path is taken: * * sk_lock.slock locked, owned = 0, BH disabled * * return true if slow path is taken: * * sk_lock.slock unlocked, owned = 1, BH enabled */ static inline bool lock_sock_fast(struct sock *sk) { /* The sk_lock has mutex_lock() semantics here. */ mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); return __lock_sock_fast(sk); } /* fast socket lock variant for caller already holding a [different] socket lock */ static inline bool lock_sock_fast_nested(struct sock *sk) { mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_); return __lock_sock_fast(sk); } /** * unlock_sock_fast - complement of lock_sock_fast * @sk: socket * @slow: slow mode * * fast unlock socket for user context. * If slow mode is on, we call regular release_sock() */ static inline void unlock_sock_fast(struct sock *sk, bool slow) __releases(&sk->sk_lock.slock) { if (slow) { release_sock(sk); __release(&sk->sk_lock.slock); } else { mutex_release(&sk->sk_lock.dep_map, _RET_IP_); spin_unlock_bh(&sk->sk_lock.slock); } } void sockopt_lock_sock(struct sock *sk); void sockopt_release_sock(struct sock *sk); bool sockopt_ns_capable(struct user_namespace *ns, int cap); bool sockopt_capable(int cap); /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming * packets, so that we won't get any new data or any * packets that change the state of the socket. * * While locked, BH processing will add new packets to * the backlog queue. This queue is processed by the * owner of the socket lock right before it is released. * * Since ~2.3.5 it is also exclusive sleep lock serializing * accesses from user process context. */ static inline void sock_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks); #endif } static inline void sock_not_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks); #endif } static inline bool sock_owned_by_user(const struct sock *sk) { sock_owned_by_me(sk); return sk->sk_lock.owned; } static inline bool sock_owned_by_user_nocheck(const struct sock *sk) { return sk->sk_lock.owned; } static inline void sock_release_ownership(struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk)); sk->sk_lock.owned = 0; /* The sk_lock has mutex_unlock() semantics: */ mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } /* no reclassification while locks are held */ static inline bool sock_allow_reclassification(const struct sock *csk) { struct sock *sk = (struct sock *)csk; return !sock_owned_by_user_nocheck(sk) && !spin_is_locked(&sk->sk_lock.slock); } struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); void __sock_wfree(struct sk_buff *skb); void sock_wfree(struct sk_buff *skb); struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); #else #define sock_edemux sock_efree #endif int sk_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); int do_sock_setsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, int optlen); int do_sock_getsockopt(struct socket *sock, bool compat, int level, int optname, sockptr_t optval, sockptr_t optlen); int sk_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order); static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) { return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); static inline void sock_replace_proto(struct sock *sk, struct proto *proto) { if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); WRITE_ONCE(sk->sk_prot, proto); } struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags) }; } int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); /* * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); int sock_no_shutdown(struct socket *, int); int sock_no_sendmsg(struct socket *, struct msghdr *, size_t); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); /* * Functions to fill in entries in struct proto_ops when a protocol * uses the inet style. */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); void sk_common_release(struct sock *sk); /* * Default socket callbacks and setup code */ /* Initialise core socket variables using an explicit uid. */ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid); /* Initialise core socket variables. * Assumes struct socket *sock is embedded in a struct socket_alloc. */ void sock_init_data(struct socket *sock, struct sock *sk); /* * Socket reference counting postulates. * * * Each user of socket SHOULD hold a reference count. * * Each access point to socket (an hash table bucket, reference from a list, * running timer, skb in flight MUST hold a reference count. * * When reference count hits 0, it means it will never increase back. * * When reference count hits 0, it means that no references from * outside exist to this socket and current process on current CPU * is last user and may/should destroy this socket. * * sk_free is called from any context: process, BH, IRQ. When * it is called, socket has no references from outside -> sk_free * may release descendant resources allocated by the socket, but * to the time when it is called, socket is NOT referenced by any * hash tables, lists etc. * * Packets, delivered from outside (from network or from another process) * and enqueued on receive/error queues SHOULD NOT grab reference count, * when they sit in queue. Otherwise, packets will leak to hole, when * socket is looked up by one cpu and unhasing is made by another CPU. * It is true for udp/raw, netlink (leak to receive and error queues), tcp * (leak to backlog). Packet socket does all the processing inside * BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets * use separate SMP lock, so that they are prone too. */ /* Ungrab socket and destroy it, if it was the last reference. */ static inline void sock_put(struct sock *sk) { if (refcount_dec_and_test(&sk->sk_refcnt)) sk_free(sk); } /* Generic version of sock_put(), dealing with all sockets * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...) */ void sock_gen_put(struct sock *sk); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted); static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) { return __sk_receive_skb(sk, skb, nested, 1, true); } static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { /* sk_tx_queue_mapping accept only upto a 16-bit value */ if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) return; /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); } #define NO_QUEUE_MAPPING USHRT_MAX static inline void sk_tx_queue_clear(struct sock *sk) { /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); } static inline int sk_tx_queue_get(const struct sock *sk) { if (sk) { /* Paired with WRITE_ONCE() in sk_tx_queue_clear() * and sk_tx_queue_set(). */ int val = READ_ONCE(sk->sk_tx_queue_mapping); if (val != NO_QUEUE_MAPPING) return val; } return -1; } static inline void __sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb, bool force_set) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (skb_rx_queue_recorded(skb)) { u16 rx_queue = skb_get_rx_queue(skb); if (force_set || unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue)) WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue); } #endif } static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, true); } static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb) { __sk_rx_queue_set(sk, skb, false); } static inline void sk_rx_queue_clear(struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING WRITE_ONCE(sk->sk_rx_queue_mapping, NO_QUEUE_MAPPING); #endif } static inline int sk_rx_queue_get(const struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (sk) { int res = READ_ONCE(sk->sk_rx_queue_mapping); if (res != NO_QUEUE_MAPPING) return res; } #endif return -1; } static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk->sk_socket = sock; } static inline wait_queue_head_t *sk_sleep(struct sock *sk) { BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); return &rcu_dereference_raw(sk->sk_wq)->wait; } /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, * we do not release it in this function, because protocol * probably wants some additional cleanups or even continuing * to work with this socket (TCP). */ static inline void sock_orphan(struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; write_unlock_bh(&sk->sk_callback_lock); } static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } kuid_t sock_i_uid(struct sock *sk); unsigned long __sock_i_ino(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) { u32 v = get_random_u32(); return v ?: 1; } static inline void sk_set_txhash(struct sock *sk) { /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } static inline bool sk_rethink_txhash(struct sock *sk) { if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } return false; } static inline struct dst_entry * __sk_dst_get(const struct sock *sk) { return rcu_dereference_check(sk->sk_dst_cache, lockdep_sock_is_held(sk)); } static inline struct dst_entry * sk_dst_get(const struct sock *sk) { struct dst_entry *dst; rcu_read_lock(); dst = rcu_dereference(sk->sk_dst_cache); if (dst && !rcuref_get(&dst->__rcuref)) dst = NULL; rcu_read_unlock(); return dst; } static inline void __dst_negative_advice(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst->ops->negative_advice) dst->ops->negative_advice(sk, dst); } static inline void dst_negative_advice(struct sock *sk) { sk_rethink_txhash(sk); __dst_negative_advice(sk); } static inline void __sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = rcu_dereference_protected(sk->sk_dst_cache, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } static inline void sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); WRITE_ONCE(sk->sk_dst_pending_confirm, 0); old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst))); dst_release(old_dst); } static inline void __sk_dst_reset(struct sock *sk) { __sk_dst_set(sk, NULL); } static inline void sk_dst_reset(struct sock *sk) { sk_dst_set(sk, NULL); } struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); static inline void sk_dst_confirm(struct sock *sk) { if (!READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 1); } static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) { if (skb_get_dst_pending_confirm(skb)) { struct sock *sk = skb->sk; if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 0); neigh_confirm(n); } } bool sk_mc_loop(const struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst); static inline void sk_gso_disable(struct sock *sk) { sk->sk_gso_disabled = 1; sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, char *to, int copy, int offset) { if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, copy, &csum, from)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, offset); } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) return -EFAULT; return 0; } static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, int copy) { int err, offset = skb->len; err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), copy, offset); if (err) __skb_trim(skb, offset); return err; } static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from, struct sk_buff *skb, struct page *page, int off, int copy) { int err; err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, copy, skb->len); if (err) return err; skb_len_add(skb, copy); sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); return 0; } /** * sk_wmem_alloc_get - returns write allocations * @sk: socket * * Return: sk_wmem_alloc minus initial offset of one */ static inline int sk_wmem_alloc_get(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) - 1; } /** * sk_rmem_alloc_get - returns read allocations * @sk: socket * * Return: sk_rmem_alloc */ static inline int sk_rmem_alloc_get(const struct sock *sk) { return atomic_read(&sk->sk_rmem_alloc); } /** * sk_has_allocations - check if allocations are outstanding * @sk: socket * * Return: true if socket has write or read allocations */ static inline bool sk_has_allocations(const struct sock *sk) { return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); } /** * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * * Return: true if socket_wq has waiting processes * * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider following tcp code paths:: * * CPU1 CPU2 * sys_select receive packet * ... ... * __add_wait_queue update tp->rcv_nxt * ... ... * tp->rcv_nxt check sock_def_readable * ... { * schedule rcu_read_lock(); * wq = rcu_dereference(sk->sk_wq); * if (wq && waitqueue_active(&wq->wait)) * wake_up_interruptible(&wq->wait) * ... * } * * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 * could then endup calling schedule and sleep forever if there are no more * data on the socket. * */ static inline bool skwq_has_sleeper(struct socket_wq *wq) { return wq && wq_has_sleeper(&wq->wait); } /** * sock_poll_wait - place memory barrier behind the poll_wait call. * @filp: file * @sock: socket to wait on * @p: poll_table * * See the comments in the wq_has_sleeper function. */ static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { if (!poll_does_not_wait(p)) { poll_wait(filp, &sock->wq.wait, p); /* We need to be sure we are in sync with the * socket flags modification. * * This memory barrier is paired in the wq_has_sleeper. */ smp_mb(); } } static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) { /* This pairs with WRITE_ONCE() in sk_set_txhash() */ u32 txhash = READ_ONCE(sk->sk_txhash); if (txhash) { skb->l4_hash = 1; skb->hash = txhash; } } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk); /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in * and play with them. * * Inlined as it's very short and called for pretty much every * packet ever received. */ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) { if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb_orphan(skb); skb->destructor = sock_efree; skb->sk = sk; return true; } return false; } static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk) { skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC)); if (skb) { if (sk_rmem_schedule(sk, skb, skb->truesize)) { skb_set_owner_r(skb, sk); return skb; } __kfree_skb(skb); } return NULL; } static inline void skb_prepare_for_gro(struct sk_buff *skb) { if (skb->destructor != sock_wfree) { skb_orphan(skb); return; } skb->slow_gro = 1; } void sk_reset_timer(struct sock *sk, struct timer_list *timer, unsigned long expires); void sk_stop_timer(struct sock *sk, struct timer_list *timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return sock_queue_rcv_skb_reason(sk, skb, NULL); } int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); struct sk_buff *sock_dequeue_err_skb(struct sock *sk); /* * Recover an error report and clear atomically */ static inline int sock_error(struct sock *sk) { int err; /* Avoid an atomic operation for the common case. * This is racy since another cpu/thread can change sk_err under us. */ if (likely(data_race(!sk->sk_err))) return 0; err = xchg(&sk->sk_err, 0); return -err; } void sk_error_report(struct sock *sk); static inline unsigned long sock_wspace(struct sock *sk) { int amt = 0; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; } return amt; } /* Note: * We use sk->sk_wq_raw, from contexts knowing this * pointer is not NULL and cannot disappear/change. */ static inline void sk_set_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; clear_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_wake_async(const struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) { rcu_read_lock(); sock_wake_async(rcu_dereference(sk->sk_wq), how, band); rcu_read_unlock(); } } static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) { if (unlikely(sock_flag(sk, SOCK_FASYNC))) sock_wake_async(rcu_dereference(sk->sk_wq), how, band); } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at * minimum. */ #define TCP_SKB_MIN_TRUESIZE (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff))) #define SOCK_MIN_SNDBUF (TCP_SKB_MIN_TRUESIZE * 2) #define SOCK_MIN_RCVBUF TCP_SKB_MIN_TRUESIZE static inline void sk_stream_moderate_sndbuf(struct sock *sk) { u32 val; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); val = max_t(u32, val, sk_unused_reserved_mem(sk)); WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF)); } /** * sk_page_frag - return an appropriate page_frag * @sk: socket * * Use the per task page_frag instead of the per socket one for * optimization when we know that we're in process context and own * everything that's associated with %current. * * Both direct reclaim and page faults can nest inside other * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * when users disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { if (sk->sk_use_task_frag) return &current->task_frag; return &sk->sk_frag; } bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); } static inline gfp_t gfp_any(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline gfp_t gfp_memcg_charge(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : sk->sk_rcvtimeo; } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : sk->sk_sndtimeo; } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); return v ?: 1; } /* Alas, with timeout socket operations are not restartable. * Compare this to poll(). */ static inline int sock_intr_errno(long timeo) { return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; } struct sock_skb_cb { u32 dropcount; }; /* Store sock_skb_cb at the end of skb->cb[] so protocol families * using skb->cb[] would keep using it directly and utilize its * alignement guarantee. */ #define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ sizeof(struct sock_skb_cb))) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? atomic_read(&sk->sk_drops) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); atomic_add(segs, &sk->sk_drops); } static inline ktime_t sock_read_timestamp(struct sock *sk) { #if BITS_PER_LONG==32 unsigned int seq; ktime_t kt; do { seq = read_seqbegin(&sk->sk_stamp_seq); kt = sk->sk_stamp; } while (read_seqretry(&sk->sk_stamp_seq, seq)); return kt; #else return READ_ONCE(sk->sk_stamp); #endif } static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) { #if BITS_PER_LONG==32 write_seqlock(&sk->sk_stamp_seq); sk->sk_stamp = kt; write_sequnlock(&sk->sk_stamp_seq); #else WRITE_ONCE(sk->sk_stamp, kt); #endif } void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); u32 tsflags = READ_ONCE(sk->sk_tsflags); ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested * - software time stamp available and wanted * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb)) __sock_recv_wifi_status(msg, sk, skb); } void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); #define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ (1UL << SOCK_RCVMARK)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) if (sk->sk_flags & FLAGS_RECV_CMSGS || READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP)) sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet * @tsflags: timestamping flags to use * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __u8 *tx_flags, __u32 *tskey) { if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; } static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, __u8 *tx_flags) { _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); } static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) { _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } static inline bool sk_is_inet(const struct sock *sk) { int family = READ_ONCE(sk->sk_family); return family == AF_INET || family == AF_INET6; } static inline bool sk_is_tcp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; } static inline bool sk_is_udp(const struct sock *sk) { return sk_is_inet(sk) && sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP; } static inline bool sk_is_stream_unix(const struct sock *sk) { return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; } /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { #ifdef CONFIG_INET return skb->destructor == sock_pfree; #else return false; #endif /* CONFIG_INET */ } /* This helper checks if a socket is a full socket, * ie _not_ a timewait or request socket. */ static inline bool sk_fullsock(const struct sock *sk) { return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } static inline bool sk_is_refcounted(struct sock *sk) { /* Only full sockets have sk->sk_flags. */ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; } #endif return skb; } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ static inline bool sk_listener(const struct sock *sk) { return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap); bool sk_capable(const struct sock *sk, int cap); bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. This makes socket queueing behavior and performance * not depend upon such differences. */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern int sysctl_tstamp_allow_data; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; #define SKB_FRAG_PAGE_ORDER get_order(32768) DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) * Some wifi drivers need to tweak it to get more chunks. * They can use this helper from their ndo_start_xmit() */ static inline void sk_pacing_shift_update(struct sock *sk, int val) { if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) return; WRITE_ONCE(sk->sk_pacing_shift, val); } /* if a socket is bound to a device, check that the given device * index is either the same or that the socket is bound to an L3 * master device and the given device index is also enslaved to * that L3 master */ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) { int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); int mdif; if (!bound_dev_if || bound_dev_if == dif) return true; mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); if (mdif && mdif == bound_dev_if) return true; return false; } void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_mark(struct sock *sk, u32 val); void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval); int sock_ioctl_inout(struct sock *sk, unsigned int cmd, void __user *arg, void *karg, size_t size); int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { if (sk->sk_prot->sock_is_readable) return sk->sk_prot->sock_is_readable(sk); return false; } #endif /* _SOCK_H */
2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 /* * Parallel-port resource manager code. * * Authors: David Campbell <campbell@tirian.che.curtin.edu.au> * Tim Waugh <tim@cyberelk.demon.co.uk> * Jose Renau <renau@acm.org> * Philip Blundell <philb@gnu.org> * Andrea Arcangeli * * based on work by Grant Guenther <grant@torque.net> * and Philip Blundell * * Any part of this program may be used in documents licensed under * the GNU Free Documentation License, Version 1.1 or any later version * published by the Free Software Foundation. */ #undef PARPORT_DEBUG_SHARING /* undef for production */ #include <linux/module.h> #include <linux/string.h> #include <linux/threads.h> #include <linux/parport.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/kmod.h> #include <linux/device.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <asm/irq.h> #undef PARPORT_PARANOID #define PARPORT_DEFAULT_TIMESLICE (HZ/5) unsigned long parport_default_timeslice = PARPORT_DEFAULT_TIMESLICE; int parport_default_spintime = DEFAULT_SPIN_TIME; static LIST_HEAD(portlist); static DEFINE_SPINLOCK(parportlist_lock); /* list of all allocated ports, sorted by ->number */ static LIST_HEAD(all_ports); static DEFINE_SPINLOCK(full_list_lock); static DEFINE_MUTEX(registration_lock); /* What you can do to a port that's gone away.. */ static void dead_write_lines(struct parport *p, unsigned char b){} static unsigned char dead_read_lines(struct parport *p) { return 0; } static unsigned char dead_frob_lines(struct parport *p, unsigned char b, unsigned char c) { return 0; } static void dead_onearg(struct parport *p){} static void dead_initstate(struct pardevice *d, struct parport_state *s) { } static void dead_state(struct parport *p, struct parport_state *s) { } static size_t dead_write(struct parport *p, const void *b, size_t l, int f) { return 0; } static size_t dead_read(struct parport *p, void *b, size_t l, int f) { return 0; } static struct parport_operations dead_ops = { .write_data = dead_write_lines, /* data */ .read_data = dead_read_lines, .write_control = dead_write_lines, /* control */ .read_control = dead_read_lines, .frob_control = dead_frob_lines, .read_status = dead_read_lines, /* status */ .enable_irq = dead_onearg, /* enable_irq */ .disable_irq = dead_onearg, /* disable_irq */ .data_forward = dead_onearg, /* data_forward */ .data_reverse = dead_onearg, /* data_reverse */ .init_state = dead_initstate, /* init_state */ .save_state = dead_state, .restore_state = dead_state, .epp_write_data = dead_write, /* epp */ .epp_read_data = dead_read, .epp_write_addr = dead_write, .epp_read_addr = dead_read, .ecp_write_data = dead_write, /* ecp */ .ecp_read_data = dead_read, .ecp_write_addr = dead_write, .compat_write_data = dead_write, /* compat */ .nibble_read_data = dead_read, /* nibble */ .byte_read_data = dead_read, /* byte */ .owner = NULL, }; static struct device_type parport_device_type = { .name = "parport", }; static int is_parport(struct device *dev) { return dev->type == &parport_device_type; } static int parport_probe(struct device *dev) { struct parport_driver *drv; if (is_parport(dev)) return -ENODEV; drv = to_parport_driver(dev->driver); if (!drv->probe) { /* if driver has not defined a custom probe */ struct pardevice *par_dev = to_pardevice(dev); if (strcmp(par_dev->name, drv->name)) return -ENODEV; return 0; } /* if driver defined its own probe */ return drv->probe(to_pardevice(dev)); } static const struct bus_type parport_bus_type = { .name = "parport", .probe = parport_probe, }; int parport_bus_init(void) { return bus_register(&parport_bus_type); } void parport_bus_exit(void) { bus_unregister(&parport_bus_type); } /* * iterates through all the drivers registered with the bus and sends the port * details to the match_port callback of the driver, so that the driver can * know about the new port that just registered with the bus and decide if it * wants to use this new port. */ static int driver_check(struct device_driver *dev_drv, void *_port) { struct parport *port = _port; struct parport_driver *drv = to_parport_driver(dev_drv); if (drv->match_port) drv->match_port(port); return 0; } /* Call attach(port) for each registered driver. */ static void attach_driver_chain(struct parport *port) { /* caller has exclusive registration_lock */ /* * call the driver_check function of the drivers registered in * new device model */ bus_for_each_drv(&parport_bus_type, NULL, port, driver_check); } static int driver_detach(struct device_driver *_drv, void *_port) { struct parport *port = _port; struct parport_driver *drv = to_parport_driver(_drv); if (drv->detach) drv->detach(port); return 0; } /* Call detach(port) for each registered driver. */ static void detach_driver_chain(struct parport *port) { /* caller has exclusive registration_lock */ /* * call the detach function of the drivers registered in * new device model */ bus_for_each_drv(&parport_bus_type, NULL, port, driver_detach); } /* Ask kmod for some lowlevel drivers. */ static void get_lowlevel_driver(void) { /* * There is no actual module called this: you should set * up an alias for modutils. */ request_module("parport_lowlevel"); } /* * iterates through all the devices connected to the bus and sends the device * details to the match_port callback of the driver, so that the driver can * know what are all the ports that are connected to the bus and choose the * port to which it wants to register its device. */ static int port_check(struct device *dev, void *dev_drv) { struct parport_driver *drv = dev_drv; /* only send ports, do not send other devices connected to bus */ if (is_parport(dev)) drv->match_port(to_parport_dev(dev)); return 0; } /* * Iterates through all the devices connected to the bus and return 1 * if the device is a parallel port. */ static int port_detect(struct device *dev, void *dev_drv) { if (is_parport(dev)) return 1; return 0; } /** * __parport_register_driver - register a parallel port device driver * @drv: structure describing the driver * @owner: owner module of drv * @mod_name: module name string * * This can be called by a parallel port device driver in order * to receive notifications about ports being found in the * system, as well as ports no longer available. * * If devmodel is true then the new device model is used * for registration. * * The @drv structure is allocated by the caller and must not be * deallocated until after calling parport_unregister_driver(). * * If using the non device model: * The driver's attach() function may block. The port that * attach() is given will be valid for the duration of the * callback, but if the driver wants to take a copy of the * pointer it must call parport_get_port() to do so. Calling * parport_register_device() on that port will do this for you. * * The driver's detach() function may block. The port that * detach() is given will be valid for the duration of the * callback, but if the driver wants to take a copy of the * pointer it must call parport_get_port() to do so. * * * Returns 0 on success. The non device model will always succeeds. * but the new device model can fail and will return the error code. **/ int __parport_register_driver(struct parport_driver *drv, struct module *owner, const char *mod_name) { /* using device model */ int ret; /* initialize common driver fields */ drv->driver.name = drv->name; drv->driver.bus = &parport_bus_type; drv->driver.owner = owner; drv->driver.mod_name = mod_name; ret = driver_register(&drv->driver); if (ret) return ret; /* * check if bus has any parallel port registered, if * none is found then load the lowlevel driver. */ ret = bus_for_each_dev(&parport_bus_type, NULL, NULL, port_detect); if (!ret) get_lowlevel_driver(); mutex_lock(&registration_lock); if (drv->match_port) bus_for_each_dev(&parport_bus_type, NULL, drv, port_check); mutex_unlock(&registration_lock); return 0; } EXPORT_SYMBOL(__parport_register_driver); static int port_detach(struct device *dev, void *_drv) { struct parport_driver *drv = _drv; if (is_parport(dev) && drv->detach) drv->detach(to_parport_dev(dev)); return 0; } /** * parport_unregister_driver - deregister a parallel port device driver * @drv: structure describing the driver that was given to * parport_register_driver() * * This should be called by a parallel port device driver that * has registered itself using parport_register_driver() when it * is about to be unloaded. * * When it returns, the driver's attach() routine will no longer * be called, and for each port that attach() was called for, the * detach() routine will have been called. * * All the driver's attach() and detach() calls are guaranteed to have * finished by the time this function returns. **/ void parport_unregister_driver(struct parport_driver *drv) { mutex_lock(&registration_lock); bus_for_each_dev(&parport_bus_type, NULL, drv, port_detach); driver_unregister(&drv->driver); mutex_unlock(&registration_lock); } EXPORT_SYMBOL(parport_unregister_driver); static void free_port(struct device *dev) { int d; struct parport *port = to_parport_dev(dev); spin_lock(&full_list_lock); list_del(&port->full_list); spin_unlock(&full_list_lock); for (d = 0; d < 5; d++) { kfree(port->probe_info[d].class_name); kfree(port->probe_info[d].mfr); kfree(port->probe_info[d].model); kfree(port->probe_info[d].cmdset); kfree(port->probe_info[d].description); } kfree(port); } /** * parport_get_port - increment a port's reference count * @port: the port * * This ensures that a struct parport pointer remains valid * until the matching parport_put_port() call. **/ struct parport *parport_get_port(struct parport *port) { struct device *dev = get_device(&port->bus_dev); return to_parport_dev(dev); } EXPORT_SYMBOL(parport_get_port); void parport_del_port(struct parport *port) { device_unregister(&port->bus_dev); } EXPORT_SYMBOL(parport_del_port); /** * parport_put_port - decrement a port's reference count * @port: the port * * This should be called once for each call to parport_get_port(), * once the port is no longer needed. When the reference count reaches * zero (port is no longer used), free_port is called. **/ void parport_put_port(struct parport *port) { put_device(&port->bus_dev); } EXPORT_SYMBOL(parport_put_port); /** * parport_register_port - register a parallel port * @base: base I/O address * @irq: IRQ line * @dma: DMA channel * @ops: pointer to the port driver's port operations structure * * When a parallel port (lowlevel) driver finds a port that * should be made available to parallel port device drivers, it * should call parport_register_port(). The @base, @irq, and * @dma parameters are for the convenience of port drivers, and * for ports where they aren't meaningful needn't be set to * anything special. They can be altered afterwards by adjusting * the relevant members of the parport structure that is returned * and represents the port. They should not be tampered with * after calling parport_announce_port, however. * * If there are parallel port device drivers in the system that * have registered themselves using parport_register_driver(), * they are not told about the port at this time; that is done by * parport_announce_port(). * * The @ops structure is allocated by the caller, and must not be * deallocated before calling parport_remove_port(). * * If there is no memory to allocate a new parport structure, * this function will return %NULL. **/ struct parport *parport_register_port(unsigned long base, int irq, int dma, struct parport_operations *ops) { struct list_head *l; struct parport *tmp; int num; int device; int ret; tmp = kzalloc(sizeof(struct parport), GFP_KERNEL); if (!tmp) return NULL; /* Init our structure */ tmp->base = base; tmp->irq = irq; tmp->dma = dma; tmp->muxport = tmp->daisy = tmp->muxsel = -1; INIT_LIST_HEAD(&tmp->list); tmp->ops = ops; tmp->physport = tmp; rwlock_init(&tmp->cad_lock); spin_lock_init(&tmp->waitlist_lock); spin_lock_init(&tmp->pardevice_lock); tmp->ieee1284.mode = IEEE1284_MODE_COMPAT; tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE; sema_init(&tmp->ieee1284.irq, 0); tmp->spintime = parport_default_spintime; atomic_set(&tmp->ref_count, 1); /* Search for the lowest free parport number. */ spin_lock(&full_list_lock); num = 0; list_for_each(l, &all_ports) { struct parport *p = list_entry(l, struct parport, full_list); if (p->number != num++) break; } tmp->portnum = tmp->number = num; list_add_tail(&tmp->full_list, l); spin_unlock(&full_list_lock); /* * Now that the portnum is known finish doing the Init. */ dev_set_name(&tmp->bus_dev, "parport%d", tmp->portnum); tmp->bus_dev.bus = &parport_bus_type; tmp->bus_dev.release = free_port; tmp->bus_dev.type = &parport_device_type; tmp->name = dev_name(&tmp->bus_dev); for (device = 0; device < 5; device++) /* assume the worst */ tmp->probe_info[device].class = PARPORT_CLASS_LEGACY; ret = device_register(&tmp->bus_dev); if (ret) { put_device(&tmp->bus_dev); return NULL; } return tmp; } EXPORT_SYMBOL(parport_register_port); /** * parport_announce_port - tell device drivers about a parallel port * @port: parallel port to announce * * After a port driver has registered a parallel port with * parport_register_port, and performed any necessary * initialisation or adjustments, it should call * parport_announce_port() in order to notify all device drivers * that have called parport_register_driver(). Their attach() * functions will be called, with @port as the parameter. **/ void parport_announce_port(struct parport *port) { int i; #ifdef CONFIG_PARPORT_1284 /* Analyse the IEEE1284.3 topology of the port. */ parport_daisy_init(port); #endif if (!port->dev) pr_warn("%s: fix this legacy no-device port driver!\n", port->name); parport_proc_register(port); mutex_lock(&registration_lock); spin_lock_irq(&parportlist_lock); list_add_tail(&port->list, &portlist); for (i = 1; i < 3; i++) { struct parport *slave = port->slaves[i-1]; if (slave) list_add_tail(&slave->list, &portlist); } spin_unlock_irq(&parportlist_lock); /* Let drivers know that new port(s) has arrived. */ attach_driver_chain(port); for (i = 1; i < 3; i++) { struct parport *slave = port->slaves[i-1]; if (slave) attach_driver_chain(slave); } mutex_unlock(&registration_lock); } EXPORT_SYMBOL(parport_announce_port); /** * parport_remove_port - deregister a parallel port * @port: parallel port to deregister * * When a parallel port driver is forcibly unloaded, or a * parallel port becomes inaccessible, the port driver must call * this function in order to deal with device drivers that still * want to use it. * * The parport structure associated with the port has its * operations structure replaced with one containing 'null' * operations that return errors or just don't do anything. * * Any drivers that have registered themselves using * parport_register_driver() are notified that the port is no * longer accessible by having their detach() routines called * with @port as the parameter. **/ void parport_remove_port(struct parport *port) { int i; mutex_lock(&registration_lock); /* Spread the word. */ detach_driver_chain(port); #ifdef CONFIG_PARPORT_1284 /* Forget the IEEE1284.3 topology of the port. */ parport_daisy_fini(port); for (i = 1; i < 3; i++) { struct parport *slave = port->slaves[i-1]; if (!slave) continue; detach_driver_chain(slave); parport_daisy_fini(slave); } #endif port->ops = &dead_ops; spin_lock(&parportlist_lock); list_del_init(&port->list); for (i = 1; i < 3; i++) { struct parport *slave = port->slaves[i-1]; if (slave) list_del_init(&slave->list); } spin_unlock(&parportlist_lock); mutex_unlock(&registration_lock); parport_proc_unregister(port); for (i = 1; i < 3; i++) { struct parport *slave = port->slaves[i-1]; if (slave) parport_put_port(slave); } } EXPORT_SYMBOL(parport_remove_port); static void free_pardevice(struct device *dev) { struct pardevice *par_dev = to_pardevice(dev); kfree_const(par_dev->name); kfree(par_dev); } /** * parport_register_dev_model - register a device on a parallel port * @port: port to which the device is attached * @name: a name to refer to the device * @par_dev_cb: struct containing callbacks * @id: device number to be given to the device * * This function, called by parallel port device drivers, * declares that a device is connected to a port, and tells the * system all it needs to know. * * The struct pardev_cb contains pointer to callbacks. preemption * callback function, @preempt, is called when this device driver * has claimed access to the port but another device driver wants * to use it. It is given, @private, as its parameter, and should * return zero if it is willing for the system to release the port * to another driver on its behalf. If it wants to keep control of * the port it should return non-zero, and no action will be taken. * It is good manners for the driver to try to release the port at * the earliest opportunity after its preemption callback rejects a * preemption attempt. Note that if a preemption callback is happy * for preemption to go ahead, there is no need to release the * port; it is done automatically. This function may not block, as * it may be called from interrupt context. If the device driver * does not support preemption, @preempt can be %NULL. * * The wake-up ("kick") callback function, @wakeup, is called when * the port is available to be claimed for exclusive access; that * is, parport_claim() is guaranteed to succeed when called from * inside the wake-up callback function. If the driver wants to * claim the port it should do so; otherwise, it need not take * any action. This function may not block, as it may be called * from interrupt context. If the device driver does not want to * be explicitly invited to claim the port in this way, @wakeup can * be %NULL. * * The interrupt handler, @irq_func, is called when an interrupt * arrives from the parallel port. Note that if a device driver * wants to use interrupts it should use parport_enable_irq(), * and can also check the irq member of the parport structure * representing the port. * * The parallel port (lowlevel) driver is the one that has called * request_irq() and whose interrupt handler is called first. * This handler does whatever needs to be done to the hardware to * acknowledge the interrupt (for PC-style ports there is nothing * special to be done). It then tells the IEEE 1284 code about * the interrupt, which may involve reacting to an IEEE 1284 * event depending on the current IEEE 1284 phase. After this, * it calls @irq_func. Needless to say, @irq_func will be called * from interrupt context, and may not block. * * The %PARPORT_DEV_EXCL flag is for preventing port sharing, and * so should only be used when sharing the port with other device * drivers is impossible and would lead to incorrect behaviour. * Use it sparingly! Normally, @flags will be zero. * * This function returns a pointer to a structure that represents * the device on the port, or %NULL if there is not enough memory * to allocate space for that structure. **/ struct pardevice * parport_register_dev_model(struct parport *port, const char *name, const struct pardev_cb *par_dev_cb, int id) { struct pardevice *par_dev; const char *devname; int ret; if (port->physport->flags & PARPORT_FLAG_EXCL) { /* An exclusive device is registered. */ pr_err("%s: no more devices allowed\n", port->name); return NULL; } if (par_dev_cb->flags & PARPORT_DEV_LURK) { if (!par_dev_cb->preempt || !par_dev_cb->wakeup) { pr_info("%s: refused to register lurking device (%s) without callbacks\n", port->name, name); return NULL; } } if (par_dev_cb->flags & PARPORT_DEV_EXCL) { if (port->physport->devices) { /* * If a device is already registered and this new * device wants exclusive access, then no need to * continue as we can not grant exclusive access to * this device. */ pr_err("%s: cannot grant exclusive access for device %s\n", port->name, name); return NULL; } } if (!try_module_get(port->ops->owner)) return NULL; parport_get_port(port); par_dev = kzalloc(sizeof(*par_dev), GFP_KERNEL); if (!par_dev) goto err_put_port; par_dev->state = kzalloc(sizeof(*par_dev->state), GFP_KERNEL); if (!par_dev->state) goto err_put_par_dev; devname = kstrdup_const(name, GFP_KERNEL); if (!devname) goto err_free_par_dev; par_dev->name = devname; par_dev->port = port; par_dev->daisy = -1; par_dev->preempt = par_dev_cb->preempt; par_dev->wakeup = par_dev_cb->wakeup; par_dev->private = par_dev_cb->private; par_dev->flags = par_dev_cb->flags; par_dev->irq_func = par_dev_cb->irq_func; par_dev->waiting = 0; par_dev->timeout = 5 * HZ; par_dev->dev.parent = &port->bus_dev; par_dev->dev.bus = &parport_bus_type; ret = dev_set_name(&par_dev->dev, "%s.%d", devname, id); if (ret) goto err_free_devname; par_dev->dev.release = free_pardevice; par_dev->devmodel = true; ret = device_register(&par_dev->dev); if (ret) { kfree(par_dev->state); put_device(&par_dev->dev); goto err_put_port; } /* Chain this onto the list */ par_dev->prev = NULL; /* * This function must not run from an irq handler so we don' t need * to clear irq on the local CPU. -arca */ spin_lock(&port->physport->pardevice_lock); if (par_dev_cb->flags & PARPORT_DEV_EXCL) { if (port->physport->devices) { spin_unlock(&port->physport->pardevice_lock); pr_debug("%s: cannot grant exclusive access for device %s\n", port->name, name); kfree(par_dev->state); device_unregister(&par_dev->dev); goto err_put_port; } port->flags |= PARPORT_FLAG_EXCL; } par_dev->next = port->physport->devices; wmb(); /* * Make sure that tmp->next is written before it's * added to the list; see comments marked 'no locking * required' */ if (port->physport->devices) port->physport->devices->prev = par_dev; port->physport->devices = par_dev; spin_unlock(&port->physport->pardevice_lock); init_waitqueue_head(&par_dev->wait_q); par_dev->timeslice = parport_default_timeslice; par_dev->waitnext = NULL; par_dev->waitprev = NULL; /* * This has to be run as last thing since init_state may need other * pardevice fields. -arca */ port->ops->init_state(par_dev, par_dev->state); if (!test_and_set_bit(PARPORT_DEVPROC_REGISTERED, &port->devflags)) { port->proc_device = par_dev; parport_device_proc_register(par_dev); } return par_dev; err_free_devname: kfree_const(devname); err_free_par_dev: kfree(par_dev->state); err_put_par_dev: if (!par_dev->devmodel) kfree(par_dev); err_put_port: parport_put_port(port); module_put(port->ops->owner); return NULL; } EXPORT_SYMBOL(parport_register_dev_model); /** * parport_unregister_device - deregister a device on a parallel port * @dev: pointer to structure representing device * * This undoes the effect of parport_register_device(). **/ void parport_unregister_device(struct pardevice *dev) { struct parport *port; #ifdef PARPORT_PARANOID if (!dev) { pr_err("%s: passed NULL\n", __func__); return; } #endif port = dev->port->physport; if (port->proc_device == dev) { port->proc_device = NULL; clear_bit(PARPORT_DEVPROC_REGISTERED, &port->devflags); parport_device_proc_unregister(dev); } if (port->cad == dev) { printk(KERN_DEBUG "%s: %s forgot to release port\n", port->name, dev->name); parport_release(dev); } spin_lock(&port->pardevice_lock); if (dev->next) dev->next->prev = dev->prev; if (dev->prev) dev->prev->next = dev->next; else port->devices = dev->next; if (dev->flags & PARPORT_DEV_EXCL) port->flags &= ~PARPORT_FLAG_EXCL; spin_unlock(&port->pardevice_lock); /* * Make sure we haven't left any pointers around in the wait * list. */ spin_lock_irq(&port->waitlist_lock); if (dev->waitprev || dev->waitnext || port->waithead == dev) { if (dev->waitprev) dev->waitprev->waitnext = dev->waitnext; else port->waithead = dev->waitnext; if (dev->waitnext) dev->waitnext->waitprev = dev->waitprev; else port->waittail = dev->waitprev; } spin_unlock_irq(&port->waitlist_lock); kfree(dev->state); device_unregister(&dev->dev); module_put(port->ops->owner); parport_put_port(port); } EXPORT_SYMBOL(parport_unregister_device); /** * parport_find_number - find a parallel port by number * @number: parallel port number * * This returns the parallel port with the specified number, or * %NULL if there is none. * * There is an implicit parport_get_port() done already; to throw * away the reference to the port that parport_find_number() * gives you, use parport_put_port(). */ struct parport *parport_find_number(int number) { struct parport *port, *result = NULL; if (list_empty(&portlist)) get_lowlevel_driver(); spin_lock(&parportlist_lock); list_for_each_entry(port, &portlist, list) { if (port->number == number) { result = parport_get_port(port); break; } } spin_unlock(&parportlist_lock); return result; } EXPORT_SYMBOL(parport_find_number); /** * parport_find_base - find a parallel port by base address * @base: base I/O address * * This returns the parallel port with the specified base * address, or %NULL if there is none. * * There is an implicit parport_get_port() done already; to throw * away the reference to the port that parport_find_base() * gives you, use parport_put_port(). */ struct parport *parport_find_base(unsigned long base) { struct parport *port, *result = NULL; if (list_empty(&portlist)) get_lowlevel_driver(); spin_lock(&parportlist_lock); list_for_each_entry(port, &portlist, list) { if (port->base == base) { result = parport_get_port(port); break; } } spin_unlock(&parportlist_lock); return result; } EXPORT_SYMBOL(parport_find_base); /** * parport_claim - claim access to a parallel port device * @dev: pointer to structure representing a device on the port * * This function will not block and so can be used from interrupt * context. If parport_claim() succeeds in claiming access to * the port it returns zero and the port is available to use. It * may fail (returning non-zero) if the port is in use by another * driver and that driver is not willing to relinquish control of * the port. **/ int parport_claim(struct pardevice *dev) { struct pardevice *oldcad; struct parport *port = dev->port->physport; unsigned long flags; if (port->cad == dev) { pr_info("%s: %s already owner\n", dev->port->name, dev->name); return 0; } /* Preempt any current device */ write_lock_irqsave(&port->cad_lock, flags); oldcad = port->cad; if (oldcad) { if (oldcad->preempt) { if (oldcad->preempt(oldcad->private)) goto blocked; port->ops->save_state(port, dev->state); } else goto blocked; if (port->cad != oldcad) { /* * I think we'll actually deadlock rather than * get here, but just in case.. */ pr_warn("%s: %s released port when preempted!\n", port->name, oldcad->name); if (port->cad) goto blocked; } } /* Can't fail from now on, so mark ourselves as no longer waiting. */ if (dev->waiting & 1) { dev->waiting = 0; /* Take ourselves out of the wait list again. */ spin_lock_irq(&port->waitlist_lock); if (dev->waitprev) dev->waitprev->waitnext = dev->waitnext; else port->waithead = dev->waitnext; if (dev->waitnext) dev->waitnext->waitprev = dev->waitprev; else port->waittail = dev->waitprev; spin_unlock_irq(&port->waitlist_lock); dev->waitprev = dev->waitnext = NULL; } /* Now we do the change of devices */ port->cad = dev; #ifdef CONFIG_PARPORT_1284 /* If it's a mux port, select it. */ if (dev->port->muxport >= 0) { /* FIXME */ port->muxsel = dev->port->muxport; } /* If it's a daisy chain device, select it. */ if (dev->daisy >= 0) { /* This could be lazier. */ if (!parport_daisy_select(port, dev->daisy, IEEE1284_MODE_COMPAT)) port->daisy = dev->daisy; } #endif /* IEEE1284.3 support */ /* Restore control registers */ port->ops->restore_state(port, dev->state); write_unlock_irqrestore(&port->cad_lock, flags); dev->time = jiffies; return 0; blocked: /* * If this is the first time we tried to claim the port, register an * interest. This is only allowed for devices sleeping in * parport_claim_or_block(), or those with a wakeup function. */ /* The cad_lock is still held for writing here */ if (dev->waiting & 2 || dev->wakeup) { spin_lock(&port->waitlist_lock); if (test_and_set_bit(0, &dev->waiting) == 0) { /* First add ourselves to the end of the wait list. */ dev->waitnext = NULL; dev->waitprev = port->waittail; if (port->waittail) { port->waittail->waitnext = dev; port->waittail = dev; } else port->waithead = port->waittail = dev; } spin_unlock(&port->waitlist_lock); } write_unlock_irqrestore(&port->cad_lock, flags); return -EAGAIN; } EXPORT_SYMBOL(parport_claim); /** * parport_claim_or_block - claim access to a parallel port device * @dev: pointer to structure representing a device on the port * * This behaves like parport_claim(), but will block if necessary * to wait for the port to be free. A return value of 1 * indicates that it slept; 0 means that it succeeded without * needing to sleep. A negative error code indicates failure. **/ int parport_claim_or_block(struct pardevice *dev) { int r; /* * Signal to parport_claim() that we can wait even without a * wakeup function. */ dev->waiting = 2; /* Try to claim the port. If this fails, we need to sleep. */ r = parport_claim(dev); if (r == -EAGAIN) { #ifdef PARPORT_DEBUG_SHARING printk(KERN_DEBUG "%s: parport_claim() returned -EAGAIN\n", dev->name); #endif /* * FIXME!!! Use the proper locking for dev->waiting, * and make this use the "wait_event_interruptible()" * interfaces. The cli/sti that used to be here * did nothing. * * See also parport_release() */ /* * If dev->waiting is clear now, an interrupt * gave us the port and we would deadlock if we slept. */ if (dev->waiting) { wait_event_interruptible(dev->wait_q, !dev->waiting); if (signal_pending(current)) return -EINTR; r = 1; } else { r = 0; #ifdef PARPORT_DEBUG_SHARING printk(KERN_DEBUG "%s: didn't sleep in parport_claim_or_block()\n", dev->name); #endif } #ifdef PARPORT_DEBUG_SHARING if (dev->port->physport->cad != dev) printk(KERN_DEBUG "%s: exiting parport_claim_or_block but %s owns port!\n", dev->name, dev->port->physport->cad ? dev->port->physport->cad->name : "nobody"); #endif } dev->waiting = 0; return r; } EXPORT_SYMBOL(parport_claim_or_block); /** * parport_release - give up access to a parallel port device * @dev: pointer to structure representing parallel port device * * This function cannot fail, but it should not be called without * the port claimed. Similarly, if the port is already claimed * you should not try claiming it again. **/ void parport_release(struct pardevice *dev) { struct parport *port = dev->port->physport; struct pardevice *pd; unsigned long flags; /* Make sure that dev is the current device */ write_lock_irqsave(&port->cad_lock, flags); if (port->cad != dev) { write_unlock_irqrestore(&port->cad_lock, flags); pr_warn("%s: %s tried to release parport when not owner\n", port->name, dev->name); return; } #ifdef CONFIG_PARPORT_1284 /* If this is on a mux port, deselect it. */ if (dev->port->muxport >= 0) { /* FIXME */ port->muxsel = -1; } /* If this is a daisy device, deselect it. */ if (dev->daisy >= 0) { parport_daisy_deselect_all(port); port->daisy = -1; } #endif port->cad = NULL; write_unlock_irqrestore(&port->cad_lock, flags); /* Save control registers */ port->ops->save_state(port, dev->state); /* * If anybody is waiting, find out who's been there longest and * then wake them up. (Note: no locking required) */ /* !!! LOCKING IS NEEDED HERE */ for (pd = port->waithead; pd; pd = pd->waitnext) { if (pd->waiting & 2) { /* sleeping in claim_or_block */ parport_claim(pd); if (waitqueue_active(&pd->wait_q)) wake_up_interruptible(&pd->wait_q); return; } else if (pd->wakeup) { pd->wakeup(pd->private); if (dev->port->cad) /* racy but no matter */ return; } else { pr_err("%s: don't know how to wake %s\n", port->name, pd->name); } } /* * Nobody was waiting, so walk the list to see if anyone is * interested in being woken up. (Note: no locking required) */ /* !!! LOCKING IS NEEDED HERE */ for (pd = port->devices; !port->cad && pd; pd = pd->next) { if (pd->wakeup && pd != dev) pd->wakeup(pd->private); } } EXPORT_SYMBOL(parport_release); irqreturn_t parport_irq_handler(int irq, void *dev_id) { struct parport *port = dev_id; parport_generic_irq(port); return IRQ_HANDLED; } EXPORT_SYMBOL(parport_irq_handler); MODULE_DESCRIPTION("Parallel-port resource manager"); MODULE_LICENSE("GPL");
82 75 75 7 7 75 2 75 75 75 7 7 13 7 19 19 19 6 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 Syncookies implementation for the Linux kernel * * Authors: * Glenn Griffin <ggriffin.kernel@gmail.com> * * Based on IPv4 implementation by Andi Kleen * linux/net/ipv4/syncookies.c */ #include <linux/tcp.h> #include <linux/random.h> #include <linux/siphash.h> #include <linux/kernel.h> #include <net/secure_seq.h> #include <net/ipv6.h> #include <net/tcp.h> #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) static siphash_aligned_key_t syncookie6_secret[2]; /* RFC 2460, Section 8.3: * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] * * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows * using higher values than ipv4 tcp syncookies. * The other values are chosen based on ethernet (1500 and 9k MTU), plus * one that accounts for common encap (PPPoe) overhead. Table must be sorted. */ static __u16 const msstab[] = { 1280 - 60, /* IPV6_MIN_MTU - 60 */ 1480 - 60, 1500 - 60, 9000 - 60, }; static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { const struct { struct in6_addr saddr; struct in6_addr daddr; u32 count; __be16 sport; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *saddr, .daddr = *daddr, .count = count, .sport = sport, .dport = dport }; net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); return siphash(&combined, offsetofend(typeof(combined), dport), &syncookie6_secret[c]); } static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq, __u32 data) { u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq) { __u32 diff, count = tcp_cookie_time(); cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) & COOKIEMASK; } u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, __u16 *mssp) { int mssind; const __u16 mss = *mssp; for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, th->dest, ntohl(th->seq), mssind); } EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); return __cookie_v6_init_sequence(iph, th, mssp); } int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th) { __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; __u32 mssind; mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v6_check); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; u32 tsoff = 0; int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; mss = __cookie_v6_check(ipv6_hdr(skb), tcp_hdr(skb)); if (!mss) { __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); tcp_opt.rcv_tsecr -= tsoff; } if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; return cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb, &tcp_opt, mss, tsoff); out: return ERR_PTR(-EINVAL); } struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); struct request_sock *req; struct dst_entry *dst; struct sock *ret = sk; __u8 rcv_wscale; int full_space; SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; if (cookie_bpf_ok(skb)) { req = cookie_bpf_check(sk, skb); } else { req = cookie_tcp_check(net, sk, skb); if (IS_ERR(req)) goto out; } if (!req) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; if (security_inet_conn_request(sk, skb, req)) { SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; } if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { refcount_inc(&skb->users); ireq->pktopts = skb; } /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); tcp_ao_syncookie(sk, skb, req, AF_INET6); /* * We need to lookup the dst_entry to get the correct window size. * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten * me if there is a preferred way. */ { struct in6_addr *final_p, final; struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = ireq->ir_iif; fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; fl6.flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; } } req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); /* req->syncookie is set true only if ACK is validated * by BPF kfunc, then, rcv_wscale is already configured. */ if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } out: return ret; out_free: reqsk_free(req); out_drop: sk_skb_reason_drop(sk, skb, reason); return NULL; }
4869 4872 411 4876 688 4728 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define _BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> #include "blk.h" struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pdata_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_bio_issue_init(struct bio *bio) { bio_issue_init(&bio->bi_issue, bio_sectors(bio)); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. */ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */
3 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/mm.h> #include <asm/io.h> struct scatterlist { unsigned long page_link; unsigned int offset; unsigned int length; dma_addr_t dma_address; #ifdef CONFIG_NEED_SG_DMA_LENGTH unsigned int dma_length; #endif #ifdef CONFIG_NEED_SG_DMA_FLAGS unsigned int dma_flags; #endif }; /* * These macros should be used after a dma_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths. * You should only work with the number of sg entries dma_map_sg * returns, or alternatively stop on the first sg_dma_len(sg) which * is 0. */ #define sg_dma_address(sg) ((sg)->dma_address) #ifdef CONFIG_NEED_SG_DMA_LENGTH #define sg_dma_len(sg) ((sg)->dma_length) #else #define sg_dma_len(sg) ((sg)->length) #endif struct sg_table { struct scatterlist *sgl; /* the list */ unsigned int nents; /* number of mapped entries */ unsigned int orig_nents; /* original size of list */ }; struct sg_append_table { struct sg_table sgt; /* The scatter list table */ struct scatterlist *prv; /* last populated sge in the table */ unsigned int total_nents; /* Total entries in the table */ }; /* * Notes on SG table design. * * We use the unsigned long page_link field in the scatterlist struct to place * the page pointer AND encode information about the sg table as well. The two * lower bits are reserved for this information. * * If bit 0 is set, then the page_link contains a pointer to the next sg * table list. Otherwise the next entry is at sg + 1. * * If bit 1 is set, then this sg entry is the last element in a list. * * See sg_next(). * */ #define SG_CHAIN 0x01UL #define SG_END 0x02UL /* * We overload the LSB of the page pointer to indicate whether it's * a valid sg entry, or whether it points to the start of a new scatterlist. * Those low bits are there for everyone! (thanks mason :-) */ #define SG_PAGE_LINK_MASK (SG_CHAIN | SG_END) static inline unsigned int __sg_flags(struct scatterlist *sg) { return sg->page_link & SG_PAGE_LINK_MASK; } static inline struct scatterlist *sg_chain_ptr(struct scatterlist *sg) { return (struct scatterlist *)(sg->page_link & ~SG_PAGE_LINK_MASK); } static inline bool sg_is_chain(struct scatterlist *sg) { return __sg_flags(sg) & SG_CHAIN; } static inline bool sg_is_last(struct scatterlist *sg) { return __sg_flags(sg) & SG_END; } /** * sg_assign_page - Assign a given page to an SG entry * @sg: SG entry * @page: The page * * Description: * Assign page to sg entry. Also see sg_set_page(), the most commonly used * variant. * **/ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) { unsigned long page_link = sg->page_link & (SG_CHAIN | SG_END); /* * In order for the low bit stealing approach to work, pages * must be aligned at a 32-bit boundary as a minimum. */ BUG_ON((unsigned long)page & SG_PAGE_LINK_MASK); #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif sg->page_link = page_link | (unsigned long) page; } /** * sg_set_page - Set sg entry to point at given page * @sg: SG entry * @page: The page * @len: Length of data * @offset: Offset into page * * Description: * Use this function to set an sg entry pointing at a page, never assign * the page directly. We encode sg table information in the lower bits * of the page pointer. See sg_page() for looking up the page belonging * to an sg entry. * **/ static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { sg_assign_page(sg, page); sg->offset = offset; sg->length = len; } /** * sg_set_folio - Set sg entry to point at given folio * @sg: SG entry * @folio: The folio * @len: Length of data * @offset: Offset into folio * * Description: * Use this function to set an sg entry pointing at a folio, never assign * the folio directly. We encode sg table information in the lower bits * of the folio pointer. See sg_page() for looking up the page belonging * to an sg entry. * **/ static inline void sg_set_folio(struct scatterlist *sg, struct folio *folio, size_t len, size_t offset) { WARN_ON_ONCE(len > UINT_MAX); WARN_ON_ONCE(offset > UINT_MAX); sg_assign_page(sg, &folio->page); sg->offset = offset; sg->length = len; } static inline struct page *sg_page(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG BUG_ON(sg_is_chain(sg)); #endif return (struct page *)((sg)->page_link & ~SG_PAGE_LINK_MASK); } /** * sg_set_buf - Set sg entry to point at given data * @sg: SG entry * @buf: Data * @buflen: Data length * **/ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen) { #ifdef CONFIG_DEBUG_SG BUG_ON(!virt_addr_valid(buf)); #endif sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); } /* * Loop over each sg element, following the pointer to a new list if necessary */ #define for_each_sg(sglist, sg, nr, __i) \ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg)) /* * Loop over each sg element in the given sg_table object. */ #define for_each_sgtable_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i) /* * Loop over each sg element in the given *DMA mapped* sg_table object. * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses * of the each element. */ #define for_each_sgtable_dma_sg(sgt, sg, i) \ for_each_sg((sgt)->sgl, sg, (sgt)->nents, i) static inline void __sg_chain(struct scatterlist *chain_sg, struct scatterlist *sgl) { /* * offset and length are unused for chain entry. Clear them. */ chain_sg->offset = 0; chain_sg->length = 0; /* * Set lowest bit to indicate a link pointer, and make sure to clear * the termination bit if it happens to be set. */ chain_sg->page_link = ((unsigned long) sgl | SG_CHAIN) & ~SG_END; } /** * sg_chain - Chain two sglists together * @prv: First scatterlist * @prv_nents: Number of entries in prv * @sgl: Second scatterlist * * Description: * Links @prv@ and @sgl@ together, to form a longer scatterlist. * **/ static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { __sg_chain(&prv[prv_nents - 1], sgl); } /** * sg_mark_end - Mark the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Marks the passed in sg entry as the termination point for the sg * table. A call to sg_next() on this entry will return NULL. * **/ static inline void sg_mark_end(struct scatterlist *sg) { /* * Set termination bit, clear potential chain bit */ sg->page_link |= SG_END; sg->page_link &= ~SG_CHAIN; } /** * sg_unmark_end - Undo setting the end of the scatterlist * @sg: SG entryScatterlist * * Description: * Removes the termination marker from the given entry of the scatterlist. * **/ static inline void sg_unmark_end(struct scatterlist *sg) { sg->page_link &= ~SG_END; } /* * One 64-bit architectures there is a 4-byte padding in struct scatterlist * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA * flags bits to indicate when a specific dma address is a bus address or the * buffer may have been bounced via SWIOTLB. */ #ifdef CONFIG_NEED_SG_DMA_FLAGS #define SG_DMA_BUS_ADDRESS (1 << 0) #define SG_DMA_SWIOTLB (1 << 1) /** * sg_dma_is_bus_address - Return whether a given segment was marked * as a bus address * @sg: SG entry * * Description: * Returns true if sg_dma_mark_bus_address() has been called on * this segment. **/ static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_BUS_ADDRESS; } /** * sg_dma_mark_bus_address - Mark the scatterlist entry as a bus address * @sg: SG entry * * Description: * Marks the passed in sg entry to indicate that the dma_address is * a bus address and doesn't need to be unmapped. This should only be * used by dma_map_sg() implementations to mark bus addresses * so they can be properly cleaned up in dma_unmap_sg(). **/ static inline void sg_dma_mark_bus_address(struct scatterlist *sg) { sg->dma_flags |= SG_DMA_BUS_ADDRESS; } /** * sg_unmark_bus_address - Unmark the scatterlist entry as a bus address * @sg: SG entry * * Description: * Clears the bus address mark. **/ static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { sg->dma_flags &= ~SG_DMA_BUS_ADDRESS; } /** * sg_dma_is_swiotlb - Return whether the scatterlist was marked for SWIOTLB * bouncing * @sg: SG entry * * Description: * Returns true if the scatterlist was marked for SWIOTLB bouncing. Not all * elements may have been bounced, so the caller would have to check * individual SG entries with swiotlb_find_pool(). */ static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) { return sg->dma_flags & SG_DMA_SWIOTLB; } /** * sg_dma_mark_swiotlb - Mark the scatterlist for SWIOTLB bouncing * @sg: SG entry * * Description: * Marks a a scatterlist for SWIOTLB bounce. Not all SG entries may be * bounced. */ static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) { sg->dma_flags |= SG_DMA_SWIOTLB; } #else static inline bool sg_dma_is_bus_address(struct scatterlist *sg) { return false; } static inline void sg_dma_mark_bus_address(struct scatterlist *sg) { } static inline void sg_dma_unmark_bus_address(struct scatterlist *sg) { } static inline bool sg_dma_is_swiotlb(struct scatterlist *sg) { return false; } static inline void sg_dma_mark_swiotlb(struct scatterlist *sg) { } #endif /* CONFIG_NEED_SG_DMA_FLAGS */ /** * sg_phys - Return physical address of an sg entry * @sg: SG entry * * Description: * This calls page_to_phys() on the page in this sg entry, and adds the * sg offset. The caller must know that it is legal to call page_to_phys() * on the sg page. * **/ static inline dma_addr_t sg_phys(struct scatterlist *sg) { return page_to_phys(sg_page(sg)) + sg->offset; } /** * sg_virt - Return virtual address of an sg entry * @sg: SG entry * * Description: * This calls page_address() on the page in this sg entry, and adds the * sg offset. The caller must know that the sg page has a valid virtual * mapping. * **/ static inline void *sg_virt(struct scatterlist *sg) { return page_address(sg_page(sg)) + sg->offset; } /** * sg_init_marker - Initialize markers in sg table * @sgl: The SG table * @nents: Number of entries in table * **/ static inline void sg_init_marker(struct scatterlist *sgl, unsigned int nents) { sg_mark_end(&sgl[nents - 1]); } int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); void sg_init_one(struct scatterlist *, const void *, unsigned int); int sg_split(struct scatterlist *in, const int in_mapped_nents, const off_t skip, const int nb_splits, const size_t *split_sizes, struct scatterlist **out, int *out_mapped_nents, gfp_t gfp_mask); typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t); typedef void (sg_free_fn)(struct scatterlist *, unsigned int); void __sg_free_table(struct sg_table *, unsigned int, unsigned int, sg_free_fn *, unsigned int); void sg_free_table(struct sg_table *); void sg_free_append_table(struct sg_append_table *sgt); int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int, struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *); int sg_alloc_table(struct sg_table *, unsigned int, gfp_t); int sg_alloc_append_table_from_pages(struct sg_append_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, unsigned int left_pages, gfp_t gfp_mask); int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, gfp_t gfp_mask); /** * sg_alloc_table_from_pages - Allocate and initialize an sg table from * an array of pages * @sgt: The sg table header to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table from a list of pages. Contiguous * ranges of the pages are squashed into a single scatterlist node. A user * may provide an offset at a start and a size of valid data in a buffer * specified by the page array. The returned sg table is released by * sg_free_table. * * Returns: * 0 on success, negative error on failure */ static inline int sg_alloc_table_from_pages(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, gfp_t gfp_mask) { return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset, size, UINT_MAX, gfp_mask); } #ifdef CONFIG_SGL_ALLOC struct scatterlist *sgl_alloc_order(unsigned long long length, unsigned int order, bool chainable, gfp_t gfp, unsigned int *nent_p); struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p); void sgl_free_n_order(struct scatterlist *sgl, int nents, int order); void sgl_free_order(struct scatterlist *sgl, int order); void sgl_free(struct scatterlist *sgl); #endif /* CONFIG_SGL_ALLOC */ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer); size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen); size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen); size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip); size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip); size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, size_t buflen, off_t skip); /* * Maximum number of entries that will be allocated in one piece, if * a list larger than this is required then chaining will be utilized. */ #define SG_MAX_SINGLE_ALLOC (PAGE_SIZE / sizeof(struct scatterlist)) /* * The maximum number of SG segments that we will put inside a * scatterlist (unless chaining is used). Should ideally fit inside a * single page, to avoid a higher order allocation. We could define this * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order. The * minimum value is 32 */ #define SG_CHUNK_SIZE 128 /* * Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ #ifdef CONFIG_ARCH_NO_SG_CHAIN #define SG_MAX_SEGMENTS SG_CHUNK_SIZE #else #define SG_MAX_SEGMENTS 2048 #endif #ifdef CONFIG_SG_POOL void sg_free_table_chained(struct sg_table *table, unsigned nents_first_chunk); int sg_alloc_table_chained(struct sg_table *table, int nents, struct scatterlist *first_chunk, unsigned nents_first_chunk); #endif /* * sg page iterator * * Iterates over sg entries page-by-page. On each successful iteration, you * can call sg_page_iter_page(@piter) to get the current page. * @piter->sg will point to the sg holding this page and @piter->sg_pgoffset to * the page's page offset within the sg. The iteration will stop either when a * maximum number of sg entries was reached or a terminating sg * (sg_last(sg) == true) was reached. */ struct sg_page_iter { struct scatterlist *sg; /* sg holding the page */ unsigned int sg_pgoffset; /* page offset within the sg */ /* these are internal states, keep away */ unsigned int __nents; /* remaining sg entries */ int __pg_advance; /* nr pages to advance at the * next step */ }; /* * sg page iterator for DMA addresses * * This is the same as sg_page_iter however you can call * sg_page_iter_dma_address(@dma_iter) to get the page's DMA * address. sg_page_iter_page() cannot be called on this iterator. */ struct sg_dma_page_iter { struct sg_page_iter base; }; bool __sg_page_iter_next(struct sg_page_iter *piter); bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); /** * sg_page_iter_page - get the current page held by the page iterator * @piter: page iterator holding the page */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { return nth_page(sg_page(piter->sg), piter->sg_pgoffset); } /** * sg_page_iter_dma_address - get the dma address of the current page held by * the page iterator. * @dma_iter: page iterator holding the page */ static inline dma_addr_t sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) { return sg_dma_address(dma_iter->base.sg) + (dma_iter->base.sg_pgoffset << PAGE_SHIFT); } /** * for_each_sg_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_page() to get each page pointer. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ __sg_page_iter_next(piter);) /** * for_each_sg_dma_page - iterate over the pages of the given sg list * @sglist: sglist to iterate over * @dma_iter: DMA page iterator to hold current page * @dma_nents: maximum number of sg entries to iterate over, this is the value * returned from dma_map_sg * @pgoffset: starting page offset (in pages) * * Callers may use sg_page_iter_dma_address() to get each page's DMA address. * In each loop it operates on PAGE_SIZE unit. */ #define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ pgoffset); \ __sg_page_iter_dma_next(dma_iter);) /** * for_each_sgtable_page - iterate over all pages in the sg_table object * @sgt: sg_table object to iterate over * @piter: page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all memory pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. */ #define for_each_sgtable_page(sgt, piter, pgoffset) \ for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset) /** * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object * @sgt: sg_table object to iterate over * @dma_iter: DMA page iterator to hold current page * @pgoffset: starting page offset (in pages) * * Iterates over the all DMA mapped pages in the buffer described by * a scatterlist stored in the given sg_table object. * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE * unit. */ #define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset) /* * Mapping sg iterator * * Iterates over sg entries mapping page-by-page. On each successful * iteration, @miter->page points to the mapped page and * @miter->length bytes of data can be accessed at @miter->addr. As * long as an iteration is enclosed between start and stop, the user * is free to choose control structure and when to stop. * * @miter->consumed is set to @miter->length on each iteration. It * can be adjusted if the user can't consume all the bytes in one go. * Also, a stopped iteration can be resumed by calling next on it. * This is useful when iteration needs to release all resources and * continue later (e.g. at the next interrupt). */ #define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */ #define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap */ #define SG_MITER_FROM_SG (1 << 2) /* nop */ struct sg_mapping_iter { /* the following three fields can be accessed directly */ struct page *page; /* currently mapped page */ void *addr; /* pointer to the mapped area */ size_t length; /* length of the mapped area */ size_t consumed; /* number of consumed bytes */ struct sg_page_iter piter; /* page iterator */ /* these are internal states, keep away */ unsigned int __offset; /* offset within page */ unsigned int __remaining; /* remaining bytes on page */ unsigned int __flags; }; void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, unsigned int nents, unsigned int flags); bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset); bool sg_miter_next(struct sg_mapping_iter *miter); void sg_miter_stop(struct sg_mapping_iter *miter); #endif /* _LINUX_SCATTERLIST_H */
1 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_TABLES_IPV6_H_ #define _NF_TABLES_IPV6_H_ #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/ipv6.h> #include <net/netfilter/nf_tables.h> static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) { unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) { nft_set_pktinfo_unspec(pkt); return; } pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; } static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; int protohdr; u32 pkt_len; ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*ip6h), &_ip6h); if (!ip6h) return -1; if (ip6h->version != 6) return -1; pkt_len = ntohs(ip6h->payload_len); if (pkt_len + sizeof(*ip6h) > pkt->skb->len) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; return 0; #else return -1; #endif } static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { if (__nft_set_pktinfo_ipv6_validate(pkt) < 0) nft_set_pktinfo_unspec(pkt); } static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; unsigned short frag_off; unsigned int thoff = 0; struct inet6_dev *idev; struct ipv6hdr *ip6h; int protohdr; u32 pkt_len; if (!pskb_may_pull(pkt->skb, sizeof(*ip6h))) return -1; ip6h = ipv6_hdr(pkt->skb); if (ip6h->version != 6) goto inhdr_error; pkt_len = ntohs(ip6h->payload_len); if (pkt_len + sizeof(*ip6h) > pkt->skb->len) { idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) goto inhdr_error; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; return 0; inhdr_error: idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INHDRERRORS); return -1; #else return -1; #endif } #endif
7 7 9 2 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 // SPDX-License-Identifier: GPL-2.0-or-later /* * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828). * * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk> * * Changes: * Fixes: */ #define pr_fmt(fmt) "UDPLite: " fmt #include <linux/export.h> #include <linux/proc_fs.h> #include "udp_impl.h" struct udp_table udplite_table __read_mostly; EXPORT_SYMBOL(udplite_table); /* Designate sk as UDP-Lite socket */ static int udplite_sk_init(struct sock *sk) { udp_init_sock(sk); pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, " "please contact the netdev mailing list\n"); return 0; } static int udplite_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } static int udplite_err(struct sk_buff *skb, u32 info) { return __udp4_lib_err(skb, info, &udplite_table); } static const struct net_protocol udplite_protocol = { .handler = udplite_rcv, .err_handler = udplite_err, .no_policy = 1, }; struct proto udplite_prot = { .name = "UDP-Lite", .owner = THIS_MODULE, .close = udp_lib_close, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udplite_sk_init, .destroy = udp_destroy_sock, .setsockopt = udp_setsockopt, .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .memory_allocated = &udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, }; EXPORT_SYMBOL(udplite_prot); static struct inet_protosw udplite4_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }; #ifdef CONFIG_PROC_FS static struct udp_seq_afinfo udplite4_seq_afinfo = { .family = AF_INET, .udp_table = &udplite_table, }; static int __net_init udplite4_proc_init_net(struct net *net) { if (!proc_create_net_data("udplite", 0444, net->proc_net, &udp_seq_ops, sizeof(struct udp_iter_state), &udplite4_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit udplite4_proc_exit_net(struct net *net) { remove_proc_entry("udplite", net->proc_net); } static struct pernet_operations udplite4_net_ops = { .init = udplite4_proc_init_net, .exit = udplite4_proc_exit_net, }; static __init int udplite4_proc_init(void) { return register_pernet_subsys(&udplite4_net_ops); } #else static inline int udplite4_proc_init(void) { return 0; } #endif void __init udplite4_register(void) { udp_table_init(&udplite_table, "UDP-Lite"); if (proto_register(&udplite_prot, 1)) goto out_register_err; if (inet_add_protocol(&udplite_protocol, IPPROTO_UDPLITE) < 0) goto out_unregister_proto; inet_register_protosw(&udplite4_protosw); if (udplite4_proc_init()) pr_err("%s: Cannot register /proc!\n", __func__); return; out_unregister_proto: proto_unregister(&udplite_prot); out_register_err: pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__); }
44 32 12 7 50 66 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xdp #if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_XDP_H #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/tracepoint.h> #include <linux/bpf.h> #include <net/xdp.h> #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ FN(DROP) \ FN(PASS) \ FN(TX) \ FN(REDIRECT) #define __XDP_ACT_TP_FN(x) \ TRACE_DEFINE_ENUM(XDP_##x); #define __XDP_ACT_SYM_FN(x) \ { XDP_##x, #x }, #define __XDP_ACT_SYM_TAB \ __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL } __XDP_ACT_MAP(__XDP_ACT_TP_FN) TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), TP_ARGS(dev, xdp, act), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) ), TP_fast_assign( __entry->prog_id = xdp->aux->id; __entry->act = act; __entry->ifindex = dev->ifindex; ), TP_printk("prog_id=%d action=%s ifindex=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex) ); TRACE_EVENT(xdp_bulk_tx, TP_PROTO(const struct net_device *dev, int sent, int drops, int err), TP_ARGS(dev, sent, drops, err), TP_STRUCT__entry( __field(int, ifindex) __field(u32, act) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->ifindex = dev->ifindex; __entry->act = XDP_TX; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", __entry->ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); #ifndef __DEVMAP_OBJ_TYPE #define __DEVMAP_OBJ_TYPE struct _bpf_dtab_netdev { struct net_device *dev; }; #endif /* __DEVMAP_OBJ_TYPE */ DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) __field(int, err) __field(int, to_ifindex) __field(u32, map_id) __field(int, map_index) ), TP_fast_assign( u32 ifindex = 0, map_index = index; if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { /* Just leave to_ifindex to 0 if do broadcast redirect, * as tgt will be NULL. */ if (tgt) ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { ifindex = index; map_index = 0; } __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; __entry->to_ifindex = ifindex; __entry->map_id = map_id; __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_err(dev, xdp, to, err) \ trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) #define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) #define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, enum bpf_map_type map_type, u32 map_id, u32 index), TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats), TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) __field(unsigned int, xdp_pass) __field(unsigned int, xdp_drop) __field(unsigned int, xdp_redirect) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; __entry->xdp_pass = xdp_stats->pass; __entry->xdp_drop = xdp_stats->drop; __entry->xdp_redirect = xdp_stats->redirect; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " sched=%d" " xdp_pass=%u xdp_drop=%u xdp_redirect=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->sched, __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect) ); TRACE_EVENT(xdp_cpumap_enqueue, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int to_cpu), TP_ARGS(map_id, processed, drops, to_cpu), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, to_cpu) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->to_cpu = to_cpu; ), TP_printk("enqueue" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " to_cpu=%d", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->to_cpu) ); TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct net_device *from_dev, const struct net_device *to_dev, int sent, int drops, int err), TP_ARGS(from_dev, to_dev, sent, drops, err), TP_STRUCT__entry( __field(int, from_ifindex) __field(u32, act) __field(int, to_ifindex) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->from_ifindex = from_dev->ifindex; __entry->act = XDP_REDIRECT; __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ndo_xdp_xmit" " from_ifindex=%d to_ifindex=%d action=%s" " sent=%d drops=%d" " err=%d", __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); /* Expect users already include <net/xdp.h>, but not xdp_priv.h */ #include <net/xdp_priv.h> #define __MEM_TYPE_MAP(FN) \ FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); #define __MEM_TYPE_SYM_FN(x) \ { MEM_TYPE_##x, #x }, #define __MEM_TYPE_SYM_TAB \ __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 } __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, TP_PROTO(const struct xdp_mem_allocator *xa), TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; ), TP_printk("mem_id=%d mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator ) ); TRACE_EVENT(mem_connect, TP_PROTO(const struct xdp_mem_allocator *xa, const struct xdp_rxq_info *rxq), TP_ARGS(xa, rxq), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) __field(const struct xdp_rxq_info *, rxq) __field(int, ifindex) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; __entry->rxq = rxq; __entry->ifindex = rxq->dev->ifindex; ), TP_printk("mem_id=%d mem_type=%s allocator=%p" " ifindex=%d", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator, __entry->ifindex ) ); TRACE_EVENT(mem_return_failed, TP_PROTO(const struct xdp_mem_info *mem, const struct page *page), TP_ARGS(mem, page), TP_STRUCT__entry( __field(const struct page *, page) __field(u32, mem_id) __field(u32, mem_type) ), TP_fast_assign( __entry->page = page; __entry->mem_id = mem->id; __entry->mem_type = mem->type; ), TP_printk("mem_id=%d mem_type=%s page=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->page ) ); TRACE_EVENT(bpf_xdp_link_attach_failed, TP_PROTO(const char *msg), TP_ARGS(msg), TP_STRUCT__entry( __string(msg, msg) ), TP_fast_assign( __assign_str(msg); ), TP_printk("errmsg=%s", __get_str(msg)) ); #endif /* _TRACE_XDP_H */ #include <trace/define_trace.h>
123 80 122 111 1 2 112 114 114 3 111 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 * Phillip Lougher <phillip@squashfs.org.uk> * * decompressor.c */ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "decompressor.h" #include "squashfs.h" #include "page_actor.h" /* * This file (and decompressor.h) implements a decompressor framework for * Squashfs, allowing multiple decompressors to be easily supported */ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 }; #ifndef CONFIG_SQUASHFS_LZ4 static const struct squashfs_decompressor squashfs_lz4_comp_ops = { NULL, NULL, NULL, NULL, LZ4_COMPRESSION, "lz4", 0 }; #endif #ifndef CONFIG_SQUASHFS_LZO static const struct squashfs_decompressor squashfs_lzo_comp_ops = { NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 }; #endif #ifndef CONFIG_SQUASHFS_XZ static const struct squashfs_decompressor squashfs_xz_comp_ops = { NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 }; #endif #ifndef CONFIG_SQUASHFS_ZLIB static const struct squashfs_decompressor squashfs_zlib_comp_ops = { NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 }; #endif #ifndef CONFIG_SQUASHFS_ZSTD static const struct squashfs_decompressor squashfs_zstd_comp_ops = { NULL, NULL, NULL, NULL, ZSTD_COMPRESSION, "zstd", 0 }; #endif static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, &squashfs_lz4_comp_ops, &squashfs_lzo_comp_ops, &squashfs_xz_comp_ops, &squashfs_lzma_unsupported_comp_ops, &squashfs_zstd_comp_ops, &squashfs_unknown_comp_ops }; const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) { int i; for (i = 0; decompressor[i]->id; i++) if (id == decompressor[i]->id) break; return decompressor[i]; } static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; void *buffer = NULL, *comp_opts; struct squashfs_page_actor *actor = NULL; int length = 0; /* * Read decompressor specific options from file system if present */ if (SQUASHFS_COMP_OPTS(flags)) { buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buffer == NULL) { comp_opts = ERR_PTR(-ENOMEM); goto out; } actor = squashfs_page_actor_init(&buffer, 1, 0); if (actor == NULL) { comp_opts = ERR_PTR(-ENOMEM); goto out; } length = squashfs_read_data(sb, sizeof(struct squashfs_super_block), 0, NULL, actor); if (length < 0) { comp_opts = ERR_PTR(length); goto out; } } comp_opts = squashfs_comp_opts(msblk, buffer, length); out: kfree(actor); kfree(buffer); return comp_opts; } void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; void *stream, *comp_opts = get_comp_opts(sb, flags); if (IS_ERR(comp_opts)) return comp_opts; stream = msblk->thread_ops->create(msblk, comp_opts); if (IS_ERR(stream)) kfree(comp_opts); return stream; }
2 2 2 2 2 2 2 2 158 158 49 50 149 149 20 20 1022 72 2 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API for algorithms (i.e., low-level API). * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/workqueue.h> #include "internal.h" static LIST_HEAD(crypto_template_list); #ifdef CONFIG_CRYPTO_MANAGER_EXTRA_TESTS DEFINE_PER_CPU(bool, crypto_simd_disabled_for_test); EXPORT_PER_CPU_SYMBOL_GPL(crypto_simd_disabled_for_test); #endif static inline void crypto_check_module_sig(struct module *mod) { if (fips_enabled && mod && !module_sig_ok(mod)) panic("Module %s signature verification failed in FIPS mode\n", module_name(mod)); } static int crypto_check_alg(struct crypto_alg *alg) { crypto_check_module_sig(alg->cra_module); if (!alg->cra_name[0] || !alg->cra_driver_name[0]) return -EINVAL; if (alg->cra_alignmask & (alg->cra_alignmask + 1)) return -EINVAL; /* General maximums for all algs. */ if (alg->cra_alignmask > MAX_ALGAPI_ALIGNMASK) return -EINVAL; if (alg->cra_blocksize > MAX_ALGAPI_BLOCKSIZE) return -EINVAL; /* Lower maximums for specific alg types. */ if (!alg->cra_type && (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) == CRYPTO_ALG_TYPE_CIPHER) { if (alg->cra_alignmask > MAX_CIPHER_ALIGNMASK) return -EINVAL; if (alg->cra_blocksize > MAX_CIPHER_BLOCKSIZE) return -EINVAL; } if (alg->cra_priority < 0) return -EINVAL; refcount_set(&alg->cra_refcnt, 1); return 0; } static void crypto_free_instance(struct crypto_instance *inst) { inst->alg.cra_type->free(inst); } static void crypto_destroy_instance_workfn(struct work_struct *w) { struct crypto_instance *inst = container_of(w, struct crypto_instance, free_work); struct crypto_template *tmpl = inst->tmpl; crypto_free_instance(inst); crypto_tmpl_put(tmpl); } static void crypto_destroy_instance(struct crypto_alg *alg) { struct crypto_instance *inst = container_of(alg, struct crypto_instance, alg); INIT_WORK(&inst->free_work, crypto_destroy_instance_workfn); schedule_work(&inst->free_work); } /* * This function adds a spawn to the list secondary_spawns which * will be used at the end of crypto_remove_spawns to unregister * instances, unless the spawn happens to be one that is depended * on by the new algorithm (nalg in crypto_remove_spawns). * * This function is also responsible for resurrecting any algorithms * in the dependency chain of nalg by unsetting n->dead. */ static struct list_head *crypto_more_spawns(struct crypto_alg *alg, struct list_head *stack, struct list_head *top, struct list_head *secondary_spawns) { struct crypto_spawn *spawn, *n; spawn = list_first_entry_or_null(stack, struct crypto_spawn, list); if (!spawn) return NULL; n = list_prev_entry(spawn, list); list_move(&spawn->list, secondary_spawns); if (list_is_last(&n->list, stack)) return top; n = list_next_entry(n, list); if (!spawn->dead) n->dead = false; return &n->inst->alg.cra_users; } static void crypto_remove_instance(struct crypto_instance *inst, struct list_head *list) { struct crypto_template *tmpl = inst->tmpl; if (crypto_is_dead(&inst->alg)) return; inst->alg.cra_flags |= CRYPTO_ALG_DEAD; if (!tmpl || !crypto_tmpl_get(tmpl)) return; list_move(&inst->alg.cra_list, list); hlist_del(&inst->list); inst->alg.cra_destroy = crypto_destroy_instance; BUG_ON(!list_empty(&inst->alg.cra_users)); } /* * Given an algorithm alg, remove all algorithms that depend on it * through spawns. If nalg is not null, then exempt any algorithms * that is depended on by nalg. This is useful when nalg itself * depends on alg. */ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, struct crypto_alg *nalg) { u32 new_type = (nalg ?: alg)->cra_flags; struct crypto_spawn *spawn, *n; LIST_HEAD(secondary_spawns); struct list_head *spawns; LIST_HEAD(stack); LIST_HEAD(top); spawns = &alg->cra_users; list_for_each_entry_safe(spawn, n, spawns, list) { if ((spawn->alg->cra_flags ^ new_type) & spawn->mask) continue; list_move(&spawn->list, &top); } /* * Perform a depth-first walk starting from alg through * the cra_users tree. The list stack records the path * from alg to the current spawn. */ spawns = &top; do { while (!list_empty(spawns)) { struct crypto_instance *inst; spawn = list_first_entry(spawns, struct crypto_spawn, list); inst = spawn->inst; list_move(&spawn->list, &stack); spawn->dead = !spawn->registered || &inst->alg != nalg; if (!spawn->registered) break; BUG_ON(&inst->alg == alg); if (&inst->alg == nalg) break; spawns = &inst->alg.cra_users; /* * Even if spawn->registered is true, the * instance itself may still be unregistered. * This is because it may have failed during * registration. Therefore we still need to * make the following test. * * We may encounter an unregistered instance here, since * an instance's spawns are set up prior to the instance * being registered. An unregistered instance will have * NULL ->cra_users.next, since ->cra_users isn't * properly initialized until registration. But an * unregistered instance cannot have any users, so treat * it the same as ->cra_users being empty. */ if (spawns->next == NULL) break; } } while ((spawns = crypto_more_spawns(alg, &stack, &top, &secondary_spawns))); /* * Remove all instances that are marked as dead. Also * complete the resurrection of the others by moving them * back to the cra_users list. */ list_for_each_entry_safe(spawn, n, &secondary_spawns, list) { if (!spawn->dead) list_move(&spawn->list, &spawn->alg->cra_users); else if (spawn->registered) crypto_remove_instance(spawn->inst, list); } } EXPORT_SYMBOL_GPL(crypto_remove_spawns); static void crypto_alg_finish_registration(struct crypto_alg *alg, bool fulfill_requests, struct list_head *algs_to_put) { struct crypto_alg *q; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (q == alg) continue; if (crypto_is_moribund(q)) continue; if (crypto_is_larval(q)) { struct crypto_larval *larval = (void *)q; /* * Check to see if either our generic name or * specific name can satisfy the name requested * by the larval entry q. */ if (strcmp(alg->cra_name, q->cra_name) && strcmp(alg->cra_driver_name, q->cra_name)) continue; if (larval->adult) continue; if ((q->cra_flags ^ alg->cra_flags) & larval->mask) continue; if (fulfill_requests && crypto_mod_get(alg)) larval->adult = alg; else larval->adult = ERR_PTR(-EAGAIN); continue; } if (strcmp(alg->cra_name, q->cra_name)) continue; if (strcmp(alg->cra_driver_name, q->cra_driver_name) && q->cra_priority > alg->cra_priority) continue; crypto_remove_spawns(q, algs_to_put, alg); } crypto_notify(CRYPTO_MSG_ALG_LOADED, alg); } static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) { struct crypto_larval *larval; if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER) || IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) || (alg->cra_flags & CRYPTO_ALG_INTERNAL)) return NULL; /* No self-test needed */ larval = crypto_larval_alloc(alg->cra_name, alg->cra_flags | CRYPTO_ALG_TESTED, 0); if (IS_ERR(larval)) return larval; larval->adult = crypto_mod_get(alg); if (!larval->adult) { kfree(larval); return ERR_PTR(-ENOENT); } refcount_set(&larval->alg.cra_refcnt, 1); memcpy(larval->alg.cra_driver_name, alg->cra_driver_name, CRYPTO_MAX_ALG_NAME); larval->alg.cra_priority = alg->cra_priority; return larval; } static struct crypto_larval * __crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put) { struct crypto_alg *q; struct crypto_larval *larval; int ret = -EAGAIN; if (crypto_is_dead(alg)) goto err; INIT_LIST_HEAD(&alg->cra_users); ret = -EEXIST; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (q == alg) goto err; if (crypto_is_moribund(q)) continue; if (crypto_is_larval(q)) { if (!strcmp(alg->cra_driver_name, q->cra_driver_name)) goto err; continue; } if (!strcmp(q->cra_driver_name, alg->cra_name) || !strcmp(q->cra_driver_name, alg->cra_driver_name) || !strcmp(q->cra_name, alg->cra_driver_name)) goto err; } larval = crypto_alloc_test_larval(alg); if (IS_ERR(larval)) goto out; list_add(&alg->cra_list, &crypto_alg_list); if (larval) { /* No cheating! */ alg->cra_flags &= ~CRYPTO_ALG_TESTED; list_add(&larval->alg.cra_list, &crypto_alg_list); } else { alg->cra_flags |= CRYPTO_ALG_TESTED; crypto_alg_finish_registration(alg, true, algs_to_put); } out: return larval; err: larval = ERR_PTR(ret); goto out; } void crypto_alg_tested(const char *name, int err) { struct crypto_larval *test; struct crypto_alg *alg; struct crypto_alg *q; LIST_HEAD(list); bool best; down_write(&crypto_alg_sem); list_for_each_entry(q, &crypto_alg_list, cra_list) { if (crypto_is_moribund(q) || !crypto_is_larval(q)) continue; test = (struct crypto_larval *)q; if (!strcmp(q->cra_driver_name, name)) goto found; } pr_err("alg: Unexpected test result for %s: %d\n", name, err); goto unlock; found: q->cra_flags |= CRYPTO_ALG_DEAD; alg = test->adult; if (list_empty(&alg->cra_list)) goto complete; if (err == -ECANCELED) alg->cra_flags |= CRYPTO_ALG_FIPS_INTERNAL; else if (err) goto complete; else alg->cra_flags &= ~CRYPTO_ALG_FIPS_INTERNAL; alg->cra_flags |= CRYPTO_ALG_TESTED; /* * If a higher-priority implementation of the same algorithm is * currently being tested, then don't fulfill request larvals. */ best = true; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (crypto_is_moribund(q) || !crypto_is_larval(q)) continue; if (strcmp(alg->cra_name, q->cra_name)) continue; if (q->cra_priority > alg->cra_priority) { best = false; break; } } crypto_alg_finish_registration(alg, best, &list); complete: complete_all(&test->completion); unlock: up_write(&crypto_alg_sem); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_alg_tested); void crypto_remove_final(struct list_head *list) { struct crypto_alg *alg; struct crypto_alg *n; list_for_each_entry_safe(alg, n, list, cra_list) { list_del_init(&alg->cra_list); crypto_alg_put(alg); } } EXPORT_SYMBOL_GPL(crypto_remove_final); int crypto_register_alg(struct crypto_alg *alg) { struct crypto_larval *larval; LIST_HEAD(algs_to_put); bool test_started = false; int err; alg->cra_flags &= ~CRYPTO_ALG_DEAD; err = crypto_check_alg(alg); if (err) return err; down_write(&crypto_alg_sem); larval = __crypto_register_alg(alg, &algs_to_put); if (!IS_ERR_OR_NULL(larval)) { test_started = crypto_boot_test_finished(); larval->test_started = test_started; } up_write(&crypto_alg_sem); if (IS_ERR(larval)) return PTR_ERR(larval); if (test_started) crypto_wait_for_test(larval); crypto_remove_final(&algs_to_put); return 0; } EXPORT_SYMBOL_GPL(crypto_register_alg); static int crypto_remove_alg(struct crypto_alg *alg, struct list_head *list) { if (unlikely(list_empty(&alg->cra_list))) return -ENOENT; alg->cra_flags |= CRYPTO_ALG_DEAD; list_del_init(&alg->cra_list); crypto_remove_spawns(alg, list, NULL); return 0; } void crypto_unregister_alg(struct crypto_alg *alg) { int ret; LIST_HEAD(list); down_write(&crypto_alg_sem); ret = crypto_remove_alg(alg, &list); up_write(&crypto_alg_sem); if (WARN(ret, "Algorithm %s is not registered", alg->cra_driver_name)) return; if (WARN_ON(refcount_read(&alg->cra_refcnt) != 1)) return; if (alg->cra_destroy) alg->cra_destroy(alg); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_unregister_alg); int crypto_register_algs(struct crypto_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_alg(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_alg(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_algs); void crypto_unregister_algs(struct crypto_alg *algs, int count) { int i; for (i = 0; i < count; i++) crypto_unregister_alg(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_algs); int crypto_register_template(struct crypto_template *tmpl) { struct crypto_template *q; int err = -EEXIST; down_write(&crypto_alg_sem); crypto_check_module_sig(tmpl->module); list_for_each_entry(q, &crypto_template_list, list) { if (q == tmpl) goto out; } list_add(&tmpl->list, &crypto_template_list); err = 0; out: up_write(&crypto_alg_sem); return err; } EXPORT_SYMBOL_GPL(crypto_register_template); int crypto_register_templates(struct crypto_template *tmpls, int count) { int i, err; for (i = 0; i < count; i++) { err = crypto_register_template(&tmpls[i]); if (err) goto out; } return 0; out: for (--i; i >= 0; --i) crypto_unregister_template(&tmpls[i]); return err; } EXPORT_SYMBOL_GPL(crypto_register_templates); void crypto_unregister_template(struct crypto_template *tmpl) { struct crypto_instance *inst; struct hlist_node *n; struct hlist_head *list; LIST_HEAD(users); down_write(&crypto_alg_sem); BUG_ON(list_empty(&tmpl->list)); list_del_init(&tmpl->list); list = &tmpl->instances; hlist_for_each_entry(inst, list, list) { int err = crypto_remove_alg(&inst->alg, &users); BUG_ON(err); } up_write(&crypto_alg_sem); hlist_for_each_entry_safe(inst, n, list, list) { BUG_ON(refcount_read(&inst->alg.cra_refcnt) != 1); crypto_free_instance(inst); } crypto_remove_final(&users); } EXPORT_SYMBOL_GPL(crypto_unregister_template); void crypto_unregister_templates(struct crypto_template *tmpls, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_template(&tmpls[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_templates); static struct crypto_template *__crypto_lookup_template(const char *name) { struct crypto_template *q, *tmpl = NULL; down_read(&crypto_alg_sem); list_for_each_entry(q, &crypto_template_list, list) { if (strcmp(q->name, name)) continue; if (unlikely(!crypto_tmpl_get(q))) continue; tmpl = q; break; } up_read(&crypto_alg_sem); return tmpl; } struct crypto_template *crypto_lookup_template(const char *name) { return try_then_request_module(__crypto_lookup_template(name), "crypto-%s", name); } EXPORT_SYMBOL_GPL(crypto_lookup_template); int crypto_register_instance(struct crypto_template *tmpl, struct crypto_instance *inst) { struct crypto_larval *larval; struct crypto_spawn *spawn; u32 fips_internal = 0; LIST_HEAD(algs_to_put); int err; err = crypto_check_alg(&inst->alg); if (err) return err; inst->alg.cra_module = tmpl->module; inst->alg.cra_flags |= CRYPTO_ALG_INSTANCE; down_write(&crypto_alg_sem); larval = ERR_PTR(-EAGAIN); for (spawn = inst->spawns; spawn;) { struct crypto_spawn *next; if (spawn->dead) goto unlock; next = spawn->next; spawn->inst = inst; spawn->registered = true; fips_internal |= spawn->alg->cra_flags; crypto_mod_put(spawn->alg); spawn = next; } inst->alg.cra_flags |= (fips_internal & CRYPTO_ALG_FIPS_INTERNAL); larval = __crypto_register_alg(&inst->alg, &algs_to_put); if (IS_ERR(larval)) goto unlock; else if (larval) larval->test_started = true; hlist_add_head(&inst->list, &tmpl->instances); inst->tmpl = tmpl; unlock: up_write(&crypto_alg_sem); if (IS_ERR(larval)) return PTR_ERR(larval); if (larval) crypto_wait_for_test(larval); crypto_remove_final(&algs_to_put); return 0; } EXPORT_SYMBOL_GPL(crypto_register_instance); void crypto_unregister_instance(struct crypto_instance *inst) { LIST_HEAD(list); down_write(&crypto_alg_sem); crypto_remove_spawns(&inst->alg, &list, NULL); crypto_remove_instance(inst, &list); up_write(&crypto_alg_sem); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_unregister_instance); int crypto_grab_spawn(struct crypto_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { struct crypto_alg *alg; int err = -EAGAIN; if (WARN_ON_ONCE(inst == NULL)) return -EINVAL; /* Allow the result of crypto_attr_alg_name() to be passed directly */ if (IS_ERR(name)) return PTR_ERR(name); alg = crypto_find_alg(name, spawn->frontend, type | CRYPTO_ALG_FIPS_INTERNAL, mask); if (IS_ERR(alg)) return PTR_ERR(alg); down_write(&crypto_alg_sem); if (!crypto_is_moribund(alg)) { list_add(&spawn->list, &alg->cra_users); spawn->alg = alg; spawn->mask = mask; spawn->next = inst->spawns; inst->spawns = spawn; inst->alg.cra_flags |= (alg->cra_flags & CRYPTO_ALG_INHERITED_FLAGS); err = 0; } up_write(&crypto_alg_sem); if (err) crypto_mod_put(alg); return err; } EXPORT_SYMBOL_GPL(crypto_grab_spawn); void crypto_drop_spawn(struct crypto_spawn *spawn) { if (!spawn->alg) /* not yet initialized? */ return; down_write(&crypto_alg_sem); if (!spawn->dead) list_del(&spawn->list); up_write(&crypto_alg_sem); if (!spawn->registered) crypto_mod_put(spawn->alg); } EXPORT_SYMBOL_GPL(crypto_drop_spawn); static struct crypto_alg *crypto_spawn_alg(struct crypto_spawn *spawn) { struct crypto_alg *alg = ERR_PTR(-EAGAIN); struct crypto_alg *target; bool shoot = false; down_read(&crypto_alg_sem); if (!spawn->dead) { alg = spawn->alg; if (!crypto_mod_get(alg)) { target = crypto_alg_get(alg); shoot = true; alg = ERR_PTR(-EAGAIN); } } up_read(&crypto_alg_sem); if (shoot) { crypto_shoot_alg(target); crypto_alg_put(target); } return alg; } struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_tfm *tfm; alg = crypto_spawn_alg(spawn); if (IS_ERR(alg)) return ERR_CAST(alg); tfm = ERR_PTR(-EINVAL); if (unlikely((alg->cra_flags ^ type) & mask)) goto out_put_alg; tfm = __crypto_alloc_tfm(alg, type, mask); if (IS_ERR(tfm)) goto out_put_alg; return tfm; out_put_alg: crypto_mod_put(alg); return tfm; } EXPORT_SYMBOL_GPL(crypto_spawn_tfm); void *crypto_spawn_tfm2(struct crypto_spawn *spawn) { struct crypto_alg *alg; struct crypto_tfm *tfm; alg = crypto_spawn_alg(spawn); if (IS_ERR(alg)) return ERR_CAST(alg); tfm = crypto_create_tfm(alg, spawn->frontend); if (IS_ERR(tfm)) goto out_put_alg; return tfm; out_put_alg: crypto_mod_put(alg); return tfm; } EXPORT_SYMBOL_GPL(crypto_spawn_tfm2); int crypto_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_register_notifier); int crypto_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_unregister_notifier); struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb) { struct rtattr *rta = tb[0]; struct crypto_attr_type *algt; if (!rta) return ERR_PTR(-ENOENT); if (RTA_PAYLOAD(rta) < sizeof(*algt)) return ERR_PTR(-EINVAL); if (rta->rta_type != CRYPTOA_TYPE) return ERR_PTR(-EINVAL); algt = RTA_DATA(rta); return algt; } EXPORT_SYMBOL_GPL(crypto_get_attr_type); /** * crypto_check_attr_type() - check algorithm type and compute inherited mask * @tb: the template parameters * @type: the algorithm type the template would be instantiated as * @mask_ret: (output) the mask that should be passed to crypto_grab_*() * to restrict the flags of any inner algorithms * * Validate that the algorithm type the user requested is compatible with the * one the template would actually be instantiated as. E.g., if the user is * doing crypto_alloc_shash("cbc(aes)", ...), this would return an error because * the "cbc" template creates an "skcipher" algorithm, not an "shash" algorithm. * * Also compute the mask to use to restrict the flags of any inner algorithms. * * Return: 0 on success; -errno on failure */ int crypto_check_attr_type(struct rtattr **tb, u32 type, u32 *mask_ret) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); if ((algt->type ^ type) & algt->mask) return -EINVAL; *mask_ret = crypto_algt_inherited_mask(algt); return 0; } EXPORT_SYMBOL_GPL(crypto_check_attr_type); const char *crypto_attr_alg_name(struct rtattr *rta) { struct crypto_attr_alg *alga; if (!rta) return ERR_PTR(-ENOENT); if (RTA_PAYLOAD(rta) < sizeof(*alga)) return ERR_PTR(-EINVAL); if (rta->rta_type != CRYPTOA_ALG) return ERR_PTR(-EINVAL); alga = RTA_DATA(rta); alga->name[CRYPTO_MAX_ALG_NAME - 1] = 0; return alga->name; } EXPORT_SYMBOL_GPL(crypto_attr_alg_name); int crypto_inst_setname(struct crypto_instance *inst, const char *name, struct crypto_alg *alg) { if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", name, alg->cra_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", name, alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; return 0; } EXPORT_SYMBOL_GPL(crypto_inst_setname); void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen) { INIT_LIST_HEAD(&queue->list); queue->backlog = &queue->list; queue->qlen = 0; queue->max_qlen = max_qlen; } EXPORT_SYMBOL_GPL(crypto_init_queue); int crypto_enqueue_request(struct crypto_queue *queue, struct crypto_async_request *request) { int err = -EINPROGRESS; if (unlikely(queue->qlen >= queue->max_qlen)) { if (!(request->flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) { err = -ENOSPC; goto out; } err = -EBUSY; if (queue->backlog == &queue->list) queue->backlog = &request->list; } queue->qlen++; list_add_tail(&request->list, &queue->list); out: return err; } EXPORT_SYMBOL_GPL(crypto_enqueue_request); void crypto_enqueue_request_head(struct crypto_queue *queue, struct crypto_async_request *request) { if (unlikely(queue->qlen >= queue->max_qlen)) queue->backlog = queue->backlog->prev; queue->qlen++; list_add(&request->list, &queue->list); } EXPORT_SYMBOL_GPL(crypto_enqueue_request_head); struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue) { struct list_head *request; if (unlikely(!queue->qlen)) return NULL; queue->qlen--; if (queue->backlog != &queue->list) queue->backlog = queue->backlog->next; request = queue->list.next; list_del(request); return list_entry(request, struct crypto_async_request, list); } EXPORT_SYMBOL_GPL(crypto_dequeue_request); static inline void crypto_inc_byte(u8 *a, unsigned int size) { u8 *b = (a + size); u8 c; for (; size; size--) { c = *--b + 1; *b = c; if (c) break; } } void crypto_inc(u8 *a, unsigned int size) { __be32 *b = (__be32 *)(a + size); u32 c; if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || IS_ALIGNED((unsigned long)b, __alignof__(*b))) for (; size >= 4; size -= 4) { c = be32_to_cpu(*--b) + 1; *b = cpu_to_be32(c); if (likely(c)) return; } crypto_inc_byte(a, size); } EXPORT_SYMBOL_GPL(crypto_inc); unsigned int crypto_alg_extsize(struct crypto_alg *alg) { return alg->cra_ctxsize + (alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1)); } EXPORT_SYMBOL_GPL(crypto_alg_extsize); int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, u32 type, u32 mask) { int ret = 0; struct crypto_alg *alg = crypto_find_alg(name, frontend, type, mask); if (!IS_ERR(alg)) { crypto_mod_put(alg); ret = 1; } return ret; } EXPORT_SYMBOL_GPL(crypto_type_has_alg); static void __init crypto_start_tests(void) { if (!IS_BUILTIN(CONFIG_CRYPTO_ALGAPI)) return; if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) return; for (;;) { struct crypto_larval *larval = NULL; struct crypto_alg *q; down_write(&crypto_alg_sem); list_for_each_entry(q, &crypto_alg_list, cra_list) { struct crypto_larval *l; if (!crypto_is_larval(q)) continue; l = (void *)q; if (!crypto_is_test_larval(l)) continue; if (l->test_started) continue; l->test_started = true; larval = l; break; } up_write(&crypto_alg_sem); if (!larval) break; crypto_wait_for_test(larval); } set_crypto_boot_test_finished(); } static int __init crypto_algapi_init(void) { crypto_init_proc(); crypto_start_tests(); return 0; } static void __exit crypto_algapi_exit(void) { crypto_exit_proc(); } /* * We run this at late_initcall so that all the built-in algorithms * have had a chance to register themselves first. */ late_initcall(crypto_algapi_init); module_exit(crypto_algapi_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Cryptographic algorithms API"); MODULE_SOFTDEP("pre: cryptomgr");
1 1 1 24 7 17 15 17 5 3 2 2 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 // SPDX-License-Identifier: GPL-2.0 /* * Support for async notification of waitid */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "cancel.h" #include "waitid.h" #include "../kernel/exit.h" static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) struct io_waitid { struct file *file; int which; pid_t upid; int options; atomic_t refs; struct wait_queue_head *head; struct siginfo __user *infop; struct waitid_info info; }; static void io_waitid_free(struct io_kiocb *req) { struct io_waitid_async *iwa = req->async_data; put_pid(iwa->wo.wo_pid); kfree(req->async_data); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } #ifdef CONFIG_COMPAT static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) { struct compat_siginfo __user *infop; bool ret; infop = (struct compat_siginfo __user *) iw->infop; if (!user_write_access_begin(infop, sizeof(*infop))) return false; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &infop->si_code, Efault); unsafe_put_user(iw->info.pid, &infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &infop->si_uid, Efault); unsafe_put_user(iw->info.status, &infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } #endif static bool io_waitid_copy_si(struct io_kiocb *req, int signo) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); bool ret; if (!iw->infop) return true; #ifdef CONFIG_COMPAT if (req->ctx->compat) return io_waitid_compat_copy_si(iw, signo); #endif if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) return false; unsafe_put_user(signo, &iw->infop->si_signo, Efault); unsafe_put_user(0, &iw->infop->si_errno, Efault); unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault); unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault); unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault); unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault); ret = true; done: user_write_access_end(); return ret; Efault: ret = false; goto done; } static int io_waitid_finish(struct io_kiocb *req, int ret) { int signo = 0; if (ret > 0) { signo = SIGCHLD; ret = 0; } if (!io_waitid_copy_si(req, signo)) ret = -EFAULT; io_waitid_free(req); return ret; } static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); lockdep_assert_held(&req->ctx->uring_lock); hlist_del_init(&req->hash_node); ret = io_waitid_finish(req, ret); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); io_req_task_complete(req, &ts); } static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; /* * Mark us canceled regardless of ownership. This will prevent a * potential retry from a spurious wakeup. */ atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs); /* claim ownership */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return false; spin_lock_irq(&iw->head->lock); list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); return true; } int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { struct hlist_node *tmp; struct io_kiocb *req; int nr = 0; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) return -ENOENT; io_ring_submit_lock(ctx, issue_flags); hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { if (req->cqe.user_data != cd->data && !(cd->flags & IORING_ASYNC_CANCEL_ANY)) continue; if (__io_waitid_cancel(ctx, req)) nr++; if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) break; } io_ring_submit_unlock(ctx, issue_flags); if (nr) return nr; return -ENOENT; } bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; bool found = false; lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { if (!io_match_task_safe(req, task, cancel_all)) continue; hlist_del_init(&req->hash_node); __io_waitid_cancel(ctx, req); found = true; } return found; } static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; if (!atomic_sub_return(1, &iw->refs)) return false; /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); remove_wait_queue(iw->head, &iwa->wo.child_wait); return true; } static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) { struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; io_tw_lock(ctx, ts); ret = __do_wait(&iwa->wo); /* * If we get -ERESTARTSYS here, we need to re-arm and check again * to ensure we get another callback. If the retry works, then we can * just remove ourselves from the waitqueue again and finish the * request. */ if (unlikely(ret == -ERESTARTSYS)) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); /* Don't retry if cancel found it meanwhile */ ret = -ECANCELED; if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) { iw->head = &current->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* retry armed, drop our ref */ io_waitid_drop_issue_ref(req); return; } remove_wait_queue(iw->head, &iwa->wo.child_wait); } } io_waitid_complete(req, ret); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo); struct io_kiocb *req = iwa->req; struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct task_struct *p = key; if (!pid_child_should_wake(wo, p)) return 0; /* cancel is in progress */ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) return 1; req->io_task_work.func = io_waitid_cb; io_req_task_work_add(req); list_del_init(&wait->entry); return 1; } int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); return 0; } int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_ring_ctx *ctx = req->ctx; struct io_waitid_async *iwa; int ret; if (io_alloc_async_data(req)) return -ENOMEM; iwa = req->async_data; iwa->req = req; ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) goto done; /* * Mark the request as busy upfront, in case we're racing with the * wakeup. If we are, then we'll notice when we drop this initial * reference again after arming. */ atomic_set(&iw->refs, 1); /* * Cancel must hold the ctx lock, so there's no risk of cancelation * finding us until a) we remain on the list, and b) the lock is * dropped. We only need to worry about racing with the wakeup * callback. */ io_ring_submit_lock(ctx, issue_flags); hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); iwa->wo.child_wait.private = req->task; iw->head = &current->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); ret = __do_wait(&iwa->wo); if (ret == -ERESTARTSYS) { /* * Nobody else grabbed a reference, it'll complete when we get * a waitqueue callback, or if someone cancels it. */ if (!io_waitid_drop_issue_ref(req)) { io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } /* * Wakeup triggered, racing with us. It was prevented from * completing because of that, queue up the tw to do that. */ io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } hlist_del_init(&req->hash_node); remove_wait_queue(iw->head, &iwa->wo.child_wait); ret = io_waitid_finish(req, ret); io_ring_submit_unlock(ctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; }
1397 1400 1397 1 1 7 1 1 1 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 // SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #include "ipvlan.h" static unsigned int ipvlan_netid __read_mostly; struct ipvlan_netns { unsigned int ipvl_nf_hook_refcnt; }; static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb, struct net_device *dev) { struct ipvl_addr *addr = NULL; struct ipvl_port *port; int addr_type; void *lyr3h; if (!dev || !netif_is_ipvlan_port(dev)) goto out; port = ipvlan_port_get_rcu(dev); if (!port || port->mode != IPVLAN_MODE_L3S) goto out; lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type); if (!lyr3h) goto out; addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); out: return addr; } static struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, u16 proto) { struct ipvl_addr *addr; struct net_device *sdev; addr = ipvlan_skb_to_addr(skb, dev); if (!addr) goto out; sdev = addr->master->dev; switch (proto) { case AF_INET: { struct iphdr *ip4h = ip_hdr(skb); int err; err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, ip4h->tos, sdev); if (unlikely(err)) goto out; break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct dst_entry *dst; struct ipv6hdr *ip6h = ipv6_hdr(skb); int flags = RT6_LOOKUP_F_HAS_SADDR; struct flowi6 fl6 = { .flowi6_iif = sdev->ifindex, .daddr = ip6h->daddr, .saddr = ip6h->saddr, .flowlabel = ip6_flowinfo(ip6h), .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; skb_dst_drop(skb); dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, skb, flags); skb_dst_set(skb, dst); break; } #endif default: break; } out: return skb; } static const struct l3mdev_ops ipvl_l3mdev_ops = { .l3mdev_l3_rcv = ipvlan_l3_rcv, }; static unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct ipvl_addr *addr; unsigned int len; addr = ipvlan_skb_to_addr(skb, skb->dev); if (!addr) goto out; skb->dev = addr->master->dev; skb->skb_iif = skb->dev->ifindex; #if IS_ENABLED(CONFIG_IPV6) if (addr->atype == IPVL_IPV6) IP6CB(skb)->iif = skb->dev->ifindex; #endif len = skb->len + ETH_HLEN; ipvlan_count_rx(addr->master, len, true, false); out: return NF_ACCEPT; } static const struct nf_hook_ops ipvl_nfops[] = { { .hook = ipvlan_nf_input, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = INT_MAX, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = ipvlan_nf_input, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = INT_MAX, }, #endif }; static int ipvlan_register_nf_hook(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); int err = 0; if (!vnet->ipvl_nf_hook_refcnt) { err = nf_register_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); if (!err) vnet->ipvl_nf_hook_refcnt = 1; } else { vnet->ipvl_nf_hook_refcnt++; } return err; } static void ipvlan_unregister_nf_hook(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); if (WARN_ON(!vnet->ipvl_nf_hook_refcnt)) return; vnet->ipvl_nf_hook_refcnt--; if (!vnet->ipvl_nf_hook_refcnt) nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); } void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet) { struct ipvlan_netns *old_vnet; ASSERT_RTNL(); old_vnet = net_generic(oldnet, ipvlan_netid); if (!old_vnet->ipvl_nf_hook_refcnt) return; ipvlan_register_nf_hook(newnet); ipvlan_unregister_nf_hook(oldnet); } static void ipvlan_ns_exit(struct net *net) { struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid); if (WARN_ON_ONCE(vnet->ipvl_nf_hook_refcnt)) { vnet->ipvl_nf_hook_refcnt = 0; nf_unregister_net_hooks(net, ipvl_nfops, ARRAY_SIZE(ipvl_nfops)); } } static struct pernet_operations ipvlan_net_ops = { .id = &ipvlan_netid, .size = sizeof(struct ipvlan_netns), .exit = ipvlan_ns_exit, }; int ipvlan_l3s_init(void) { return register_pernet_subsys(&ipvlan_net_ops); } void ipvlan_l3s_cleanup(void) { unregister_pernet_subsys(&ipvlan_net_ops); } int ipvlan_l3s_register(struct ipvl_port *port) { struct net_device *dev = port->dev; int ret; ASSERT_RTNL(); ret = ipvlan_register_nf_hook(read_pnet(&port->pnet)); if (!ret) { dev->l3mdev_ops = &ipvl_l3mdev_ops; dev->priv_flags |= IFF_L3MDEV_RX_HANDLER; } return ret; } void ipvlan_l3s_unregister(struct ipvl_port *port) { struct net_device *dev = port->dev; ASSERT_RTNL(); dev->priv_flags &= ~IFF_L3MDEV_RX_HANDLER; ipvlan_unregister_nf_hook(read_pnet(&port->pnet)); dev->l3mdev_ops = NULL; }
5 5 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 /* * Copyright (c) 2014 Intel Corporation. All rights reserved. * Copyright (c) 2014 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "iwpm_util.h" static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser"; u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN; static int iwpm_user_pid = IWPM_PID_UNDEFINED; static atomic_t echo_nlmsg_seq; /** * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid * * Returns true if the pid is greater than zero, otherwise returns false */ int iwpm_valid_pid(void) { return iwpm_user_pid > 0; } /** * iwpm_register_pid - Send a netlink query to userspace * to get the iwarp port mapper pid * @pm_msg: Contains driver info to send to the userspace port mapper * @nl_client: The index of the netlink client * * nlmsg attributes: * [IWPM_NLA_REG_PID_SEQ] * [IWPM_NLA_REG_IF_NAME] * [IWPM_NLA_REG_IBDEV_NAME] * [IWPM_NLA_REG_ULIB_NAME] */ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) { struct sk_buff *skb = NULL; struct iwpm_nlmsg_request *nlmsg_request = NULL; struct nlmsghdr *nlh; u32 msg_seq; const char *err_str = ""; int ret = -EINVAL; if (iwpm_check_registration(nl_client, IWPM_REG_VALID) || iwpm_user_pid == IWPM_PID_UNAVAILABLE) return 0; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto pid_query_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); if (!nlmsg_request) { err_str = "Unable to allocate netlink request"; goto pid_query_error; } msg_seq = atomic_read(&echo_nlmsg_seq); /* fill in the pid request message */ err_str = "Unable to put attribute of the nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ); if (ret) goto pid_query_error; ret = ibnl_put_attr(skb, nlh, IFNAMSIZ, pm_msg->if_name, IWPM_NLA_REG_IF_NAME); if (ret) goto pid_query_error; ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE, pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME); if (ret) goto pid_query_error; ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE, (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME); if (ret) goto pid_query_error; nlmsg_end(skb, nlh); pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNAVAILABLE; err_str = "Unable to send a nlmsg"; goto pid_query_error; } nlmsg_request->req_buffer = pm_msg; ret = iwpm_wait_complete_req(nlmsg_request); return ret; pid_query_error: pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } /** * iwpm_add_mapping - Send a netlink add mapping request to * the userspace port mapper * @pm_msg: Contains the local ip/tcp address info to send * @nl_client: The index of the netlink client * * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] * [IWPM_NLA_MANAGE_FLAGS] * * If the request is successful, the pm_msg stores * the port mapper response (mapped address info) */ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { struct sk_buff *skb = NULL; struct iwpm_nlmsg_request *nlmsg_request = NULL; struct nlmsghdr *nlh; u32 msg_seq; const char *err_str = ""; int ret = -EINVAL; if (!iwpm_valid_pid()) return 0; if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { err_str = "Unregistered port mapper client"; goto add_mapping_error; } skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto add_mapping_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); if (!nlmsg_request) { err_str = "Unable to allocate netlink request"; goto add_mapping_error; } msg_seq = atomic_read(&echo_nlmsg_seq); /* fill in the add mapping message */ err_str = "Unable to put attribute of the nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MANAGE_MAPPING_SEQ); if (ret) goto add_mapping_error; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); if (ret) goto add_mapping_error; /* If flags are required and we're not V4, then return a quiet error */ if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { ret = -EINVAL; goto add_mapping_error_nowarn; } if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, IWPM_NLA_MANAGE_FLAGS); if (ret) goto add_mapping_error; } nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; err_str = "Unable to send a nlmsg"; goto add_mapping_error; } ret = iwpm_wait_complete_req(nlmsg_request); return ret; add_mapping_error: pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); add_mapping_error_nowarn: dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } /** * iwpm_add_and_query_mapping - Process the port mapper response to * iwpm_add_and_query_mapping request * @pm_msg: Contains the local ip/tcp address info to send * @nl_client: The index of the netlink client * * nlmsg attributes: * [IWPM_NLA_QUERY_MAPPING_SEQ] * [IWPM_NLA_QUERY_LOCAL_ADDR] * [IWPM_NLA_QUERY_REMOTE_ADDR] * [IWPM_NLA_QUERY_FLAGS] */ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { struct sk_buff *skb = NULL; struct iwpm_nlmsg_request *nlmsg_request = NULL; struct nlmsghdr *nlh; u32 msg_seq; const char *err_str = ""; int ret = -EINVAL; if (!iwpm_valid_pid()) return 0; if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) { err_str = "Unregistered port mapper client"; goto query_mapping_error; } ret = -ENOMEM; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto query_mapping_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL); if (!nlmsg_request) { err_str = "Unable to allocate netlink request"; goto query_mapping_error; } msg_seq = atomic_read(&echo_nlmsg_seq); /* fill in the query message */ err_str = "Unable to put attribute of the nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_QUERY_MAPPING_SEQ); if (ret) goto query_mapping_error; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR); if (ret) goto query_mapping_error; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); if (ret) goto query_mapping_error; /* If flags are required and we're not V4, then return a quite error */ if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { ret = -EINVAL; goto query_mapping_error_nowarn; } if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, IWPM_NLA_QUERY_FLAGS); if (ret) goto query_mapping_error; } nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; goto query_mapping_error; } ret = iwpm_wait_complete_req(nlmsg_request); return ret; query_mapping_error: pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); query_mapping_error_nowarn: dev_kfree_skb(skb); if (nlmsg_request) iwpm_free_nlmsg_request(&nlmsg_request->kref); return ret; } /** * iwpm_remove_mapping - Send a netlink remove mapping request * to the userspace port mapper * * @local_addr: Local ip/tcp address to remove * @nl_client: The index of the netlink client * * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] */ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; u32 msg_seq; const char *err_str = ""; int ret = -EINVAL; if (!iwpm_valid_pid()) return 0; if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) { err_str = "Unregistered port mapper client"; goto remove_mapping_error; } skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client); if (!skb) { ret = -ENOMEM; err_str = "Unable to create a nlmsg"; goto remove_mapping_error; } msg_seq = atomic_read(&echo_nlmsg_seq); nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); err_str = "Unable to put attribute of the nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MANAGE_MAPPING_SEQ); if (ret) goto remove_mapping_error; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), local_addr, IWPM_NLA_MANAGE_ADDR); if (ret) goto remove_mapping_error; nlmsg_end(skb, nlh); ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; err_str = "Unable to send a nlmsg"; goto remove_mapping_error; } iwpm_print_sockaddr(local_addr, "remove_mapping: Local sockaddr:"); return 0; remove_mapping_error: pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client); if (skb) dev_kfree_skb_any(skb); return ret; } /* netlink attribute policy for the received response to register pid request */ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 }, [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING, .len = IWPM_DEVNAME_SIZE - 1 }, [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING, .len = IWPM_ULIBNAME_SIZE - 1 }, [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 }, [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } }; /** * iwpm_register_pid_cb - Process the port mapper response to * iwpm_register_pid query * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) * * If successful, the function receives the userspace port mapper pid * which is used in future communication with the port mapper */ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct iwpm_nlmsg_request *nlmsg_request = NULL; struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX]; struct iwpm_dev_data *pm_msg; char *dev_name, *iwpm_name; u32 msg_seq; u8 nl_client; u16 iwpm_version; const char *msg_type = "Register Pid response"; if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX, resp_reg_policy, nltb, msg_type)) return -EINVAL; msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", __func__, msg_seq); return -EINVAL; } pm_msg = nlmsg_request->req_buffer; nl_client = nlmsg_request->nl_client; dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]); iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]); iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]); /* check device name, ulib name and version */ if (strcmp(pm_msg->dev_name, dev_name) || strcmp(iwpm_ulib_name, iwpm_name) || iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Incorrect info (dev = %s name = %s version = %u)\n", __func__, dev_name, iwpm_name, iwpm_version); nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; goto register_pid_response_exit; } iwpm_user_pid = cb->nlh->nlmsg_pid; iwpm_ulib_version = iwpm_version; if (iwpm_ulib_version < IWPM_UABI_VERSION) pr_warn_once("%s: Down level iwpmd/pid %d. Continuing...", __func__, iwpm_user_pid); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); iwpm_set_registration(nl_client, IWPM_REG_VALID); register_pid_response_exit: nlmsg_request->request_done = 1; /* always for found nlmsg_request */ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); barrier(); up(&nlmsg_request->sem); return 0; } /* netlink attribute policy for the received response to add mapping request */ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, [IWPM_NLA_RMANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } }; /** * iwpm_add_mapping_cb - Process the port mapper response to * iwpm_add_mapping request * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct iwpm_sa_data *pm_msg; struct iwpm_nlmsg_request *nlmsg_request = NULL; struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX]; struct sockaddr_storage *local_sockaddr; struct sockaddr_storage *mapped_sockaddr; const char *msg_type; u32 msg_seq; msg_type = "Add Mapping response"; if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX, resp_add_policy, nltb, msg_type)) return -EINVAL; atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", __func__, msg_seq); return -EINVAL; } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]); mapped_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]); if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; goto add_mapping_response_exit; } if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) { pr_info("%s: Sockaddr family doesn't match the requested one\n", __func__); nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; goto add_mapping_response_exit; } memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr, sizeof(*mapped_sockaddr)); iwpm_print_sockaddr(&pm_msg->loc_addr, "add_mapping: Local sockaddr:"); iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, "add_mapping: Mapped local sockaddr:"); add_mapping_response_exit: nlmsg_request->request_done = 1; /* always for found request */ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); barrier(); up(&nlmsg_request->sem); return 0; } /* netlink attribute policy for the response to add and query mapping request * and response with remote address info */ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 }, [IWPM_NLA_RQUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RQUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } }; /** * iwpm_add_and_query_mapping_cb - Process the port mapper response to * iwpm_add_and_query_mapping request * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct iwpm_sa_data *pm_msg; struct iwpm_nlmsg_request *nlmsg_request = NULL; struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; struct sockaddr_storage *local_sockaddr, *remote_sockaddr; struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; const char *msg_type; u32 msg_seq; u16 err_code; msg_type = "Query Mapping response"; if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, resp_query_policy, nltb, msg_type)) return -EINVAL; atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", __func__, msg_seq); return -EINVAL; } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]); if (err_code == IWPM_REMOTE_QUERY_REJECT) { pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n", __func__, cb->nlh->nlmsg_pid, msg_seq); nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT; } if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) || iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) { pr_info("%s: Incorrect local sockaddr\n", __func__); nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; goto query_mapping_response_exit; } if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { pr_info("%s: Sockaddr family doesn't match the requested one\n", __func__); nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; goto query_mapping_response_exit; } memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr, sizeof(*mapped_loc_sockaddr)); memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr, sizeof(*mapped_rem_sockaddr)); iwpm_print_sockaddr(&pm_msg->loc_addr, "query_mapping: Local sockaddr:"); iwpm_print_sockaddr(&pm_msg->mapped_loc_addr, "query_mapping: Mapped local sockaddr:"); iwpm_print_sockaddr(&pm_msg->rem_addr, "query_mapping: Remote sockaddr:"); iwpm_print_sockaddr(&pm_msg->mapped_rem_addr, "query_mapping: Mapped remote sockaddr:"); query_mapping_response_exit: nlmsg_request->request_done = 1; /* always for found request */ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); barrier(); up(&nlmsg_request->sem); return 0; } /** * iwpm_remote_info_cb - Process remote connecting peer address info, which * the port mapper has received from the connecting peer * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) * * Stores the IPv4/IPv6 address info in a hash table */ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX]; struct sockaddr_storage *local_sockaddr, *remote_sockaddr; struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr; struct iwpm_remote_info *rem_info; const char *msg_type; u8 nl_client; int ret = -EINVAL; msg_type = "Remote Mapping info"; if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX, resp_query_policy, nltb, msg_type)) return ret; nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); local_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]); if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family || mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) { pr_info("%s: Sockaddr family doesn't match the requested one\n", __func__); return ret; } rem_info = kzalloc(sizeof(struct iwpm_remote_info), GFP_ATOMIC); if (!rem_info) { ret = -ENOMEM; return ret; } memcpy(&rem_info->mapped_loc_sockaddr, mapped_loc_sockaddr, sizeof(struct sockaddr_storage)); memcpy(&rem_info->remote_sockaddr, remote_sockaddr, sizeof(struct sockaddr_storage)); memcpy(&rem_info->mapped_rem_sockaddr, mapped_rem_sockaddr, sizeof(struct sockaddr_storage)); rem_info->nl_client = nl_client; iwpm_add_remote_info(rem_info); iwpm_print_sockaddr(local_sockaddr, "remote_info: Local sockaddr:"); iwpm_print_sockaddr(mapped_loc_sockaddr, "remote_info: Mapped local sockaddr:"); iwpm_print_sockaddr(remote_sockaddr, "remote_info: Remote sockaddr:"); iwpm_print_sockaddr(mapped_rem_sockaddr, "remote_info: Mapped remote sockaddr:"); return ret; } /* netlink attribute policy for the received request for mapping info */ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING, .len = IWPM_ULIBNAME_SIZE - 1 }, [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } }; /** * iwpm_mapping_info_cb - Process a notification that the userspace * port mapper daemon is started * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) * * Using the received port mapper pid, send all the local mapping * info records to the userspace port mapper */ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX]; const char *msg_type = "Mapping Info response"; u8 nl_client; char *iwpm_name; u16 iwpm_version; int ret = -EINVAL; if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX, resp_mapinfo_policy, nltb, msg_type)) { pr_info("%s: Unable to parse nlmsg\n", __func__); return ret; } iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); if (strcmp(iwpm_ulib_name, iwpm_name) || iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Invalid port mapper name = %s version = %u\n", __func__, iwpm_name, iwpm_version); return ret; } nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); iwpm_user_pid = cb->nlh->nlmsg_pid; if (iwpm_ulib_version < IWPM_UABI_VERSION) pr_warn_once("%s: Down level iwpmd/pid %d. Continuing...", __func__, iwpm_user_pid); if (!iwpm_mapinfo_available()) return 0; pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid); return ret; } /* netlink attribute policy for the received mapping info ack */ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 }, [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 }, [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } }; /** * iwpm_ack_mapping_info_cb - Process the port mapper ack for * the provided local mapping info records * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) */ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX]; u32 mapinfo_send, mapinfo_ack; const char *msg_type = "Mapping Info Ack"; if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX, ack_mapinfo_policy, nltb, msg_type)) return -EINVAL; mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]); mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]); if (mapinfo_ack != mapinfo_send) pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n", __func__, mapinfo_send, mapinfo_ack); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); return 0; } /* netlink attribute policy for the received port mapper error message */ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 }, [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, }; /** * iwpm_mapping_error_cb - Process port mapper notification for error * * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) */ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct iwpm_nlmsg_request *nlmsg_request = NULL; int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); struct nlattr *nltb[IWPM_NLA_ERR_MAX]; u32 msg_seq; u16 err_code; const char *msg_type = "Mapping Error Msg"; if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX, map_error_policy, nltb, msg_type)) return -EINVAL; msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]); err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]); pr_info("%s: Received msg seq = %u err code = %u client = %d\n", __func__, msg_seq, err_code, nl_client); /* look for nlmsg_request */ nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { /* not all errors have associated requests */ pr_debug("Could not find matching req (seq = %u)\n", msg_seq); return 0; } atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); nlmsg_request->err_code = err_code; nlmsg_request->request_done = 1; /* always for found request */ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); barrier(); up(&nlmsg_request->sem); return 0; } /* netlink attribute policy for the received hello request */ static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 } }; /** * iwpm_hello_cb - Process a hello message from iwpmd * * @skb: The socket buffer * @cb: Contains the received message (payload and netlink header) * * Using the received port mapper pid, send the kernel's abi_version * after adjusting it to support the iwpmd version. */ int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *nltb[IWPM_NLA_HELLO_MAX]; const char *msg_type = "Hello request"; u8 nl_client; u16 abi_version; int ret = -EINVAL; if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb, msg_type)) { pr_info("%s: Unable to parse nlmsg\n", __func__); return ret; } abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]); nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version); pr_debug("Using ABI version %u\n", iwpm_ulib_version); iwpm_user_pid = cb->nlh->nlmsg_pid; ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version); return ret; }
11 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 // SPDX-License-Identifier: GPL-2.0 /****************************************************************************** * rtl871x_pwrctrl.c * * Copyright(c) 2007 - 2010 Realtek Corporation. All rights reserved. * Linux device driver for RTL8192SU * * Modifications for inclusion into the Linux staging tree are * Copyright(c) 2010 Larry Finger. All rights reserved. * * Contact information: * WLAN FAE <wlanfae@realtek.com> * Larry Finger <Larry.Finger@lwfinger.net> * ******************************************************************************/ #define _RTL871X_PWRCTRL_C_ #include "osdep_service.h" #include "drv_types.h" #include "osdep_intf.h" #define RTL8712_SDIO_LOCAL_BASE 0X10100000 #define SDIO_HCPWM (RTL8712_SDIO_LOCAL_BASE + 0x0081) void r8712_set_rpwm(struct _adapter *padapter, u8 val8) { u8 rpwm; struct pwrctrl_priv *pwrpriv = &padapter->pwrctrlpriv; if (pwrpriv->rpwm == val8) { if (pwrpriv->rpwm_retry == 0) return; } if (padapter->driver_stopped || padapter->surprise_removed) return; rpwm = val8 | pwrpriv->tog; switch (val8) { case PS_STATE_S1: pwrpriv->cpwm = val8; break; case PS_STATE_S2:/* only for USB normal powersave mode use, * temp mark some code. */ case PS_STATE_S3: case PS_STATE_S4: pwrpriv->cpwm = val8; break; default: break; } pwrpriv->rpwm_retry = 0; pwrpriv->rpwm = val8; r8712_write8(padapter, 0x1025FE58, rpwm); pwrpriv->tog += 0x80; } void r8712_set_ps_mode(struct _adapter *padapter, uint ps_mode, uint smart_ps) { struct pwrctrl_priv *pwrpriv = &padapter->pwrctrlpriv; if (ps_mode > PM_Card_Disable) return; /* if driver is in active state, we dont need set smart_ps.*/ if (ps_mode == PS_MODE_ACTIVE) smart_ps = 0; if ((pwrpriv->pwr_mode != ps_mode) || (pwrpriv->smart_ps != smart_ps)) { if (pwrpriv->pwr_mode == PS_MODE_ACTIVE) pwrpriv->bSleep = true; else pwrpriv->bSleep = false; pwrpriv->pwr_mode = ps_mode; pwrpriv->smart_ps = smart_ps; schedule_work(&pwrpriv->SetPSModeWorkItem); } } /* * Caller:ISR handler... * * This will be called when CPWM interrupt is up. * * using to update cpwn of drv; and drv will make a decision to up or * down pwr level */ void r8712_cpwm_int_hdl(struct _adapter *padapter, struct reportpwrstate_parm *preportpwrstate) { struct pwrctrl_priv *pwrpriv = &(padapter->pwrctrlpriv); struct cmd_priv *pcmdpriv = &(padapter->cmdpriv); if (pwrpriv->cpwm_tog == ((preportpwrstate->state) & 0x80)) return; del_timer(&padapter->pwrctrlpriv.rpwm_check_timer); mutex_lock(&pwrpriv->mutex_lock); pwrpriv->cpwm = (preportpwrstate->state) & 0xf; if (pwrpriv->cpwm >= PS_STATE_S2) { if (pwrpriv->alives & CMD_ALIVE) complete(&(pcmdpriv->cmd_queue_comp)); } pwrpriv->cpwm_tog = (preportpwrstate->state) & 0x80; mutex_unlock(&pwrpriv->mutex_lock); } static inline void register_task_alive(struct pwrctrl_priv *pwrctrl, uint tag) { pwrctrl->alives |= tag; } static inline void unregister_task_alive(struct pwrctrl_priv *pwrctrl, uint tag) { if (pwrctrl->alives & tag) pwrctrl->alives ^= tag; } static void _rpwm_check_handler (struct _adapter *padapter) { struct pwrctrl_priv *pwrpriv = &padapter->pwrctrlpriv; if (padapter->driver_stopped || padapter->surprise_removed) return; if (pwrpriv->cpwm != pwrpriv->rpwm) schedule_work(&pwrpriv->rpwm_workitem); } static void SetPSModeWorkItemCallback(struct work_struct *work) { struct pwrctrl_priv *pwrpriv = container_of(work, struct pwrctrl_priv, SetPSModeWorkItem); struct _adapter *padapter = container_of(pwrpriv, struct _adapter, pwrctrlpriv); if (!pwrpriv->bSleep) { mutex_lock(&pwrpriv->mutex_lock); if (pwrpriv->pwr_mode == PS_MODE_ACTIVE) r8712_set_rpwm(padapter, PS_STATE_S4); mutex_unlock(&pwrpriv->mutex_lock); } } static void rpwm_workitem_callback(struct work_struct *work) { struct pwrctrl_priv *pwrpriv = container_of(work, struct pwrctrl_priv, rpwm_workitem); struct _adapter *padapter = container_of(pwrpriv, struct _adapter, pwrctrlpriv); if (pwrpriv->cpwm != pwrpriv->rpwm) { mutex_lock(&pwrpriv->mutex_lock); r8712_read8(padapter, SDIO_HCPWM); pwrpriv->rpwm_retry = 1; r8712_set_rpwm(padapter, pwrpriv->rpwm); mutex_unlock(&pwrpriv->mutex_lock); } } static void rpwm_check_handler (struct timer_list *t) { struct _adapter *adapter = from_timer(adapter, t, pwrctrlpriv.rpwm_check_timer); _rpwm_check_handler(adapter); } void r8712_init_pwrctrl_priv(struct _adapter *padapter) { struct pwrctrl_priv *pwrctrlpriv = &padapter->pwrctrlpriv; memset((unsigned char *)pwrctrlpriv, 0, sizeof(struct pwrctrl_priv)); mutex_init(&pwrctrlpriv->mutex_lock); pwrctrlpriv->cpwm = PS_STATE_S4; pwrctrlpriv->pwr_mode = PS_MODE_ACTIVE; pwrctrlpriv->smart_ps = 0; pwrctrlpriv->tog = 0x80; /* clear RPWM to ensure driver and fw back to initial state. */ r8712_write8(padapter, 0x1025FE58, 0); INIT_WORK(&pwrctrlpriv->SetPSModeWorkItem, SetPSModeWorkItemCallback); INIT_WORK(&pwrctrlpriv->rpwm_workitem, rpwm_workitem_callback); timer_setup(&pwrctrlpriv->rpwm_check_timer, rpwm_check_handler, 0); } /* * Caller: r8712_cmd_thread * Check if the fw_pwrstate is okay for issuing cmd. * If not (cpwm should be is less than P2 state), then the sub-routine * will raise the cpwm to be greater than or equal to P2. * Calling Context: Passive * Return Value: * 0: r8712_cmd_thread can issue cmds to firmware afterwards. * -EINVAL: r8712_cmd_thread can not do anything. */ int r8712_register_cmd_alive(struct _adapter *padapter) { int res = 0; struct pwrctrl_priv *pwrctrl = &padapter->pwrctrlpriv; mutex_lock(&pwrctrl->mutex_lock); register_task_alive(pwrctrl, CMD_ALIVE); if (pwrctrl->cpwm < PS_STATE_S2) { r8712_set_rpwm(padapter, PS_STATE_S3); res = -EINVAL; } mutex_unlock(&pwrctrl->mutex_lock); return res; } /* * Caller: ISR * If ISR's txdone, * No more pkts for TX, * Then driver shall call this fun. to power down firmware again. */ void r8712_unregister_cmd_alive(struct _adapter *padapter) { struct pwrctrl_priv *pwrctrl = &padapter->pwrctrlpriv; mutex_lock(&pwrctrl->mutex_lock); unregister_task_alive(pwrctrl, CMD_ALIVE); if ((pwrctrl->cpwm > PS_STATE_S2) && (pwrctrl->pwr_mode > PS_MODE_ACTIVE)) { if ((pwrctrl->alives == 0) && (check_fwstate(&padapter->mlmepriv, _FW_UNDER_LINKING) != true)) { r8712_set_rpwm(padapter, PS_STATE_S0); } } mutex_unlock(&pwrctrl->mutex_lock); } void r8712_flush_rwctrl_works(struct _adapter *padapter) { struct pwrctrl_priv *pwrctrl = &padapter->pwrctrlpriv; flush_work(&pwrctrl->SetPSModeWorkItem); flush_work(&pwrctrl->rpwm_workitem); }
93 3 90 15310 15319 15319 16 15201 161 161 89 19 3 6 1 7 3 15 2 3 1 4 3 4 11 5 3 8 6 16 12 10 5 5 5 3 14 10 13 3 3 7 7 13 11 31 3 2 3 2 3 3 3 3 2 2 7 9 6 3 4 2 19 7 8 5 6 8 3 11 13 3 3 7 430 431 431 431 411 254 430 430 430 108 52 113 18 8 96 21 91 14 72 5 12 25 10 10 1 34 36 36 10 3 14 431 411 19 431 431 434 437 5 21 6 454 11 469 20 12 46 24 114 76 445 437 352 28 47 5 75 16 13 16 431 472 473 44 99 332 3 431 24 2 9 14 386 2 2 28 352 30 356 27 31 61 60 2 73 2 2 2 66 9 58 7 4 3 2 1 6 1 3 3 16 1 5 11 4 2 6 4 4 7 7 3 1 5 4 93 91 82 16 7 4 1 3 1 1 1 2 5 2 3 1 1 2 1 1 4 1 1 2 1 1 6 6 6 6 6 8 2 2 1 1 1 1 11 1 1 1 8 2 2 2 1 3 4 1 3 3 3 2 14 1 6 2 3 2 3 73 3 65 5 5 70 1 8 8 81 73 9 74 85 1 1 82 83 15 15 12 3 10 10 7 3 2 2 2 3 6 5 5 5 4 4 4 4 10 10 3 2 2 2 2 6 6 5 4 3 3 9 4 4 3 2 2 1 1 1 2 3 1 2 3 2 1 12 12 7 9 1 2 5 6 9 6 2 2 1 1 2 3 2 1 9 3 2 2 1 1 2 24 1 1 22 21 10 11 2 2 6 5 12 4 7 5 1 3 1 6 6 2 3 3 1 4 5 1 3 2 2 2 4 4 1 1 4 4 36 10 27 2 2 3 8 4 1 7 7 1 2 4 4 1 1 2 2 2 4 7 7 5 4 4 5 3 17 7 2 2 4 4 3 8 1 3 1 5 22 22 16 7 22 22 17 14 2 2 3 2 3 16 8 4 1 3 4 20 20 20 19 19 18 18 16 3 20 1 18 1 19 14 14 7 7 60 1 1 1 1 6 1 5 4 4 4 4 4 4 3 1 1 1 1 2 4 5 2 2 1 1 3 3 3 3 9 5 4 9 9 9 9 1 1 9 9 9 1 1 1 1 1 1 6 3 7 1 6 5 1 4 2 2 5 1 4 2 2 1 1 2 2 1 1 29 29 5 16 16 2 2 1 1199 25 4 4 1 2 1 1 4 10 55 3 3 1 1 1 1 1 1 1 1 1 12 12 19 125 2 1 3 6 1 112 1 109 3 1 1 2 1 2 1 1 1 1 1 1 1 1 1 94 156 2 2 2 7 3 1 5 1 1 2 2 1 2 1 3 2 1 1 1 1 1 1 4 2 2 3 1 3 2 1 3 1 3 2 2 2 1 1 1 1 1 1 2 1 1 1 1 3 1 1 7 1 1 1 1 1 1 1 82 98 2 5 1 3 2 7 10 8 2 2 3 2 2 1 1 1 1 1 1 1 1 3 1 56 2 79 2 1 1 1 2 1 3 1 1 1 1 1 1 1 61 1 34 1 1 1 3 1 1 1 1 1 1 1 2 2 17 42 2 1 3 1 2 2 3 2 1 2 1 1 1 1 1 17 1 21 3 18 1 66 3 4 4 1 1 2 7 3 44 2 35 2 33 41 1 1 1 2 2 9 1 1 1 1 1 2 2 2 2 15 11 2 2 1 6 379 40 265 121 136 9 11 1 4 22 79 128 86 1 41 73 73 13 28 34 6 58 130 69 35 30 64 61 7 27 2 94 2 2 79 72 5 11 66 5 61 118 240 6 14 6 12 5 8 194 194 174 127 51 173 1 4 17 1 1 5 5 1 4 47 48 154 1 1 153 22 80 10 28 132 1 1 80 3 1 47 1 2 6 111 1 1 4 3 20 20 3 3 3 1 2 74 53 52 104 24 78 9 83 107 1 1 66 6 2 1 31 64 42 1 41 17 14 19 16 14 1 1 1 1 152 78 12 1 1 6 2 1 1 3 3 1 1 1 2 2 1 16 1 1 1 2 3 1 1 1 2 2 1 50 27 1 3 1 1 1 1 2 3 62 63 6 1 1 85 62 1 4 3 1 1 1 77 14 1 4 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 1 38 22 5 22 22 30 9 4 1 2 1 7 2 1 1 3 17 1 1 2 2 1 10 37 1 1 21 4 3 1 4 4 6 17 20 1 4 2 5 3 3 11 2 1 7 1 77 1 1 72 3 65 1 2 2 62 31 5 3 5 4 3 2 4 4 111 101 3 380 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * * Based on the design of the Berkeley Packet Filter. The new * internal format has been designed by PLUMgrid: * * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com * * Authors: * * Jay Schulist <jschlst@samba.org> * Alexei Starovoitov <ast@plumgrid.com> * Daniel Borkmann <dborkman@redhat.com> * * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include <linux/atomic.h> #include <linux/bpf_verifier.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/sock_diag.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/gfp.h> #include <net/inet_common.h> #include <net/ip.h> #include <net/protocol.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/skmsg.h> #include <net/sock.h> #include <net/flow_dissector.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/filter.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> #include <linux/bpf.h> #include <linux/btf.h> #include <net/sch_generic.h> #include <net/cls_cgroup.h> #include <net/dst_metadata.h> #include <net/dst.h> #include <net/sock_reuseport.h> #include <net/busy_poll.h> #include <net/tcp.h> #include <net/xfrm.h> #include <net/udp.h> #include <linux/bpf_trace.h> #include <net/xdp_sock.h> #include <linux/inetdevice.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/flow.h> #include <net/arp.h> #include <net/ipv6.h> #include <net/net_namespace.h> #include <linux/seg6_local.h> #include <net/seg6.h> #include <net/seg6_local.h> #include <net/lwtunnel.h> #include <net/ipv6_stubs.h> #include <net/bpf_sk_storage.h> #include <net/transp_v6.h> #include <linux/btf_ids.h> #include <net/tls.h> #include <net/xdp.h> #include <net/mptcp.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netkit.h> #include <linux/un.h> #include <net/xdp_sock_drv.h> #include "dev.h" /* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { if (in_compat_syscall()) { struct compat_sock_fprog f32; if (len != sizeof(f32)) return -EINVAL; if (copy_from_sockptr(&f32, src, sizeof(f32))) return -EFAULT; memset(dst, 0, sizeof(*dst)); dst->len = f32.len; dst->filter = compat_ptr(f32.filter); } else { if (len != sizeof(*dst)) return -EINVAL; if (copy_from_sockptr(dst, src, sizeof(*dst))) return -EFAULT; } return 0; } EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); /** * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter * @cap: limit on how short the eBPF program may trim the packet * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) { int err; struct sk_filter *filter; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); if (err) return err; err = security_sock_rcv_skb(sk, skb); if (err) return err; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { struct sock *save_sk = skb->sk; unsigned int pkt_len; skb->sk = sk; pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(sk_filter_trim_cap); BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = (struct nlattr *) &skb->data[a]; if (!nla_ok(nla, skb->len - a)) return 0; nla = nla_find_nested(nla, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { u8 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { if (headlen - offset >= len) return *(u8 *)(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return tmp; } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return *(u8 *)ptr; } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { __be16 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { if (headlen - offset >= len) return get_unaligned_be16(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be16_to_cpu(tmp); } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return get_unaligned_be16(ptr); } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { __be32 tmp, *ptr; const int len = sizeof(tmp); if (likely(offset >= 0)) { if (headlen - offset >= len) return get_unaligned_be32(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be32_to_cpu(tmp); } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return get_unaligned_be32(ptr); } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, offset); } static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; switch (skb_field) { case SKF_AD_MARK: BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, mark)); break; case SKF_AD_PKTTYPE: *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); #endif break; case SKF_AD_QUEUE: BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2); *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, queue_mapping)); break; case SKF_AD_VLAN_TAG: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); break; case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, vlan_all)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1); break; } return insn - insn_buf; } static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2); /* A = *(u16 *) (CTX + offsetof(protocol)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, protocol)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PKTTYPE: cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); *insn++ = BPF_EXIT_INSN(); if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, ifindex)); else *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, type)); break; case SKF_AD_OFF + SKF_AD_MARK: cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, hash)); break; case SKF_AD_OFF + SKF_AD_QUEUE: cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: cnt = convert_skb_access(SKF_AD_VLAN_TAG, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TPID: BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2); /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, vlan_proto)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: case SKF_AD_OFF + SKF_AD_NLATTR: case SKF_AD_OFF + SKF_AD_NLATTR_NEST: case SKF_AD_OFF + SKF_AD_CPU: case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = CTX */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); /* arg2 = A */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); /* arg3 = X */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); bpf_user_rnd_init_once(); break; } break; case SKF_AD_OFF + SKF_AD_ALU_XOR_X: /* A ^= X */ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); break; default: /* This is just a dummy call to avoid letting the compiler * evict __bpf_call_base() as an optimization. Placed here * where no-one bothers. */ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); return false; } *insnp = insn; return true; } static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) { const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); bool endian = BPF_SIZE(fp->code) == BPF_H || BPF_SIZE(fp->code) == BPF_W; bool indirect = BPF_MODE(fp->code) == BPF_IND; const int ip_align = NET_IP_ALIGN; struct bpf_insn *insn = *insnp; int offset = fp->k; if (!indirect && ((unaligned_ok && offset >= 0) || (!unaligned_ok && offset >= 0 && offset + ip_align >= 0 && offset + ip_align % size == 0))) { bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); if (offset) *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_TMP, 0); } if (endian) *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); *insn++ = BPF_JMP_A(8); } *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); if (fp->k) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); } switch (BPF_SIZE(fp->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); break; default: return false; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn = BPF_EXIT_INSN(); *insnp = insn; return true; } /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_prog *new_prog, int *new_len, bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); if (len <= 0 || len > BPF_MAXINSNS) return -EINVAL; if (new_prog) { first_insn = new_prog->insnsi; addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } do_pass: new_insn = first_insn; fp = prog; /* Classic BPF related prologue emission. */ if (new_prog) { /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); if (*seen_ld_abs) { /* For packet access in classic BPF, cache skb->data * in callee-saved BPF R8 and skb->len - skb->data_len * (headlen) in BPF R9. Since classic BPF is read-only * on CTX, we only need to cache it once. */ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), BPF_REG_D, BPF_REG_CTX, offsetof(struct sk_buff, data)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, offsetof(struct sk_buff, len)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, data_len)); *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - first_insn; switch (fp->code) { /* All arithmetic insns and skb loads map as-is. */ case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_MOD | BPF_K: case BPF_ALU | BPF_NEG: case BPF_LD | BPF_ABS | BPF_W: case BPF_LD | BPF_ABS | BPF_H: case BPF_LD | BPF_ABS | BPF_B: case BPF_LD | BPF_IND | BPF_W: case BPF_LD | BPF_IND | BPF_H: case BPF_LD | BPF_IND | BPF_B: /* Check for overloaded BPF extension and * directly convert it if found, otherwise * just move on with mapping. */ if (BPF_CLASS(fp->code) == BPF_LD && BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; if (BPF_CLASS(fp->code) == BPF_LD && convert_bpf_ld_abs(fp, &insn)) { *seen_ld_abs = true; break; } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); /* Error with exception code on div/mod by 0. * For cBPF programs, this was always return 0. */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn++ = BPF_EXIT_INSN(); } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; /* Jump transformation cannot use BPF block macros * everywhere as offset calculation and target updates * require a bit more work than the rest, i.e. jump * opcodes map as-is, but offsets need adjustment. */ #define BPF_EMIT_JMP \ do { \ const s32 off_min = S16_MIN, off_max = S16_MAX; \ s32 off; \ \ if (target >= len || target < 0) \ goto err; \ off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ /* Adjust pc relative offset for 2nd or 3rd insn. */ \ off -= insn - tmp_insns; \ /* Reject anything not fitting into insn->off. */ \ if (off < off_min || off > off_max) \ goto err; \ insn->off = off; \ } while (0) case BPF_JMP | BPF_JA: target = i + fp->k + 1; insn->code = fp->code; BPF_EMIT_JMP; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { /* BPF immediates are signed, zero extend * immediate into tmp register and use it * in compare insn. */ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); insn->dst_reg = BPF_REG_A; insn->src_reg = BPF_REG_TMP; bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ if (fp->jf == 0) { insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; target = i + fp->jt + 1; BPF_EMIT_JMP; break; } /* Convert some jumps when 'jump_true' is next insn. */ if (fp->jt == 0) { switch (BPF_OP(fp->code)) { case BPF_JEQ: insn->code = BPF_JMP | BPF_JNE | bpf_src; break; case BPF_JGT: insn->code = BPF_JMP | BPF_JLE | bpf_src; break; case BPF_JGE: insn->code = BPF_JMP | BPF_JLT | bpf_src; break; default: goto jmp_rest; } target = i + fp->jf + 1; BPF_EMIT_JMP; break; } jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; BPF_EMIT_JMP; insn++; insn->code = BPF_JMP | BPF_JA; target = i + fp->jf + 1; BPF_EMIT_JMP; break; /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, .k = fp->k, }; *seen_ld_abs = true; /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ convert_bpf_ld_abs(&tmp, &insn); insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); /* tmp = X */ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: if (BPF_RVAL(fp->code) == BPF_K) *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 0, fp->k); *insn = BPF_EXIT_INSN(); break; /* Store to stack. */ case BPF_ST: case BPF_STX: stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, -stack_off); /* check_load_and_stores() verifies that classic BPF can * load from stack only after write, so tracking * stack_depth for ST|STX insns is enough */ if (new_prog && new_prog->aux->stack_depth < stack_off) new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, -stack_off); break; /* A = K or X = K */ case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, fp->k); break; /* X = A */ case BPF_MISC | BPF_TAX: *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); break; /* A = X */ case BPF_MISC | BPF_TXA: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); break; /* A = skb->len or X = skb->len */ case BPF_LD | BPF_W | BPF_LEN: case BPF_LDX | BPF_W | BPF_LEN: *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_CTX, offsetof(struct sk_buff, len)); break; /* Access seccomp_data fields. */ case BPF_LDX | BPF_ABS | BPF_W: /* A = *(u32 *) (ctx + K) */ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; /* Unknown instruction. */ default: goto err; } insn++; if (new_prog) memcpy(new_insn, tmp_insns, sizeof(*insn) * (insn - tmp_insns)); new_insn += insn - tmp_insns; } if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; if (*seen_ld_abs) *new_len += 4; /* Prologue bits. */ return 0; } pass++; if (new_flen != new_insn - first_insn) { new_flen = new_insn - first_insn; if (pass > 2) goto err; goto do_pass; } kfree(addrs); BUG_ON(*new_len != new_flen); return 0; err: kfree(addrs); return -EINVAL; } /* Security: * * As we dont want to clear mem[] array for each packet going through * __bpf_prog_run(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. */ static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { case BPF_ST: case BPF_STX: memvalid |= (1 << filter[pc].k); break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: if (!(memvalid & (1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; case BPF_JMP | BPF_JA: /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; break; } } error: kfree(masks); return ret; } static bool chk_code_allowed(u16 code_to_probe) { static const bool codes[] = { /* 32 bit ALU operations */ [BPF_ALU | BPF_ADD | BPF_K] = true, [BPF_ALU | BPF_ADD | BPF_X] = true, [BPF_ALU | BPF_SUB | BPF_K] = true, [BPF_ALU | BPF_SUB | BPF_X] = true, [BPF_ALU | BPF_MUL | BPF_K] = true, [BPF_ALU | BPF_MUL | BPF_X] = true, [BPF_ALU | BPF_DIV | BPF_K] = true, [BPF_ALU | BPF_DIV | BPF_X] = true, [BPF_ALU | BPF_MOD | BPF_K] = true, [BPF_ALU | BPF_MOD | BPF_X] = true, [BPF_ALU | BPF_AND | BPF_K] = true, [BPF_ALU | BPF_AND | BPF_X] = true, [BPF_ALU | BPF_OR | BPF_K] = true, [BPF_ALU | BPF_OR | BPF_X] = true, [BPF_ALU | BPF_XOR | BPF_K] = true, [BPF_ALU | BPF_XOR | BPF_X] = true, [BPF_ALU | BPF_LSH | BPF_K] = true, [BPF_ALU | BPF_LSH | BPF_X] = true, [BPF_ALU | BPF_RSH | BPF_K] = true, [BPF_ALU | BPF_RSH | BPF_X] = true, [BPF_ALU | BPF_NEG] = true, /* Load instructions */ [BPF_LD | BPF_W | BPF_ABS] = true, [BPF_LD | BPF_H | BPF_ABS] = true, [BPF_LD | BPF_B | BPF_ABS] = true, [BPF_LD | BPF_W | BPF_LEN] = true, [BPF_LD | BPF_W | BPF_IND] = true, [BPF_LD | BPF_H | BPF_IND] = true, [BPF_LD | BPF_B | BPF_IND] = true, [BPF_LD | BPF_IMM] = true, [BPF_LD | BPF_MEM] = true, [BPF_LDX | BPF_W | BPF_LEN] = true, [BPF_LDX | BPF_B | BPF_MSH] = true, [BPF_LDX | BPF_IMM] = true, [BPF_LDX | BPF_MEM] = true, /* Store instructions */ [BPF_ST] = true, [BPF_STX] = true, /* Misc instructions */ [BPF_MISC | BPF_TAX] = true, [BPF_MISC | BPF_TXA] = true, /* Return instructions */ [BPF_RET | BPF_K] = true, [BPF_RET | BPF_A] = true, /* Jump instructions */ [BPF_JMP | BPF_JA] = true, [BPF_JMP | BPF_JEQ | BPF_K] = true, [BPF_JMP | BPF_JEQ | BPF_X] = true, [BPF_JMP | BPF_JGE | BPF_K] = true, [BPF_JMP | BPF_JGE | BPF_X] = true, [BPF_JMP | BPF_JGT | BPF_K] = true, [BPF_JMP | BPF_JGT | BPF_X] = true, [BPF_JMP | BPF_JSET | BPF_K] = true, [BPF_JMP | BPF_JSET | BPF_X] = true, }; if (code_to_probe >= ARRAY_SIZE(codes)) return false; return codes[code_to_probe]; } static bool bpf_check_basics_ok(const struct sock_filter *filter, unsigned int flen) { if (filter == NULL) return false; if (flen == 0 || flen > BPF_MAXINSNS) return false; return true; } /** * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain * no references or jumps that are out of range, no illegal * instructions, and must end with a RET instruction. * * All jumps are forward as they are not signed. * * Returns 0 if the rule set is legal or -EINVAL if not. */ static int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) return -EINVAL; /* Some instructions need special checks */ switch (ftest->code) { case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; break; case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: if (ftest->k >= 32) return -EINVAL; break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: case BPF_STX: /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; case BPF_JMP | BPF_JA: /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. --ANK (981016) */ if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: anc_found = false; if (bpf_anc_helper(ftest) & BPF_ANC) anc_found = true; /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } } /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { case BPF_RET | BPF_K: case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } return -EINVAL; } static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); if (!fp->orig_prog) return -ENOMEM; fkprog = fp->orig_prog; fkprog->len = fprog->len; fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; } return 0; } static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; if (fprog) { kfree(fprog->filter); kfree(fprog); } } static void __bpf_prog_release(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); bpf_prog_free(prog); } } static void __sk_filter_release(struct sk_filter *fp) { __bpf_prog_release(fp->prog); kfree(fp); } /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); __sk_filter_release(fp); } /** * sk_filter_release - release a socket filter * @fp: filter to remove * * Remove a filter from a socket and release its resources. */ static void sk_filter_release(struct sk_filter *fp) { if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); } /* try to charge the socket memory if there is space available * return true on success */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { if (!refcount_inc_not_zero(&fp->refcnt)) return false; if (!__sk_filter_charge(sk, fp)) { sk_filter_release(fp); return false; } return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it won't be used at * this point in time anymore internally after the migration to the eBPF * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; } /* 1st pass: calculate the new program length. */ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs); if (err) goto out_err_free; /* Expand fp for appending the new filter representation. */ old_fp = fp; fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. */ fp = old_fp; err = -ENOMEM; goto out_err_free; } fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = bpf_convert_filter(old_prog, old_len, fp, &new_len, &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released * by krealloc(). */ goto out_err_free; fp = bpf_prog_select_runtime(fp, &err); if (err) goto out_err_free; kfree(old_prog); return fp; out_err_free: kfree(old_prog); out_err: __bpf_prog_release(fp); return ERR_PTR(err); } static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans) { int err; fp->bpf_func = NULL; fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } /* There might be additional checks and transformations * needed on classic filters, f.e. in case of seccomp. */ if (trans) { err = trans(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } } /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ bpf_jit_compile(fp); /* JIT compiler couldn't process this filter, so do the eBPF translation * for the optimized interpreter. */ if (!fp->jited) fp = bpf_migrate_filter(fp); return fp; } /** * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold * a copy here, and can spare us the work. */ fp->orig_prog = NULL; /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create); /** * bpf_prog_create_from_user - create an unattached filter from user buffer * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; int err; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { __bpf_prog_free(fp); return -EFAULT; } fp->len = fprog->len; fp->orig_prog = NULL; if (save_orig) { err = bpf_prog_store_orig_filter(fp, fprog); if (err) { __bpf_prog_free(fp); return -ENOMEM; } } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, trans); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); } EXPORT_SYMBOL_GPL(bpf_prog_destroy); static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) { struct sk_filter *fp, *old_fp; fp = kmalloc(sizeof(*fp), GFP_KERNEL); if (!fp) return -ENOMEM; fp->prog = prog; if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } refcount_set(&fp->refcnt, 1); old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) sk_filter_uncharge(sk, old_fp); return 0; } static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return ERR_PTR(-EINVAL); prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!prog) return ERR_PTR(-ENOMEM); if (copy_from_user(prog->insns, fprog->filter, fsize)) { __bpf_prog_free(prog); return ERR_PTR(-EFAULT); } prog->len = fprog->len; err = bpf_prog_store_orig_filter(prog, fprog); if (err) { __bpf_prog_free(prog); return ERR_PTR(-ENOMEM); } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ return bpf_prepare_filter(prog, NULL); } /** * sk_attach_filter - attach a socket filter * @fprog: the filter program * @sk: the socket to use * * Attach the user's filter code. We first run some sanity checks on * it to make sure it does not explode on us later. If an error * occurs or there is insufficient memory for the filter a negative * errno code is returned. On success the return is zero. */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { __bpf_prog_release(prog); return err; } return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); if (err) __bpf_prog_release(prog); return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog = __get_bpf(ufd, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); return err; } return 0; } int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); if (PTR_ERR(prog) == -EINVAL) prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { /* Like other non BPF_PROG_TYPE_SOCKET_FILTER * bpf prog (e.g. sockmap). It depends on the * limitation imposed by bpf_prog_load(). * Hence, sysctl_optmem_max is not checked. */ if ((sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) || (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_TCP) || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { err = -ENOTSUPP; goto err_prog_put; } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } } err = reuseport_attach_prog(sk, prog); err_prog_put: if (err) bpf_prog_put(prog); return err; } void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) bpf_prog_put(prog); else bpf_prog_destroy(prog); } struct bpf_scratchpad { union { __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; u8 buff[MAX_BPF_STACK]; }; local_lock_t bh_lock; }; static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { #ifdef CONFIG_DEBUG_NET /* Avoid a splat in pskb_may_pull_reason() */ if (write_len > INT_MAX) return -EINVAL; #endif return skb_ensure_writable(skb, write_len); } static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { int err = __bpf_try_make_writable(skb, write_len); bpf_compute_data_pointers(skb); return err; } static int bpf_try_make_head_writable(struct sk_buff *skb) { return bpf_try_make_writable(skb, skb_headlen(skb)); } static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); } static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len, u64, flags) { void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; if (unlikely(offset > INT_MAX)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; ptr = skb->data + offset; if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpull_rcsum(skb, ptr, len, offset); memcpy(ptr, from, len); if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpush_rcsum(skb, ptr, len, offset); if (flags & BPF_F_INVALIDATE_HASH) skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return ____bpf_skb_store_bytes(skb, offset, from, len, flags); } BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > INT_MAX)) goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return ____bpf_skb_load_bytes(skb, offset, to, len); } BPF_CALL_4(bpf_flow_dissector_load_bytes, const struct bpf_flow_dissector *, ctx, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > 0xffff)) goto err_clear; if (unlikely(!ctx->skb)) goto err_clear; ptr = skb_header_pointer(ctx->skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { .func = bpf_flow_dissector_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); u8 *start, *ptr; if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: if (unlikely(!skb_mac_header_was_set(skb))) goto err_clear; start = skb_mac_header(skb); break; case BPF_HDR_START_NET: start = skb_network_header(skb); break; default: goto err_clear; } ptr = start + offset; if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 0; } err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { .func = bpf_skb_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); } static const struct bpf_func_proto bpf_skb_pull_data_proto = { .func = bpf_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) { return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; } static const struct bpf_func_proto bpf_sk_fullsock_proto = { .func = bpf_sk_fullsock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); } static const struct bpf_func_proto sk_skb_pull_data_proto = { .func = sk_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; csum_replace_by_diff(ptr, to); break; case 2: csum_replace2(ptr, from, to); break; case 4: csum_replace4(ptr, from, to); break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; case 4: inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); break; default: return -EINVAL; } if (is_mmzero && !*ptr) *ptr = CSUM_MANGLED_0; return 0; } static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); u32 diff_size = from_size + to_size; int i, j = 0; __wsum ret; /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data * from_size > 0, to_size == 0, seed := csum --> pulling data * from_size > 0, to_size > 0, seed := 0 --> diffing data * * Even for diffing, from_size and to_size don't need to be equal. */ if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || diff_size > sizeof(sp->diff))) return -EINVAL; local_lock_nested_bh(&bpf_sp.bh_lock); for (i = 0; i < from_size / sizeof(__be32); i++, j++) sp->diff[j] = ~from[i]; for (i = 0; i < to_size / sizeof(__be32); i++, j++) sp->diff[j] = to[i]; ret = csum_partial(sp->diff, diff_size, seed); local_unlock_nested_bh(&bpf_sp.bh_lock); return ret; } static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) { /* The interface is to be used in combination with bpf_csum_diff() * for direct packet writes. csum rotation for alignment as well * as emulating csum_sub() can be done from the eBPF program. */ if (skb->ip_summed == CHECKSUM_COMPLETE) return (skb->csum = csum_add(skb->csum, csum)); return -ENOTSUPP; } static const struct bpf_func_proto bpf_csum_update_proto = { .func = bpf_csum_update, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) { /* The interface is to be used in combination with bpf_skb_adjust_room() * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET * is passed as flags, for example. */ switch (level) { case BPF_CSUM_LEVEL_INC: __skb_incr_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_DEC: __skb_decr_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_RESET: __skb_reset_checksum_unnecessary(skb); break; case BPF_CSUM_LEVEL_QUERY: return skb->ip_summed == CHECKSUM_UNNECESSARY ? skb->csum_level : -EACCES; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_csum_level_proto = { .func = bpf_csum_level, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb_nomtu(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { int ret = ____dev_forward_skb(dev, skb, false); if (likely(!ret)) { skb->dev = dev; ret = netif_rx(skb); } return ret; } static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); kfree_skb(skb); return -ENETDOWN; } skb->dev = dev; skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb)); skb_clear_tstamp(skb); dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); dev_xmit_recursion_dec(); return ret; } static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, u32 flags) { unsigned int mlen = skb_network_offset(skb); if (unlikely(skb->len <= mlen)) { kfree_skb(skb); return -ERANGE; } if (mlen) { __skb_pull(skb, mlen); /* At ingress, the mac header has already been pulled once. * At egress, skb_pospull_rcsum has to be done in case that * the skb is originated from ingress (i.e. a forwarded skb) * to ensure that rcsum starts at net header. */ if (!skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); } skb_pop_mac_header(skb); skb_reset_mac_len(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { /* Verify that a link layer header is carried */ if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { kfree_skb(skb); return -ERANGE; } bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags) { if (dev_is_mac_header_xmit(dev)) return __bpf_redirect_common(skb, dev, flags); else return __bpf_redirect_no_mac(skb, dev, flags); } #if IS_ENABLED(CONFIG_IPV6) static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } rcu_read_lock(); if (!nh) { dst = skb_dst(skb); nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, false); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock_bh(); if (dst) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_flags = FLOWI_FLAG_ANYSRC, .flowi6_mark = skb->mark, .flowlabel = ip6_flowinfo(ip6h), .flowi6_oif = dev->ifindex, .flowi6_proto = ip6h->nexthdr, .daddr = ip6h->daddr, .saddr = ip6h->saddr, }; dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst)) goto out_drop; skb_dst_set(skb, dst); } else if (nh->nh_family != AF_INET6) { goto out_drop; } err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } rcu_read_lock(); if (!nh) { struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); is_v6gw = true; } else if (nh->nh_family == AF_INET) { neigh = ip_neigh_gw4(dev, nh->ipv4_nh); } else { rcu_read_unlock(); goto out_drop; } if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, is_v6gw); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock(); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, .flowi4_tos = RT_TOS(ip4h->tos), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, .saddr = ip4h->saddr, }; struct rtable *rt; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto out_drop; if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); goto out_drop; } skb_dst_set(skb, &rt->dst); } err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); if (unlikely(skb->mac_header >= skb->network_header)) goto out; bpf_push_mac_rcsum(skb); if (is_multicast_ether_addr(ethh->h_dest)) goto out; skb_pull(skb, sizeof(*ethh)); skb_unset_mac_header(skb); skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; } /* Internal, non-exposed redirect flags. */ enum { BPF_F_NEIGH = (1ULL << 1), BPF_F_PEER = (1ULL << 2), BPF_F_NEXTHOP = (1ULL << 3), #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; struct sk_buff *clone; int ret; if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; clone = skb_clone(skb, GFP_ATOMIC); if (unlikely(!clone)) return -ENOMEM; /* For direct write, we need to keep the invariant that the skbs * we're dealing with need to be uncloned. Should uncloning fail * here, we need to free the just generated clone to unclone once * again. */ ret = bpf_try_make_head_writable(skb); if (unlikely(ret)) { kfree_skb(clone); return -ENOMEM; } return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static struct net_device *skb_get_peer_dev(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; if (likely(ops->ndo_get_peer_dev)) return INDIRECT_CALL_1(ops->ndo_get_peer_dev, netkit_peer_dev, dev); return NULL; } int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; ri->flags = 0; if (unlikely(!dev)) goto out_drop; if (flags & BPF_F_PEER) { if (unlikely(!skb_at_tc_ingress(skb))) goto out_drop; dev = skb_get_peer_dev(dev); if (unlikely(!dev || !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return -EAGAIN; } return flags & BPF_F_NEIGH ? __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; ri->flags = flags; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; ri->flags = BPF_F_PEER; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_peer_proto = { .func = bpf_redirect_peer, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); if (plen) memcpy(&ri->nh, params, sizeof(ri->nh)); return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_neigh_proto = { .func = bpf_redirect_neigh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; } static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .func = bpf_msg_apply_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; } static void sk_msg_reset_curr(struct sk_msg *msg) { u32 i = msg->sg.start; u32 len = 0; do { len += sk_msg_elem(msg, i)->length; sk_msg_iter_var_next(i); if (len >= msg->sg.size) break; } while (i != msg->sg.end); msg->sg.curr = i; msg->sg.copybreak = 0; } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, u32, end, u64, flags) { u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; u32 first_sge, last_sge, i, shift, bytes_sg_total; struct scatterlist *sge; u8 *raw, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += len; len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); if (unlikely(start >= offset + len)) return -EINVAL; first_sge = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist * elements or a single shared page. Either way we need to * copy into a linear buffer exclusively owned by BPF. Then * place the buffer in the scatterlist and fixup the original * entries by removing the entries now in the linear buffer * and shifting the remaining entries. For now we do not try * to copy partial entries to avoid complexity of running out * of sg_entry slots. The downside is reading a single byte * will copy the entire sg entry. */ do { copy += sk_msg_elem(msg, i)->length; sk_msg_iter_var_next(i); if (bytes_sg_total <= copy) break; } while (i != msg->sg.end); last_sge = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy)); if (unlikely(!page)) return -ENOMEM; raw = page_address(page); i = first_sge; do { sge = sk_msg_elem(msg, i); from = sg_virt(sge); len = sge->length; to = raw + poffset; memcpy(to, from, len); poffset += len; sge->length = 0; put_page(sg_page(sge)); sk_msg_iter_var_next(i); } while (i != last_sge); sg_set_page(&msg->sg.data[first_sge], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ WARN_ON_ONCE(last_sge == first_sge); shift = last_sge > first_sge ? last_sge - first_sge - 1 : NR_MSG_FRAG_IDS - first_sge + last_sge - 1; if (!shift) goto out; i = first_sge; sk_msg_iter_var_next(i); do { u32 move_from; if (i + shift >= NR_MSG_FRAG_IDS) move_from = i + shift - NR_MSG_FRAG_IDS; else move_from = i + shift; if (move_from == msg->sg.end) break; msg->sg.data[i] = msg->sg.data[move_from]; msg->sg.data[move_from].length = 0; msg->sg.data[move_from].page_link = 0; msg->sg.data[move_from].offset = 0; sk_msg_iter_var_next(i); } while (1); msg->sg.end = msg->sg.end - shift > msg->sg.end ? msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: sk_msg_reset_curr(msg); msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; msg->data_end = msg->data + bytes; return 0; } static const struct bpf_func_proto bpf_msg_pull_data_proto = { .func = bpf_msg_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; u32 new, i = 0, l = 0, space, copy = 0, offset = 0; u8 *raw, *to, *from; struct page *page; if (unlikely(flags)) return -EINVAL; if (unlikely(len == 0)) return 0; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); if (start >= offset + l) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); /* If no space available will fallback to copy, we need at * least one scatterlist elem available to push data into * when start aligns to the beginning of an element or two * when it falls inside an element. We handle the start equals * offset case because its the common case for inserting a * header. */ if (!space || (space == 1 && start != offset)) copy = msg->sg.data[i].length; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy + len)); if (unlikely(!page)) return -ENOMEM; if (copy) { int front, back; raw = page_address(page); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; from = sg_virt(psge); if (front) memcpy(raw, from, front); if (back) { from += front; to = raw + front + len; memcpy(to, from, back); } put_page(sg_page(psge)); } else if (start - offset) { psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); psge->length = start - offset; rsge.length -= psge->length; rsge.offset += start; sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); sk_msg_iter_next(msg, end); } /* Slot(s) to place newly allocated data */ new = i; /* Shift one or two slots as needed */ if (!copy) { sge = sk_msg_elem_cpy(msg, i); sk_msg_iter_var_next(i); sg_unmark_end(&sge); sk_msg_iter_next(msg, end); nsge = sk_msg_elem_cpy(msg, i); if (rsge.length) { sk_msg_iter_var_next(i); nnsge = sk_msg_elem_cpy(msg, i); } while (i != msg->sg.end) { msg->sg.data[i] = sge; sge = nsge; sk_msg_iter_var_next(i); if (rsge.length) { nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); } else { nsge = sk_msg_elem_cpy(msg, i); } } } /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); sk_msg_iter_var_next(new); msg->sg.data[new] = rsge; } sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_push_data_proto = { .func = bpf_msg_push_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static void sk_msg_shift_left(struct sk_msg *msg, int i) { int prev; do { prev = i; sk_msg_iter_var_next(i); msg->sg.data[prev] = msg->sg.data[i]; } while (i != msg->sg.end); sk_msg_iter_prev(msg, end); } static void sk_msg_shift_right(struct sk_msg *msg, int i) { struct scatterlist tmp, sge; sk_msg_iter_next(msg, end); sge = sk_msg_elem_cpy(msg, i); sk_msg_iter_var_next(i); tmp = sk_msg_elem_cpy(msg, i); while (i != msg->sg.end) { msg->sg.data[i] = sge; sk_msg_iter_var_next(i); sge = tmp; tmp = sk_msg_elem_cpy(msg, i); } } BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { u32 i = 0, l = 0, space, offset = 0; u64 last = start + len; int pop; if (unlikely(flags)) return -EINVAL; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ if (start >= offset + l || last >= msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); pop = len; /* --------------| offset * -| start |-------- len -------| * * |----- a ----|-------- pop -------|----- b ----| * |______________________________________________| length * * * a: region at front of scatter element to save * b: region at back of scatter element to save when length > A + pop * pop: region to pop from element, same as input 'pop' here will be * decremented below per iteration. * * Two top-level cases to handle when start != offset, first B is non * zero and second B is zero corresponding to when a pop includes more * than one element. * * Then if B is non-zero AND there is no space allocate space and * compact A, B regions into page. If there is space shift ring to * the right free'ing the next element in ring to place B, leaving * A untouched except to reduce length. */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); int a = start; int b = sge->length - pop - a; sk_msg_iter_var_next(i); if (pop < sge->length - a) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); nsge = sk_msg_elem(msg, i); get_page(sg_page(sge)); sg_set_page(nsge, sg_page(sge), b, sge->offset + pop + a); } else { struct page *page, *orig; u8 *to, *from; page = alloc_pages(__GFP_NOWARN | __GFP_COMP | GFP_ATOMIC, get_order(a + b)); if (unlikely(!page)) return -ENOMEM; sge->length = a; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); memcpy(to, from, a); memcpy(to + a, from + a + pop, b); sg_set_page(sge, page, a + b, 0); put_page(orig); } pop = 0; } else if (pop >= sge->length - a) { pop -= (sge->length - a); sge->length = a; } } /* From above the current layout _must_ be as follows, * * -| offset * -| start * * |---- pop ---|---------------- b ------------| * |____________________________________________| length * * Offset and start of the current msg elem are equal because in the * previous case we handled offset != start and either consumed the * entire element and advanced to the next element OR pop == 0. * * Two cases to handle here are first pop is less than the length * leaving some remainder b above. Simply adjust the element's layout * in this case. Or pop >= length of the element so that b = 0. In this * case advance to next element decrementing pop. */ while (pop) { struct scatterlist *sge = sk_msg_elem(msg, i); if (pop < sge->length) { sge->length -= pop; sge->offset += pop; pop = 0; } else { pop -= sge->length; sk_msg_shift_left(msg, i); } sk_msg_iter_var_next(i); } sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_pop_data_proto = { .func = bpf_msg_pop_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; #ifdef CONFIG_CGROUP_NET_CLASSID BPF_CALL_0(bpf_get_cgroup_classid_curr) { return __task_get_classid(current); } const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return 0; return sock_cgroup_classid(&sk->sk_cgrp_data); } static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { .func = bpf_skb_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; #endif BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .func = bpf_get_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { .func = bpf_get_route_realm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. */ return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .func = bpf_get_hash_recalc, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) { /* After all direct packet write, this can be used once for * triggering a lazy recalc on next skb_get_hash() invocation. */ skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_set_hash_invalid_proto = { .func = bpf_set_hash_invalid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) { /* Set user specified hash as L4(+), so that it gets returned * on skb_get_hash() call unless BPF prog later on triggers a * skb_clear_hash(). */ __skb_set_sw_hash(skb, hash, true); return 0; } static const struct bpf_func_proto bpf_set_hash_proto = { .func = bpf_set_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { int ret; bpf_push_mac_rcsum(skb); ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { /* Caller already did skb_cow() with len as headroom, * so no need to do it here. */ skb_push(skb, len); memmove(skb->data, skb->data + len, off); memset(skb->data + off, 0, len); /* No skb_postpush_rcsum(skb, skb->data + off, len) * needed here as it does not change the skb->csum * result for checksum complete when summing over * zeroed blocks. */ return 0; } static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { void *old_data; /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; old_data = skb->data; __skb_pull(skb, len); skb_postpull_rcsum(skb, old_data + off, len); memmove(skb->data, old_data, off); return 0; } static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* There's no need for __skb_push()/__skb_pull() pair to * get to the start of the mac header as we're guaranteed * to always start from here under eBPF. */ ret = bpf_skb_generic_push(skb, off, len); if (likely(!ret)) { skb->mac_header -= len; skb->network_header -= len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* Same here, __skb_push()/__skb_pull() pair not needed. */ ret = bpf_skb_generic_pop(skb, off, len); if (likely(!ret)) { skb->mac_header += len; skb->network_header += len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } } skb->protocol = htons(ETH_P_IPV6); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } } skb->protocol = htons(ETH_P_IP); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) return bpf_skb_proto_4_to_6(skb); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) return bpf_skb_proto_6_to_4(skb); return -ENOTSUPP; } BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, u64, flags) { int ret; if (unlikely(flags)) return -EINVAL; /* General idea is that this helper does the basic groundwork * needed for changing the protocol, and eBPF program fills the * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() * and other helpers, rather than passing a raw buffer here. * * The rationale is to keep this minimal and without a need to * deal with raw packet data. F.e. even if we would pass buffers * here, the program still needs to call the bpf_lX_csum_replace() * helpers anyway. Plus, this way we keep also separation of * concerns, since f.e. bpf_skb_store_bytes() should only take * care of stores. * * Currently, additional options and extension header space are * not supported, but flags register is reserved so we can adapt * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_proto_proto = { .func = bpf_skb_change_proto, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { /* We only allow a restricted subset to be changed for now. */ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; return 0; } static const struct bpf_func_proto bpf_skb_change_type_proto = { .func = bpf_skb_change_type, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static u32 bpf_skb_net_base_len(const struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): return sizeof(struct iphdr); case htons(ETH_P_IPV6): return sizeof(struct ipv6hdr); default: return ~0U; } } #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) #define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_DECAP_L3_IPV6) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int gso_type = SKB_GSO_DODGY; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_cow_head(skb, len_diff); if (unlikely(ret < 0)) return ret; if (encap) { if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -ENOTSUPP; if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && inner_mac_len < ETH_HLEN) return -EINVAL; if (skb->encapsulation) return -EALREADY; mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; if (inner_mac_len > len_diff) return -EINVAL; inner_trans = skb->transport_header; } ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (encap) { skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) skb_set_inner_protocol(skb, htons(ETH_P_TEB)); else skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) gso_type |= SKB_GSO_UDP_TUNNEL; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr); skb_set_transport_header(skb, mac_len + nh_len); } /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) skb->protocol = htons(ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) skb->protocol = htons(ETH_P_IP); } if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; /* Due to header growth, MSS needs to be downgraded. * There is a BUG_ON() when segmenting the frag_list with * head_frag true, so linearize the skb after downgrading * the MSS. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { skb_decrease_gso_size(shinfo, len_diff); if (shinfo->frag_list) return skb_linearize(skb); } } return 0; } static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) skb->protocol = htons(ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) skb->protocol = htons(ETH_P_IP); if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } return 0; } #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_diff_abs = abs(len_diff); bool shrink = len_diff < 0; int ret = 0; if (unlikely(flags || mode)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (!shrink) { ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; __skb_push(skb, len_diff_abs); memset(skb->data, 0, len_diff_abs); } else { if (unlikely(!pskb_may_pull(skb, len_diff_abs))) return -ENOMEM; __skb_pull(skb, len_diff_abs); } if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); rxm->full_len += len_diff; } return ret; } static const struct bpf_func_proto sk_skb_adjust_room_proto = { .func = sk_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb->protocol; bool shrink = len_diff < 0; u32 off; int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; off = skb_mac_header_len(skb); switch (mode) { case BPF_ADJ_ROOM_NET: off += bpf_skb_net_base_len(skb); break; case BPF_ADJ_ROOM_MAC: break; default: return -ENOTSUPP; } if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { if (!shrink) return -EINVAL; switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: len_min = sizeof(struct iphdr); break; case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: len_min = sizeof(struct ipv6hdr); break; default: return -EINVAL; } } len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && !skb_is_gso(skb)))) return -ENOTSUPP; ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static u32 __bpf_skb_min_len(const struct sk_buff *skb) { u32 min_len = skb_network_offset(skb); if (skb_transport_header_was_set(skb)) min_len = skb_transport_offset(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) min_len = skb_checksum_start_offset(skb) + skb->csum_offset + sizeof(__sum16); return min_len; } static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) { unsigned int old_len = skb->len; int ret; ret = __skb_grow_rcsum(skb, new_len); if (!ret) memset(skb->data + old_len, 0, new_len - old_len); return ret; } static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) { return __skb_trim_rcsum(skb, new_len); } static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) return -EINVAL; if (skb->encapsulation) return -ENOTSUPP; /* The basic idea of this helper is that it's performing the * needed work to either grow or trim an skb, and eBPF program * rewrites the rest via helpers like bpf_skb_store_bytes(), * bpf_lX_csum_replace() and others rather than passing a raw * buffer here. This one is a slow path helper and intended * for replies with control messages. * * Like in bpf_skb_change_proto(), we want to keep this rather * minimal and without protocol specifics so that we are able * to separate concerns as in bpf_skb_store_bytes() should only * be the one responsible for writing buffers. * * It's really expected to be a slow path operation here for * control message replies, so we're implicitly linearizing, * uncloning and drop offloads from the skb by this. */ ret = __bpf_try_make_writable(skb, skb->len); if (!ret) { if (new_len > skb->len) ret = bpf_skb_grow_rcsum(skb, new_len); else if (new_len < skb->len) ret = bpf_skb_trim_rcsum(skb, new_len); if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } return ret; } BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_tail_proto = { .func = bpf_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { .func = sk_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || new_len < skb->len)) return -EINVAL; ret = skb_cow(skb, head_room); if (likely(!ret)) { /* Idea for this helper is that we currently only * allow to expand on mac header. This means that * skb->protocol network header, etc, stay as is. * Compared to bpf_skb_change_tail(), we're more * flexible due to not needing to linearize or * reset GSO. Intention for this helper is to be * used by an L3 skb that needs to push mac header * for redirection into L2 device. */ __skb_push(skb, head_room); memset(skb->data, 0, head_room); skb_reset_mac_header(skb); skb_reset_mac_len(skb); } return ret; } BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { int ret = __bpf_skb_change_head(skb, head_room, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { .func = bpf_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { .func = sk_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) { return xdp_get_buff_len(xdp); } static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], }; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : xdp->data - xdp->data_meta; } BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); void *data_start = xdp_frame_end + metalen; void *data = xdp->data + offset; if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; if (metalen) memmove(xdp->data_meta + offset, xdp->data_meta, metalen); xdp->data_meta += offset; xdp->data = data; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .func = bpf_xdp_adjust_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { unsigned long ptr_len, ptr_off = 0; skb_frag_t *next_frag, *end_frag; struct skb_shared_info *sinfo; void *src, *dst; u8 *ptr_buf; if (likely(xdp->data_end - xdp->data >= off + len)) { src = flush ? buf : xdp->data + off; dst = flush ? xdp->data + off : buf; memcpy(dst, src, len); return; } sinfo = xdp_get_shared_info_from_buff(xdp); end_frag = &sinfo->frags[sinfo->nr_frags]; next_frag = &sinfo->frags[0]; ptr_len = xdp->data_end - xdp->data; ptr_buf = xdp->data; while (true) { if (off < ptr_off + ptr_len) { unsigned long copy_off = off - ptr_off; unsigned long copy_len = min(len, ptr_len - copy_off); src = flush ? buf : ptr_buf + copy_off; dst = flush ? ptr_buf + copy_off : buf; memcpy(dst, src, copy_len); off += copy_len; len -= copy_len; buf += copy_len; } if (!len || next_frag == end_frag) break; ptr_off += ptr_len; ptr_buf = skb_frag_address(next_frag); ptr_len = skb_frag_size(next_frag); next_frag++; } } void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { u32 size = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; void *addr = xdp->data; int i; if (unlikely(offset > 0xffff || len > 0xffff)) return ERR_PTR(-EFAULT); if (unlikely(offset + len > xdp_get_buff_len(xdp))) return ERR_PTR(-EINVAL); if (likely(offset < size)) /* linear area */ goto out; sinfo = xdp_get_shared_info_from_buff(xdp); offset -= size; for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ u32 frag_size = skb_frag_size(&sinfo->frags[i]); if (offset < frag_size) { addr = skb_frag_address(&sinfo->frags[i]); size = frag_size; break; } offset -= frag_size; } out: return offset + len <= size ? addr + offset : NULL; } BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, false); else memcpy(buf, ptr, len); return 0; } static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { .func = bpf_xdp_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_load_bytes(xdp, offset, buf, len); } BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, true); else memcpy(ptr, buf, len); return 0; } static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { .func = bpf_xdp_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_store_bytes(xdp, offset, buf, len); } static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; unsigned int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); if (unlikely(offset > tailroom)) return -EINVAL; memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); skb_frag_size_add(frag, offset); sinfo->xdp_frags_size += offset; if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) xsk_buff_get_tail(xdp)->data_end += offset; return 0; } static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, struct xdp_mem_info *mem_info, bool release) { struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); if (release) { xsk_buff_del_tail(zc_frag); __xdp_return(NULL, mem_info, false, zc_frag); } else { zc_frag->data_end -= shrink; } } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) { struct xdp_mem_info *mem_info = &xdp->rxq->mem; bool release = skb_frag_size(frag) == shrink; if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); goto out; } if (release) { struct page *page = skb_frag_page(frag); __xdp_return(page_address(page), mem_info, false, NULL); } out: return release; } static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i, n_frags_free = 0, len_free = 0; if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) return -EINVAL; for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { skb_frag_t *frag = &sinfo->frags[i]; int shrink = min_t(int, offset, skb_frag_size(frag)); len_free += shrink; offset -= shrink; if (bpf_xdp_shrink_data(xdp, frag, shrink)) { n_frags_free++; } else { skb_frag_size_sub(frag, shrink); break; } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); xdp->data_end -= offset; } return 0; } BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ if (offset < 0) return bpf_xdp_frags_shrink_tail(xdp, -offset); return bpf_xdp_frags_increase_tail(xdp, offset); } /* Notice that xdp_data_hard_end have reserved some tailroom */ if (unlikely(data_end > data_hard_end)) return -EINVAL; if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; /* Clear memory area on grow, can contain uninit kernel memory */ if (offset > 0) memset(xdp->data_end, 0, offset); xdp->data_end = data_end; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { .func = bpf_xdp_adjust_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); void *meta = xdp->data_meta + offset; unsigned long metalen = xdp->data - meta; if (xdp_data_meta_unsupported(xdp)) return -ENOTSUPP; if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; if (unlikely(xdp_metalen_invalid(metalen))) return -EACCES; xdp->data_meta = meta; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .func = bpf_xdp_adjust_meta, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; /** * DOC: xdp redirect * * XDP_REDIRECT works by a three-step process, implemented in the functions * below: * * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target * of the redirect and store it (along with some other metadata) in a per-CPU * struct bpf_redirect_info. * * 2. When the program returns the XDP_REDIRECT return code, the driver will * call xdp_do_redirect() which will use the information in struct * bpf_redirect_info to actually enqueue the frame into a map type-specific * bulk queue structure. * * 3. Before exiting its NAPI poll loop, the driver will call * xdp_do_flush(), which will flush all the different bulk queues, * thus completing the redirect. Note that xdp_do_flush() must be * called before napi_complete_done() in the driver, as the * XDP_REDIRECT logic relies on being inside a single NAPI instance * through to the xdp_do_flush() call for RCU protection of all * in-kernel data structures. */ /* * Pointers to the map entries will be kept around for this whole sequence of * steps, protected by RCU. However, there is no top-level rcu_read_lock() in * the core code; instead, the RCU protection relies on everything happening * inside a single NAPI poll sequence, which means it's between a pair of calls * to local_bh_disable()/local_bh_enable(). * * The map entries are marked as __rcu and the map code makes sure to * dereference those pointers with rcu_dereference_check() in a way that works * for both sections that to hold an rcu_read_lock() and sections that are * called from NAPI without a separate rcu_read_lock(). The code below does not * use RCU annotations, but relies on those in the map code. */ void xdp_do_flush(void) { struct list_head *lh_map, *lh_dev, *lh_xsk; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) __dev_flush(lh_dev); if (lh_map) __cpu_map_flush(lh_map); if (lh_xsk) __xsk_map_flush(lh_xsk); } EXPORT_SYMBOL_GPL(xdp_do_flush); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) void xdp_do_check_flushed(struct napi_struct *napi) { struct list_head *lh_map, *lh_dev, *lh_xsk; bool missed = false; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) { __dev_flush(lh_dev); missed = true; } if (lh_map) { __cpu_map_flush(lh_map); missed = true; } if (lh_xsk) { __xsk_map_flush(lh_xsk); missed = true; } WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", napi->poll); } #endif DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net_device *master, *slave; master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); if (slave && slave != xdp->rxq->dev) { /* The target device is different from the receiving device, so * redirect it to the new device. * Using XDP_REDIRECT gets the correct behaviour from XDP enabled * drivers to unmap the packet from their rx ring. */ ri->tgt_index = slave->ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } return XDP_TX; } EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; err = __xsk_map_redirect(fwd, xdp); if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { err = -EOVERFLOW; goto err; } switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; break; } err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; default: err = -EBADRQC; } if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, void *fwd, enum bpf_map_type map_type, u32 map_id, u32 flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; int err; switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } if (unlikely(err)) goto err; break; case BPF_MAP_TYPE_XSKMAP: err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_generic_redirect(fwd, skb); if (unlikely(err)) goto err; break; default: err = -EBADRQC; goto err; } _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } err = xdp_ok_fwd_dev(fwd, skb->len); if (unlikely(err)) goto err; skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); generic_xdp_tx(skb, xdp_prog); return 0; } return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return XDP_ABORTED; /* NB! Map type UNSPEC and map_id == INT_MAX (never generated * by map_idr) is used for ifindex based XDP redirect. */ ri->tgt_index = ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } static const struct bpf_func_proto bpf_xdp_redirect_proto = { .func = bpf_xdp_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, u64, flags) { return map->ops->map_redirect(map, key, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { void *ptr = skb_header_pointer(skb, off, len, dst_buff); if (unlikely(!ptr)) return len; if (ptr != dst_buff) memcpy(dst_buff, ptr, len); return 0; } BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, bpf_skb_copy); } static const struct bpf_func_proto bpf_skb_event_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; } BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, u32, size, u64, flags) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; int err; if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { err = -EPROTO; goto err_clear; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) { err = -EINVAL; switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. */ if (ip_tunnel_info_af(info) != AF_INET) goto err_clear; set_compat: to = (struct bpf_tunnel_key *)compat; break; default: goto err_clear; } } to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); memcpy(to->local_ipv6, &info->key.u.ipv6.dst, sizeof(to->local_ipv6)); to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy(to_orig, to, size); return 0; err_clear: memset(to_orig, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; if (unlikely(!info || !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } if (unlikely(size < info->options_len)) { err = -ENOMEM; goto err_clear; } ip_tunnel_info_opts_get(to, info); if (size > info->options_len) memset(to + info->options_len, 0, size - info->options_len); return info->options_len; err_clear: memset(to, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .func = bpf_skb_get_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, const struct bpf_tunnel_key *, from, u32, size, u64, flags) { struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. */ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; } } if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || from->tunnel_ext)) return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); skb_dst_set(skb, (struct dst_entry *) md); info = &md->u.tun_info; memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, flags & BPF_F_DONT_FRAGMENT); __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, !(flags & BPF_F_ZERO_CSUM_TX)); __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, flags & BPF_F_SEQ_NUMBER); __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; if (flags & BPF_F_TUNINFO_IPV6) { info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); memcpy(&info->key.u.ipv6.src, from->local_ipv6, sizeof(from->local_ipv6)); info->key.label = cpu_to_be32(from->tunnel_label) & IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); info->key.flow_flags = FLOWI_FLAG_ANYSRC; } return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, const u8 *, from, u32, size) { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; ip_tunnel_set_options_present(present); ip_tunnel_info_opts_set(info, from, size, present); return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .func = bpf_skb_set_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { struct metadata_dst __percpu *tmp; tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, METADATA_IP_TUNNEL, GFP_KERNEL); if (!tmp) return NULL; if (cmpxchg(&md_dst, NULL, tmp)) metadata_dst_free_percpu(tmp); } switch (which) { case BPF_FUNC_skb_set_tunnel_key: return &bpf_skb_set_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_opt: return &bpf_skb_set_tunnel_opt_proto; default: return NULL; } } BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .func = bpf_skb_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SOCK_CGROUP_DATA static inline u64 __bpf_sk_cgroup_id(struct sock *sk) { struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return cgroup_id(cgrp); } BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { return __bpf_sk_cgroup_id(skb->sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .func = bpf_skb_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, int ancestor_level) { struct cgroup *ancestor; struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) return 0; return cgroup_id(ancestor); } BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) { return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { .func = bpf_sk_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); } static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { .func = bpf_sk_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, }; #endif static unsigned long bpf_xdp_copy(void *dst, const void *ctx, unsigned long off, unsigned long len) { struct xdp_buff *xdp = (struct xdp_buff *)ctx; bpf_xdp_copy_buf(xdp, off, dst, len, false); return 0; } BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, bpf_xdp_copy); } static const struct bpf_func_proto bpf_xdp_event_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? __sock_gen_cookie(skb->sk) : 0; } static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .func = bpf_get_socket_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { .func = bpf_get_socket_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) { return __sock_gen_cookie(ctx); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .func = bpf_get_socket_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) { return sk ? sock_gen_cookie(sk) : 0; } const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { .func = bpf_get_socket_ptr_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, }; BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .func = bpf_get_socket_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static u64 __bpf_get_netns_cookie(struct sock *sk) { const struct net *net = sk ? sock_net(sk) : &init_net; return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { .func = bpf_get_netns_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .func = bpf_get_netns_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { .func = bpf_get_netns_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { .func = bpf_get_netns_cookie_sk_msg, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); kuid_t kuid; if (!sk || !sk_fullsock(sk)) return overflowuid; kuid = sock_net_uid(sock_net(sk), sk); return from_kuid_munged(sock_net(sk)->user_ns, kuid); } static const struct bpf_func_proto bpf_get_socket_uid_proto = { .func = bpf_get_socket_uid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { switch (optname) { case SO_REUSEADDR: case SO_SNDBUF: case SO_RCVBUF: case SO_KEEPALIVE: case SO_PRIORITY: case SO_REUSEPORT: case SO_RCVLOWAT: case SO_MARK: case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: if (*optlen != sizeof(int)) return -EINVAL; break; case SO_BINDTODEVICE: break; default: return -EINVAL; } if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; return sk_getsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return sk_setsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), *optlen); } static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); unsigned long timeout; int val; if (optlen != sizeof(int)) return -EINVAL; val = *(int *)optval; /* Only some options are supported */ switch (optname) { case TCP_BPF_IW: if (val <= 0 || tp->data_segs_out > tp->syn_data) return -EINVAL; tcp_snd_cwnd_set(tp, val); break; case TCP_BPF_SNDCWND_CLAMP: if (val <= 0) return -EINVAL; tp->snd_cwnd_clamp = val; tp->snd_ssthresh = val; break; case TCP_BPF_DELACK_MAX: timeout = usecs_to_jiffies(val); if (timeout > TCP_DELACK_MAX || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_delack_max = timeout; break; case TCP_BPF_RTO_MIN: timeout = usecs_to_jiffies(val); if (timeout > TCP_RTO_MIN || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; default: return -EINVAL; } return 0; } static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, int *optlen, bool getopt) { struct tcp_sock *tp; int ret; if (*optlen < 2) return -EINVAL; if (getopt) { if (!inet_csk(sk)->icsk_ca_ops) return -EINVAL; /* BPF expects NULL-terminated tcp-cc string */ optval[--(*optlen)] = '\0'; return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } /* "cdg" is the only cc that alloc a ptr * in inet_csk_ca area. The bpf-tcp-cc may * overwrite this ptr after switching to cdg. */ if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) return -ENOTSUPP; /* It stops this looping * * .init => bpf_setsockopt(tcp_cc) => .init => * bpf_setsockopt(tcp_cc)" => .init => .... * * The second bpf_setsockopt(tcp_cc) is not allowed * in order to break the loop when both .init * are the same bpf prog. * * This applies even the second bpf_setsockopt(tcp_cc) * does not cause a loop. This limits only the first * '.init' can call bpf_setsockopt(TCP_CONGESTION) to * pick a fallback cc (eg. peer does not support ECN) * and the second '.init' cannot fallback to * another. */ tp = tcp_sk(sk); if (tp->bpf_chg_cc_inprogress) return -EBUSY; tp->bpf_chg_cc_inprogress = 1; ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), *optlen); tp->bpf_chg_cc_inprogress = 0; return ret; } static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_protocol != IPPROTO_TCP) return -EINVAL; switch (optname) { case TCP_NODELAY: case TCP_MAXSEG: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPCNT: case TCP_SYNCNT: case TCP_WINDOW_CLAMP: case TCP_THIN_LINEAR_TIMEOUTS: case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: if (*optlen != sizeof(int)) return -EINVAL; break; case TCP_CONGESTION: return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); case TCP_SAVED_SYN: if (*optlen < 1) return -EINVAL; break; default: if (getopt) return -EINVAL; return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); } if (getopt) { if (optname == TCP_SAVED_SYN) { struct tcp_sock *tp = tcp_sk(sk); if (!tp->saved_syn || *optlen > tcp_saved_syn_len(tp->saved_syn)) return -EINVAL; memcpy(optval, tp->saved_syn->data, *optlen); /* It cannot free tp->saved_syn here because it * does not know if the user space still needs it. */ return 0; } return do_tcp_getsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return do_tcp_setsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ip_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET) return -EINVAL; switch (optname) { case IP_TOS: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return do_ip_getsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return do_ip_setsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ipv6_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET6) return -EINVAL; switch (optname) { case IPV6_TCLASS: case IPV6_AUTOFLOWLABEL: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), *optlen); } static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (!sk_fullsock(sk)) return -EINVAL; if (level == SOL_SOCKET) return sol_socket_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) return sol_ip_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) return sol_tcp_sockopt(sk, optname, optval, &optlen, false); return -EINVAL; } static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_setsockopt(sk, level, optname, optval, optlen); } static int __bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { int err, saved_optlen = optlen; if (!sk_fullsock(sk)) { err = -EINVAL; goto done; } if (level == SOL_SOCKET) err = sol_socket_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) err = sol_ip_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); else err = -EINVAL; done: if (err) optlen = 0; if (optlen < saved_optlen) memset(optval + optlen, 0, saved_optlen - optlen); return err; } static int _bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_getsockopt(sk, level, optname, optval, optlen); } BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_setsockopt_proto = { .func = bpf_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return _bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_getsockopt_proto = { .func = bpf_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { .func = bpf_unlocked_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { .func = bpf_unlocked_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .func = bpf_sock_addr_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .func = bpf_sock_addr_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .func = bpf_sock_ops_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, int optname, const u8 **start) { struct sk_buff *syn_skb = bpf_sock->syn_skb; const u8 *hdr_start; int ret; if (syn_skb) { /* sk is a request_sock here */ if (optname == TCP_BPF_SYN) { hdr_start = syn_skb->data; ret = tcp_hdrlen(syn_skb); } else if (optname == TCP_BPF_SYN_IP) { hdr_start = skb_network_header(syn_skb); ret = skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } else { /* optname == TCP_BPF_SYN_MAC */ hdr_start = skb_mac_header(syn_skb); ret = skb_mac_header_len(syn_skb) + skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } } else { struct sock *sk = bpf_sock->sk; struct saved_syn *saved_syn; if (sk->sk_state == TCP_NEW_SYN_RECV) /* synack retransmit. bpf_sock->syn_skb will * not be available. It has to resort to * saved_syn (if it is saved). */ saved_syn = inet_reqsk(sk)->saved_syn; else saved_syn = tcp_sk(sk)->saved_syn; if (!saved_syn) return -ENOENT; if (optname == TCP_BPF_SYN) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen + saved_syn->network_hdrlen; ret = saved_syn->tcp_hdrlen; } else if (optname == TCP_BPF_SYN_IP) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen; ret = saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } else { /* optname == TCP_BPF_SYN_MAC */ /* TCP_SAVE_SYN may not have saved the mac hdr */ if (!saved_syn->mac_hdrlen) return -ENOENT; hdr_start = saved_syn->data; ret = saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } } *start = hdr_start; return ret; } BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; const u8 *start; ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); if (ret > 0) { copy_len = ret; if (optlen < copy_len) { copy_len = optlen; ret = -ENOSPC; } memcpy(optval, start, copy_len); } /* Zero out unused buffer at the end */ memset(optval + copy_len, 0, optlen - copy_len); return ret; } return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, int, argval) { struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); } static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .func = bpf_sock_ops_cb_flags_set, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; EXPORT_SYMBOL_GPL(ipv6_bpf_stub); BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, int, addr_len) { #ifdef CONFIG_INET struct sock *sk = ctx->sk; u32 flags = BIND_FROM_BPF; int err; err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; if (((struct sockaddr_in *)addr)->sin_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_bind_proto = { .func = bpf_bind, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; #ifdef CONFIG_XFRM #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) struct metadata_dst __percpu *xfrm_bpf_md_dst; EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); #endif BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, struct bpf_xfrm_state *, to, u32, size, u64, flags) { const struct sec_path *sp = skb_sec_path(skb); const struct xfrm_state *x; if (!sp || unlikely(index >= sp->len || flags)) goto err_clear; x = sp->xvec[index]; if (unlikely(size != sizeof(struct bpf_xfrm_state))) goto err_clear; to->reqid = x->props.reqid; to->spi = x->id.spi; to->family = x->props.family; to->ext = 0; if (to->family == AF_INET6) { memcpy(to->remote_ipv6, x->props.saddr.a6, sizeof(to->remote_ipv6)); } else { to->remote_ipv4 = x->props.saddr.a4; memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); } return 0; err_clear: memset(to, 0, size); return -EINVAL; } static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { .func = bpf_skb_get_xfrm_state, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; #endif #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) { params->h_vlan_TCI = 0; params->h_vlan_proto = 0; if (mtu) params->mtu_result = mtu; /* union with tot_len */ return 0; } #endif #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct fib_nh_common *nhc; struct in_device *in_dev; struct neighbour *neigh; struct net_device *dev; struct fib_result res; struct flowi4 fl4; u32 mtu = 0; int err; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; fl4.flowi4_oif = params->ifindex; } else { fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.flowi4_proto = params->l4_protocol; fl4.daddr = params->ipv4_dst; fl4.saddr = params->ipv4_src; fl4.fl4_sport = params->sport; fl4.fl4_dport = params->dport; fl4.flowi4_multipath_hash = 0; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl4.flowi4_mark = params->mark; else fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } if (err) { /* map fib lookup errors to RTN_ type */ if (err == -EINVAL) return BPF_FIB_LKUP_RET_BLACKHOLE; if (err == -EHOSTUNREACH) return BPF_FIB_LKUP_RET_UNREACHABLE; if (err == -EACCES) return BPF_FIB_LKUP_RET_PROHIBIT; return BPF_FIB_LKUP_RET_NOT_FWDED; } if (res.type != RTN_UNICAST) return BPF_FIB_LKUP_RET_NOT_FWDED; if (fib_info_num_path(res.fi) > 1) fib_select_path(net, &res, &fl4, NULL); if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } nhc = res.nhc; /* do not handle lwt encaps right now */ if (nhc->nhc_lwtstate) return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) params->ipv4_src = fib_result_prefsrc(net, &res); /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ if (likely(nhc->nhc_gw_family != AF_INET6)) { if (nhc->nhc_gw_family) params->ipv4_dst = nhc->nhc_gw.ipv4; } else { struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; params->family = AF_INET6; *dst = nhc->nhc_gw.ipv6; } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; if (likely(nhc->nhc_gw_family != AF_INET6)) neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); else neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; struct fib6_result res = {}; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; struct flowi6 fl6; int strict = 0; int oif, err; u32 mtu = 0; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; oif = fl6.flowi6_oif = params->ifindex; } else { oif = fl6.flowi6_iif = params->ifindex; fl6.flowi6_oif = 0; strict = RT6_LOOKUP_F_HAS_SADDR; } fl6.flowlabel = params->flowinfo; fl6.flowi6_scope = 0; fl6.flowi6_flags = 0; fl6.mp_hash = 0; fl6.flowi6_proto = params->l4_protocol; fl6.daddr = *dst; fl6.saddr = *src; fl6.fl6_sport = params->sport; fl6.fl6_dport = params->dport; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib6_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl6.flowi6_mark = params->mark; else fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict); } if (unlikely(err || IS_ERR_OR_NULL(res.f6i) || res.f6i == net->ipv6.fib6_null_entry)) return BPF_FIB_LKUP_RET_NOT_FWDED; switch (res.fib6_type) { /* only unicast is forwarded */ case RTN_UNICAST: break; case RTN_BLACKHOLE: return BPF_FIB_LKUP_RET_BLACKHOLE; case RTN_UNREACHABLE: return BPF_FIB_LKUP_RET_UNREACHABLE; case RTN_PROHIBIT: return BPF_FIB_LKUP_RET_PROHIBIT; default: return BPF_FIB_LKUP_RET_NOT_FWDED; } ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif, fl6.flowi6_oif != 0, NULL, strict); if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } if (res.nh->fib_nh_lws) return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (res.nh->fib_nh_gw_family) *dst = res.nh->fib_nh_gw6; dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) { if (res.f6i->fib6_prefsrc.plen) { *src = res.f6i->fib6_prefsrc.addr; } else { err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev, &fl6.daddr, 0, src); if (err) return BPF_FIB_LKUP_RET_NO_SRC_ADDR; } } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. */ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) { if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif } return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { .func = bpf_xdp_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; if (params->tot_len) check_mtu = true; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; /* When tot_len isn't provided by user, check skb * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; params->mtu_result = dev->mtu; /* union with tot_len */ } return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .func = bpf_skb_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, u32 ifindex) { struct net *netns = dev_net(dev_curr); /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ if (ifindex == 0) return dev_curr; return dev_get_by_index_rcu(netns, ifindex); } BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; struct net_device *dev = skb->dev; int skb_len, dev_len; int mtu; if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) return -EINVAL; if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len; skb_len += len_diff; /* minus result pass check */ if (skb_len <= dev_len) { ret = BPF_MTU_CHK_RET_SUCCESS; goto out; } /* At this point, skb->len exceed MTU, but as it include length of all * segments, it can still be below MTU. The SKB can possibly get * re-segmented in transmit path (see validate_xmit_skb). Thus, user * must choose if segs are to be MTU checked. */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; if (flags & BPF_MTU_CHK_SEGS && !skb_gso_validate_network_len(skb, mtu)) ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; } out: /* BPF verifier guarantees valid pointer */ *mtu_len = mtu; return ret; } BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { struct net_device *dev = xdp->rxq->dev; int xdp_len = xdp->data_end - xdp->data; int ret = BPF_MTU_CHK_RET_SUCCESS; int mtu, dev_len; /* XDP variant doesn't support multi-buffer segment check (yet) */ if (unlikely(flags)) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); /* Add L2-header as dev MTU is L3 size */ dev_len = mtu + dev->hard_header_len; /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ if (*mtu_len) xdp_len = *mtu_len + dev->hard_header_len; xdp_len += len_diff; /* minus result pass check */ if (xdp_len > dev_len) ret = BPF_MTU_CHK_RET_FRAG_NEEDED; /* BPF verifier guarantees valid pointer */ *mtu_len = mtu; return ret; } static const struct bpf_func_proto bpf_skb_check_mtu_proto = { .func = bpf_skb_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_INT, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { .func = bpf_xdp_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_INT, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { int err; struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; if (!seg6_validate_srh(srh, len, false)) return -EINVAL; switch (type) { case BPF_LWT_ENCAP_SEG6_INLINE: if (skb->protocol != htons(ETH_P_IPV6)) return -EBADMSG; err = seg6_do_srh_inline(skb, srh); break; case BPF_LWT_ENCAP_SEG6: skb_reset_inner_headers(skb); skb->encapsulation = 1; err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); break; default: return -EINVAL; } bpf_compute_data_pointers(skb); if (err) return err; skb_set_transport_header(skb, sizeof(struct ipv6hdr)); return seg6_lookup_nexthop(skb, NULL, 0); } #endif /* CONFIG_IPV6_SEG6_BPF */ #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); } #endif BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_LWT_ENCAP_SEG6: case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); #endif #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP: return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); #endif default: return -EINVAL; } } BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP: return bpf_push_ip_encap(skb, hdr, len, false /* egress */); #endif default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .func = bpf_lwt_in_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .func = bpf_lwt_xmit_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; lockdep_assert_held(&srh_state->bh_lock); if (srh == NULL) return -EINVAL; srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= srh_end) srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; } static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .func = bpf_lwt_seg6_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static void bpf_update_srh_state(struct sk_buff *skb) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { srh_state->srh = NULL; } else { srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen = srh_state->srh->hdrlen << 3; srh_state->valid = true; } } BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int hdroff = 0; int err; lockdep_assert_held(&srh_state->bh_lock); switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_DT6: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) return -EBADMSG; if (!pskb_pull(skb, hdroff)) return -EBADMSG; skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; bpf_compute_data_pointers(skb); bpf_update_srh_state(skb); return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) bpf_update_srh_state(skb); return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) bpf_update_srh_state(skb); return err; default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .func = bpf_lwt_seg6_action, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, s32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; struct ipv6hdr *hdr; int srhoff = 0; int ret; lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return -EINVAL; srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (unlikely(ptr < srh_tlvs || ptr > srh_end)) return -EFAULT; if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) return -EFAULT; if (len > 0) { ret = skb_cow_head(skb, len); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, offset, len); } else { ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); } bpf_compute_data_pointers(skb); if (unlikely(ret < 0)) return ret; hdr = (struct ipv6hdr *)skb->data; hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; srh_state->valid = false; return 0; } static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { .func = bpf_lwt_seg6_adjust_srh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #endif /* CONFIG_IPV6_SEG6_BPF */ #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) sk = __inet_lookup(net, hinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, net->ipv4.udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, hinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, dif, sdif, net->ipv4.udp_table, NULL); #endif } if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); sk = NULL; } return sk; } /* bpf_skc_lookup performs the core lookup for different types of sockets, * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. */ static struct sock * __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = NULL; struct net *net; u8 family; if (len == sizeof(tuple->ipv4)) family = AF_INET; else if (len == sizeof(tuple->ipv6)) family = AF_INET6; else return NULL; if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (sdif < 0) { if (family == AF_INET) sdif = inet_sdif(skb); else sdif = inet6_sdif(skb); } if ((s32)netns_id < 0) { net = caller_net; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } out: return sk; } static struct sock * __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, sdif); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ if (!sk_fullsock(sk2)) sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } static struct sock * bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct net *caller_net; int ifindex; if (skb->dev) { caller_net = dev_net(skb->dev); ifindex = skb->dev->ifindex; } else { caller_net = sock_net(skb->sk); ifindex = 0; } return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, -1); } static struct sock * bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ if (!sk_fullsock(sk2)) sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .func = bpf_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .func = bpf_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .func = bpf_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { .func = bpf_tc_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { .func = bpf_tc_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { .func = bpf_tc_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_release, struct sock *, sk) { if (sk && sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .func = bpf_xdp_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .func = bpf_xdp_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .func = bpf_xdp_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .func = bpf_sock_addr_skc_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .func = bpf_sock_addr_sk_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_UDP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .func = bpf_sock_addr_sk_lookup_udp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, icsk_retransmits)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_tcp_sock, bytes_received): case offsetof(struct bpf_tcp_sock, bytes_acked): return size == sizeof(__u64); default: return size == sizeof(__u32); } } u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_TCP_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct tcp_sock, FIELD)); \ } while (0) #define BPF_INET_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \ FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct inet_connection_sock, \ FIELD), \ si->dst_reg, si->src_reg, \ offsetof( \ struct inet_connection_sock, \ FIELD)); \ } while (0) BTF_TYPE_EMIT(struct bpf_tcp_sock); switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct tcp_sock, rtt_min) + offsetof(struct minmax_sample, v)); break; case offsetof(struct bpf_tcp_sock, snd_cwnd): BPF_TCP_SOCK_GET_COMMON(snd_cwnd); break; case offsetof(struct bpf_tcp_sock, srtt_us): BPF_TCP_SOCK_GET_COMMON(srtt_us); break; case offsetof(struct bpf_tcp_sock, snd_ssthresh): BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); break; case offsetof(struct bpf_tcp_sock, rcv_nxt): BPF_TCP_SOCK_GET_COMMON(rcv_nxt); break; case offsetof(struct bpf_tcp_sock, snd_nxt): BPF_TCP_SOCK_GET_COMMON(snd_nxt); break; case offsetof(struct bpf_tcp_sock, snd_una): BPF_TCP_SOCK_GET_COMMON(snd_una); break; case offsetof(struct bpf_tcp_sock, mss_cache): BPF_TCP_SOCK_GET_COMMON(mss_cache); break; case offsetof(struct bpf_tcp_sock, ecn_flags): BPF_TCP_SOCK_GET_COMMON(ecn_flags); break; case offsetof(struct bpf_tcp_sock, rate_delivered): BPF_TCP_SOCK_GET_COMMON(rate_delivered); break; case offsetof(struct bpf_tcp_sock, rate_interval_us): BPF_TCP_SOCK_GET_COMMON(rate_interval_us); break; case offsetof(struct bpf_tcp_sock, packets_out): BPF_TCP_SOCK_GET_COMMON(packets_out); break; case offsetof(struct bpf_tcp_sock, retrans_out): BPF_TCP_SOCK_GET_COMMON(retrans_out); break; case offsetof(struct bpf_tcp_sock, total_retrans): BPF_TCP_SOCK_GET_COMMON(total_retrans); break; case offsetof(struct bpf_tcp_sock, segs_in): BPF_TCP_SOCK_GET_COMMON(segs_in); break; case offsetof(struct bpf_tcp_sock, data_segs_in): BPF_TCP_SOCK_GET_COMMON(data_segs_in); break; case offsetof(struct bpf_tcp_sock, segs_out): BPF_TCP_SOCK_GET_COMMON(segs_out); break; case offsetof(struct bpf_tcp_sock, data_segs_out): BPF_TCP_SOCK_GET_COMMON(data_segs_out); break; case offsetof(struct bpf_tcp_sock, lost_out): BPF_TCP_SOCK_GET_COMMON(lost_out); break; case offsetof(struct bpf_tcp_sock, sacked_out): BPF_TCP_SOCK_GET_COMMON(sacked_out); break; case offsetof(struct bpf_tcp_sock, bytes_received): BPF_TCP_SOCK_GET_COMMON(bytes_received); break; case offsetof(struct bpf_tcp_sock, bytes_acked): BPF_TCP_SOCK_GET_COMMON(bytes_acked); break; case offsetof(struct bpf_tcp_sock, dsack_dups): BPF_TCP_SOCK_GET_COMMON(dsack_dups); break; case offsetof(struct bpf_tcp_sock, delivered): BPF_TCP_SOCK_GET_COMMON(delivered); break; case offsetof(struct bpf_tcp_sock, delivered_ce): BPF_TCP_SOCK_GET_COMMON(delivered_ce); break; case offsetof(struct bpf_tcp_sock, icsk_retransmits): BPF_INET_SOCK_GET_COMMON(icsk_retransmits); break; } return insn - insn_buf; } BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; } static const struct bpf_func_proto bpf_get_listener_sock_proto = { .func = bpf_get_listener_sock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): iphdr_len = sizeof(struct iphdr); break; case cpu_to_be16(ETH_P_IPV6): iphdr_len = sizeof(struct ipv6hdr); break; default: return 0; } if (skb_headlen(skb) < iphdr_len) return 0; if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) return 0; return INET_ECN_set_ce(skb); } bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) return false; if (off % size != 0) return false; switch (off) { default: return size == sizeof(__u32); } } u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_XDP_SOCK_GET(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \ sizeof_field(struct bpf_xdp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct xdp_sock, FIELD)); \ } while (0) switch (si->off) { case offsetof(struct bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); break; } return insn - insn_buf; } static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES int ret; if (unlikely(!sk || th_len < sizeof(*th))) return -EINVAL; /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -EINVAL; if (!th->ack || th->rst || th->syn) return -ENOENT; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th); break; #if IS_BUILTIN(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; ret = __cookie_v6_check((struct ipv6hdr *)iph, th); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (ret > 0) return 0; return -ENOENT; #else return -ENOTSUPP; #endif } static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .func = bpf_tcp_check_syncookie, .gpl_only = true, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES u32 cookie; u16 mss; if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -ENOENT; if (!th->syn || th->ack || th->fin || th->rst) return -EINVAL; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); break; #if IS_BUILTIN(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (mss == 0) return -ENOENT; return cookie | ((u64)mss << 32); #else return -EOPNOTSUPP; #endif /* CONFIG_SYN_COOKIES */ } static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .func = bpf_tcp_gen_syncookie, .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) { if (!sk || flags != 0) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; if (sk_unhashed(sk)) return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; skb_orphan(skb); skb->sk = sk; skb->destructor = sock_pfree; return 0; } static const struct bpf_func_proto bpf_sk_assign_proto = { .func = bpf_sk_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_ANYTHING, }; static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, u8 search_kind, const u8 *magic, u8 magic_len, bool *eol) { u8 kind, kind_len; *eol = false; while (op < opend) { kind = op[0]; if (kind == TCPOPT_EOL) { *eol = true; return ERR_PTR(-ENOMSG); } else if (kind == TCPOPT_NOP) { op++; continue; } if (opend - op < 2 || opend - op < op[1] || op[1] < 2) /* Something is wrong in the received header. * Follow the TCP stack's tcp_parse_options() * and just bail here. */ return ERR_PTR(-EFAULT); kind_len = op[1]; if (search_kind == kind) { if (!magic_len) return op; if (magic_len > kind_len - 2) return ERR_PTR(-ENOMSG); if (!memcmp(&op[2], magic, magic_len)) return op; } op += kind_len; } return ERR_PTR(-ENOMSG); } BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, void *, search_res, u32, len, u64, flags) { bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; const u8 *op, *opend, *magic, *search = search_res; u8 search_kind, search_len, copy_len, magic_len; int ret; /* 2 byte is the minimal option len except TCPOPT_NOP and * TCPOPT_EOL which are useless for the bpf prog to learn * and this helper disallow loading them also. */ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) return -EINVAL; search_kind = search[0]; search_len = search[1]; if (search_len > len || search_kind == TCPOPT_NOP || search_kind == TCPOPT_EOL) return -EINVAL; if (search_kind == TCPOPT_EXP || search_kind == 253) { /* 16 or 32 bit magic. +2 for kind and kind length */ if (search_len != 4 && search_len != 6) return -EINVAL; magic = &search[2]; magic_len = search_len - 2; } else { if (search_len) return -EINVAL; magic = NULL; magic_len = 0; } if (load_syn) { ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); if (ret < 0) return ret; opend = op + ret; op += sizeof(struct tcphdr); } else { if (!bpf_sock->skb || bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) /* This bpf_sock->op cannot call this helper */ return -EPERM; opend = bpf_sock->skb_data_end; op = bpf_sock->skb->data + sizeof(struct tcphdr); } op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, &eol); if (IS_ERR(op)) return PTR_ERR(op); copy_len = op[1]; ret = copy_len; if (copy_len > len) { ret = -ENOSPC; copy_len = len; } memcpy(search_res, op, copy_len); return ret; } static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { .func = bpf_sock_ops_load_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, const void *, from, u32, len, u64, flags) { u8 new_kind, new_kind_len, magic_len = 0, *opend; const u8 *op, *new_op, *magic = NULL; struct sk_buff *skb; bool eol; if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) return -EPERM; if (len < 2 || flags) return -EINVAL; new_op = from; new_kind = new_op[0]; new_kind_len = new_op[1]; if (new_kind_len > len || new_kind == TCPOPT_NOP || new_kind == TCPOPT_EOL) return -EINVAL; if (new_kind_len > bpf_sock->remaining_opt_len) return -ENOSPC; /* 253 is another experimental kind */ if (new_kind == TCPOPT_EXP || new_kind == 253) { if (new_kind_len < 4) return -EINVAL; /* Match for the 2 byte magic also. * RFC 6994: the magic could be 2 or 4 bytes. * Hence, matching by 2 byte only is on the * conservative side but it is the right * thing to do for the 'search-for-duplication' * purpose. */ magic = &new_op[2]; magic_len = 2; } /* Check for duplication */ skb = bpf_sock->skb; op = skb->data + sizeof(struct tcphdr); opend = bpf_sock->skb_data_end; op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, &eol); if (!IS_ERR(op)) return -EEXIST; if (PTR_ERR(op) != -ENOMSG) return PTR_ERR(op); if (eol) /* The option has been ended. Treat it as no more * header option can be written. */ return -ENOSPC; /* No duplication found. Store the header option. */ memcpy(opend, from, new_kind_len); bpf_sock->remaining_opt_len -= new_kind_len; bpf_sock->skb_data_end += new_kind_len; return 0; } static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .func = bpf_sock_ops_store_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u32, len, u64, flags) { if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) return -EPERM; if (flags || len < 2) return -EINVAL; if (len > bpf_sock->remaining_opt_len) return -ENOSPC; bpf_sock->remaining_opt_len -= len; return 0; } static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .func = bpf_sock_ops_reserve_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, u64, tstamp, u32, tstamp_type) { /* skb_clear_delivery_time() is done for inet protocol */ if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -EOPNOTSUPP; switch (tstamp_type) { case BPF_SKB_CLOCK_REALTIME: skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_REALTIME; break; case BPF_SKB_CLOCK_MONOTONIC: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; case BPF_SKB_CLOCK_TAI: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_TAI; break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { .func = bpf_skb_set_tstamp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SYN_COOKIES BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th, u32, th_len) { u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; cookie = __cookie_v4_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th, u32, th_len) { #if IS_BUILTIN(CONFIG_IPV6) const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; cookie = __cookie_v6_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th) { if (__cookie_v4_check(iph, th) > 0) return 0; return -EACCES; } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { .func = bpf_tcp_raw_check_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg2_size = sizeof(struct tcphdr), }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th) { #if IS_BUILTIN(CONFIG_IPV6) if (__cookie_v6_check(iph, th) > 0) return 0; return -EACCES; #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { .func = bpf_tcp_raw_check_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg2_size = sizeof(struct tcphdr), }; #endif /* CONFIG_SYN_COOKIES */ #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || func == bpf_skb_store_bytes || func == bpf_skb_change_proto || func == bpf_skb_change_head || func == sk_skb_change_head || func == bpf_skb_change_tail || func == sk_skb_change_tail || func == bpf_skb_adjust_room || func == sk_skb_adjust_room || func == bpf_skb_pull_data || func == sk_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_msg_push_data || func == bpf_msg_pop_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || #endif #ifdef CONFIG_INET func == bpf_sock_ops_store_hdr_opt || #endif func == bpf_lwt_in_push_encap || func == bpf_lwt_xmit_push_encap) return true; return false; } const struct bpf_func_proto bpf_event_output_data_proto __weak; const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; func_proto = cgroup_current_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; func_proto = cgroup_current_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_bind_proto; default: return NULL; } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sock_addr_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; } default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_storage_get_proto __weak; const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; case BPF_FUNC_sk_cgroup_id: return &bpf_sk_cgroup_id_proto; case BPF_FUNC_sk_ancestor_cgroup_id: return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); } } static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_skb_vlan_push: return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; case BPF_FUNC_skb_change_proto: return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; case BPF_FUNC_skb_adjust_room: return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; case BPF_FUNC_redirect_peer: return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_set_hash: return &bpf_set_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_skb_cgroup_classid: return &bpf_skb_cgroup_classid_proto; #endif #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_tc_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_tc_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_tc_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; case BPF_FUNC_skb_set_tstamp: return &bpf_skb_set_tstamp_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_proto; case BPF_FUNC_xdp_load_bytes: return &bpf_xdp_load_bytes_proto; case BPF_FUNC_xdp_store_bytes: return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; case BPF_FUNC_sk_lookup_tcp: return &bpf_xdp_sk_lookup_tcp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The * kfuncs are defined in two different modules, and we want to be able * to use them interchangeably with the same BTF type ID. Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. So we add this dummy type reference which will * be included in vmlinux BTF, allowing both modules to refer to the * same type ID. */ BTF_TYPE_EMIT(struct nf_conn___init); #endif } const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; case BPF_FUNC_store_hdr_opt: return &bpf_sock_ops_store_hdr_opt_proto; case BPF_FUNC_reserve_hdr_opt: return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_redirect_hash: return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; case BPF_FUNC_skb_adjust_room: return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_lwt_push_encap: return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; #endif default: return lwt_out_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; break; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, hwtstamp): if (type == BPF_WRITE || size != sizeof(__u64)) return false; break; case bpf_ctx_range(struct __sk_buff, tstamp): if (size != sizeof(__u64)) return false; break; case offsetof(struct __sk_buff, sk): if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; case offsetof(struct __sk_buff, tstamp_type): return false; case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { if (size != size_default) return false; } else { bpf_ctx_record_field_size(info, size_default); if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } } return true; } static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool cg_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, enum bpf_attach_type attach_type) { switch (off) { case offsetof(struct bpf_sock, bound_dev_if): case offsetof(struct bpf_sock, mark): case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; } case bpf_ctx_range(struct bpf_sock, src_ip4): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): switch (attach_type) { case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range(struct bpf_sock, src_port): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } } read_only: return access_type == BPF_READ; full_access: return true; } bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range_till(struct bpf_sock, type, priority): return false; default: return bpf_sock_is_valid_access(off, size, type, info); } } bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_sock, state): case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case bpf_ctx_range(struct bpf_sock, dst_port): field_size = size == size_default ? size_default : sizeof_field(struct bpf_sock, dst_port); bpf_ctx_record_field_size(info, field_size); return bpf_ctx_narrow_access_ok(off, size, field_size); case offsetofend(struct bpf_sock, dst_port) ... offsetof(struct bpf_sock, dst_ip4) - 1: return false; } return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_sock_is_valid_access(off, size, type, info)) return false; return __sock_filter_check_attach_type(off, type, prog->expected_attach_type); } static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { /* Neither direct read nor direct write requires any preliminary * action. */ return 0; } static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; if (!direct_write) return 0; /* if (!skb->cloned) * goto start; * * (Fast-path, otherwise approximation that we might be * a clone, do the rest in helper.) */ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); /* ret = bpf_skb_pull_data(skb, 0); */ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_pull_data); /* if (!ret) * goto restore; * return TC_ACT_SHOT; */ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); *insn++ = BPF_EXIT_INSN(); /* restore: */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); /* start: */ *insn++ = prog->insnsi[0]; return insn - insn_buf; } static int bpf_gen_ld_abs(const struct bpf_insn *orig, struct bpf_insn *insn_buf) { bool indirect = BPF_MODE(orig->code) == BPF_IND; struct bpf_insn *insn = insn_buf; if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); } else { *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); if (orig->imm) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); } /* We're guaranteed here that CTX is in R6. */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); switch (BPF_SIZE(orig->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); break; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); *insn++ = BPF_EXIT_INSN(); return insn - insn_buf; } static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); } static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; case offsetof(struct __sk_buff, tstamp_type): /* The convert_ctx_access() on reading and writing * __sk_buff->tstamp depends on whether the bpf prog * has used __sk_buff->tstamp_type or not. * Thus, we need to set prog->tstamp_type_access * earlier during is_valid_access() here. */ ((struct bpf_prog *)prog)->tstamp_type_access = 1; return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); } DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) return false; if (off % size != 0) return false; if (size != sizeof(__u32)) return false; return true; } static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (prog->expected_attach_type != BPF_XDP_DEVMAP) { switch (off) { case offsetof(struct xdp_md, egress_ifindex): return false; } } if (type == BPF_WRITE) { if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); } } return false; } switch (off) { case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return __is_valid_xdp_access(off, size); } void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", act > act_max ? "Illegal" : "Driver unsupported", act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_addr)) return false; if (off % size != 0) return false; /* Disallow access to fields not belonging to the attach type's address * family. */ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; default: return false; } break; case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP4_SENDMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP6_SENDMSG: break; default: return false; } break; } switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (size != size_default) return false; } break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; default: if (type == BPF_READ) { if (size != size_default) return false; } else { return false; } } return true; } static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; default: return false; } } else { switch (off) { case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, bytes_acked): if (size != sizeof(__u64)) return false; break; case offsetof(struct bpf_sock_ops, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; case offsetof(struct bpf_sock_ops, skb_data): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; case offsetof(struct bpf_sock_ops, skb_data_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case offsetof(struct bpf_sock_ops, skb_hwtstamp): if (size != sizeof(__u64)) return false; break; default: if (size != size_default) return false; break; } } return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); } static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, mark): return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; case offsetof(struct sk_msg_md, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct sk_msg_md, remote_port): case bpf_ctx_range(struct sk_msg_md, local_port): case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; break; default: return false; } return true; } static bool flow_dissector_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; if (type == BPF_WRITE) return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): if (size != size_default) return false; info->reg_type = PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_FLOW_KEYS; return true; default: return false; } } static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data)); break; case offsetof(struct __sk_buff, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data_end)); break; case offsetof(struct __sk_buff, flow_keys): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, flow_keys)); break; } return insn - insn_buf; } static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); #else BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); #endif return insn; } static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, struct bpf_insn *insn) { /* si->dst_reg = skb_shinfo(SKB); */ #ifdef NET_SKBUFF_DATA_USES_OFFSET *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), BPF_REG_AX, skb_reg, offsetof(struct sk_buff, end)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), dst_reg, skb_reg, offsetof(struct sk_buff, head)); *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX); #else *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), dst_reg, skb_reg, offsetof(struct sk_buff, end)); #endif return insn; } static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. */ if (!prog->tstamp_type_access) { /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* check if ingress mask bits is set */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); *insn++ = BPF_JMP_A(4); *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); *insn++ = BPF_JMP_A(2); /* skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); *insn++ = BPF_JMP_A(1); } #endif *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, offsetof(struct sk_buff, tstamp)); return insn; } static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the * skb->tstamp_type bit also. */ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* Writing __sk_buff->tstamp as ingress, goto <clear> */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(2); /* <clear>: skb->tstamp_type */ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif /* <store>: skb->tstamp = tstamp */ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); return insn; } #define BPF_EMIT_STORE(size, si, off) \ BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \ (si)->dst_reg, (si)->src_reg, (off), (si)->imm) static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, len): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, len, 4, target_size)); break; case offsetof(struct __sk_buff, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, protocol, 2, target_size)); break; case offsetof(struct __sk_buff, vlan_proto): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_proto, 2, target_size)); break; case offsetof(struct __sk_buff, priority): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, priority, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, priority, 4, target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, skb_iif, 4, target_size)); break; case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; case offsetof(struct __sk_buff, hash): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, hash, 4, target_size)); break; case offsetof(struct __sk_buff, mark): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, mark, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, mark, 4, target_size)); break; case offsetof(struct __sk_buff, pkt_type): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); #endif break; case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { *insn++ = BPF_JMP_A(0); /* noop */ break; } if (BPF_CLASS(si->code) == BPF_STX) *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_EMIT_STORE(BPF_H, si, off); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, queue_mapping, 2, target_size)); } break; case offsetof(struct __sk_buff, vlan_present): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_all, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1); break; case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2); off = si->off; off -= offsetof(struct __sk_buff, tc_classid); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, tc_classid); *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, off); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_meta): off = si->off; off -= offsetof(struct __sk_buff, data_meta); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_meta); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_end); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); #else *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, napi_id): #if defined(CONFIG_NET_RX_BUSY_POLL) *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, napi_id, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else *target_size = 4; *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_family, 2, target_size)); break; case offsetof(struct __sk_buff, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_daddr, 4, target_size)); break; case offsetof(struct __sk_buff, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, 4, target_size)); break; case offsetof(struct __sk_buff, remote_ip6[0]) ... offsetof(struct __sk_buff, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, local_ip6[0]) ... offsetof(struct __sk_buff, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_dport, 2, target_size)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct __sk_buff, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; case offsetof(struct __sk_buff, tstamp): BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) insn = bpf_convert_tstamp_write(prog, si, insn); else insn = bpf_convert_tstamp_read(prog, si, insn); break; case offsetof(struct __sk_buff, tstamp_type): insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_segs, 2, target_size)); break; case offsetof(struct __sk_buff, gso_size): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_size, 2, target_size)); break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); off = si->off; off -= offsetof(struct __sk_buff, wire_len); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, pkt_len); *target_size = 4; *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); break; case offsetof(struct __sk_buff, hwtstamp): BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, hwtstamps, 8, target_size)); break; } return insn - insn_buf; } u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_bound_dev_if)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, mark): BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_mark)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_mark)); break; case offsetof(struct bpf_sock, priority): BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_priority)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_priority)); break; case offsetof(struct bpf_sock, family): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_family), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_family, sizeof_field(struct sock_common, skc_family), target_size)); break; case offsetof(struct bpf_sock, type): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_type), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_type, sizeof_field(struct sock, sk_type), target_size)); break; case offsetof(struct bpf_sock, protocol): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_protocol), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_protocol, sizeof_field(struct sock, sk_protocol), target_size)); break; case offsetof(struct bpf_sock, src_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, sizeof_field(struct sock_common, skc_rcv_saddr), target_size)); break; case offsetof(struct bpf_sock, dst_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_daddr, sizeof_field(struct sock_common, skc_daddr), target_size)); break; case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, src_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off( struct sock_common, skc_v6_rcv_saddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]), target_size) + off); #else (void)off; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, dst_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_v6_daddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]), target_size) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); *target_size = 4; #endif break; case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_num, sizeof_field(struct sock_common, skc_num), target_size)); break; case offsetof(struct bpf_sock, dst_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_dport), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_dport, sizeof_field(struct sock_common, skc_dport), target_size)); break; case offsetof(struct bpf_sock, state): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_state), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_state, sizeof_field(struct sock_common, skc_state), target_size)); break; case offsetof(struct bpf_sock, rx_queue_mapping): #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_rx_queue_mapping, sizeof_field(struct sock, sk_rx_queue_mapping), target_size)); *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); #else *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); *target_size = 2; #endif break; } return insn - insn_buf; } static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 xdp_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_meta): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_meta)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; case offsetof(struct xdp_md, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, queue_index)); break; case offsetof(struct xdp_md, egress_ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, txq)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), si->dst_reg, si->dst_reg, offsetof(struct xdp_txq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; } return insn - insn_buf; } /* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of * context Structure, F is Field in context structure that contains a pointer * to Nested Structure of type NS that has the field NF. * * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make * sure that SIZE is not greater than actual size of S.F.NF. * * If offset OFF is provided, the load happens from that offset relative to * offset of NF. */ #define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ do { \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ si->src_reg, offsetof(S, F)); \ *insn++ = BPF_LDX_MEM( \ SIZE, si->dst_reg, si->dst_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF); \ } while (0) #define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ BPF_FIELD_SIZEOF(NS, NF), 0) /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. * * In addition it uses Temporary Field TF (member of struct S) as the 3rd * "register" since two registers available in convert_ctx_access are not * enough: we can't override neither SRC, since it contains value to store, nor * DST since it contains pointer to context that may be used by later * instructions. But we need a temporary place to save pointer to nested * structure whose field we want to store to. */ #define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \ tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF, \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ offsetof(S, TF)); \ } while (0) #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ TF) \ do { \ if (type == BPF_WRITE) { \ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ OFF, TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ } \ } while (0) #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sockaddr, uaddr, sa_family); break; case offsetof(struct bpf_sock_addr, user_ip4): SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, sin_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, user_ip6[0]); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, user_port): /* To get port we need to know sa_family first and then treat * sockaddr as either sockaddr_in or sockaddr_in6. * Though we can simplify since port field has same offset and * size in both structures. * Here we check this invariant and use just one of the * structures if it's true. */ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); /* Account for sin6_port being smaller than user_port. */ port_size = min(port_size, BPF_LDST_BYTES(si)); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_family); break; case offsetof(struct bpf_sock_addr, type): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_type); break; case offsetof(struct bpf_sock_addr, protocol): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_protocol); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): /* Treat t_ctx as struct in_addr for msg_src_ip4. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in_addr, t_ctx, s_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_addr_kern, sk)); break; } return insn - insn_buf; } static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ OBJ_FIELD), \ si->dst_reg, si->dst_reg, \ offsetof(OBJ, OBJ_FIELD)); \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(1); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } \ } while (0) #define SOCK_OPS_GET_SK() \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(1); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } \ } while (0) #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) /* Helper macro for adding write access to tcp_sock or sock fields. * The macro is called with two registers, dst_reg which contains a pointer * to ctx (context) and src_reg which contains the value that should be * stored. However, we need an additional register since we cannot overwrite * dst_reg because it may be used later in the program. * Instead we "borrow" one of the other register. We first save its value * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore * it at the end of the macro. */ #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int reg = BPF_REG_9; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ BPF_MEM | BPF_CLASS(si->code), \ reg, si->src_reg, \ offsetof(OBJ, OBJ_FIELD), \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } while (0) #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ do { \ if (TYPE == BPF_WRITE) \ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ else \ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) switch (si->off) { case offsetof(struct bpf_sock_ops, op): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, op), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, op)); break; case offsetof(struct bpf_sock_ops, replylong[0]) ... offsetof(struct bpf_sock_ops, replylong[3]): BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != sizeof_field(struct bpf_sock_ops_kern, reply)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != sizeof_field(struct bpf_sock_ops_kern, replylong)); off = si->off; off -= offsetof(struct bpf_sock_ops, replylong[0]); off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, off); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct bpf_sock_ops, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct bpf_sock_ops, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct bpf_sock_ops, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... offsetof(struct bpf_sock_ops, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, local_ip6[0]) ... offsetof(struct bpf_sock_ops, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct bpf_sock_ops, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct bpf_sock_ops, is_fullsock): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, is_fullsock), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, is_fullsock)); break; case offsetof(struct bpf_sock_ops, state): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_state)); break; case offsetof(struct bpf_sock_ops, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct tcp_sock, rtt_min) + sizeof_field(struct minmax_sample, t)); break; case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); break; case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); break; case offsetof(struct bpf_sock_ops, snd_ssthresh): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); break; case offsetof(struct bpf_sock_ops, rcv_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); break; case offsetof(struct bpf_sock_ops, snd_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); break; case offsetof(struct bpf_sock_ops, snd_una): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); break; case offsetof(struct bpf_sock_ops, mss_cache): SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); break; case offsetof(struct bpf_sock_ops, ecn_flags): SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); break; case offsetof(struct bpf_sock_ops, rate_delivered): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); break; case offsetof(struct bpf_sock_ops, rate_interval_us): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); break; case offsetof(struct bpf_sock_ops, packets_out): SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); break; case offsetof(struct bpf_sock_ops, retrans_out): SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); break; case offsetof(struct bpf_sock_ops, total_retrans): SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); break; case offsetof(struct bpf_sock_ops, segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); break; case offsetof(struct bpf_sock_ops, data_segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); break; case offsetof(struct bpf_sock_ops, segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); break; case offsetof(struct bpf_sock_ops, data_segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); break; case offsetof(struct bpf_sock_ops, lost_out): SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); break; case offsetof(struct bpf_sock_ops, sacked_out): SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); break; case offsetof(struct bpf_sock_ops, bytes_received): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); break; case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); break; case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; case offsetof(struct bpf_sock_ops, skb_data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb_data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb_data_end)); break; case offsetof(struct bpf_sock_ops, skb_data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct bpf_sock_ops, skb_len): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, len)); break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): off = offsetof(struct sk_buff, cb); off += offsetof(struct tcp_skb_cb, tcp_flags); *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, tcp_flags), si->dst_reg, si->dst_reg, off); break; case offsetof(struct bpf_sock_ops, skb_hwtstamp): { struct bpf_insn *jmp_on_null_skb; *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); /* Reserve one insn to test skb == NULL */ jmp_on_null_skb = insn++; insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, hwtstamps, 8, target_size)); *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, insn - jmp_on_null_skb - 1); break; } } return insn - insn_buf; } /* data_end = skb->data + skb_headlen() */ static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, struct bpf_insn *insn) { int reg; int temp_reg_off = offsetof(struct sk_buff, cb) + offsetof(struct sk_skb_cb, temp_reg); if (si->src_reg == si->dst_reg) { /* We need an extra register, choose and save a register. */ reg = BPF_REG_9; if (si->src_reg == reg || si->dst_reg == reg) reg--; if (si->src_reg == reg || si->dst_reg == reg) reg--; *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); } else { reg = si->dst_reg; } /* reg = skb->data */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), reg, si->src_reg, offsetof(struct sk_buff, data)); /* AX = skb->len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, len)); /* reg = skb->data + skb->len */ *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); /* AX = skb->data_len */ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), BPF_REG_AX, si->src_reg, offsetof(struct sk_buff, data_len)); /* reg = skb->data + skb->len - skb->data_len */ *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); if (si->src_reg == si->dst_reg) { /* Restore the saved register */ *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); *insn++ = BPF_MOV64_REG(si->dst_reg, reg); *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); } return insn; } static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): insn = bpf_convert_data_end_access(si, insn); break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct sk_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct sk_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #if IS_ENABLED(CONFIG_IPV6) int off; #endif /* convert ctx uses the fact sg element is first in struct */ BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data)); break; case offsetof(struct sk_msg_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct sk_msg_md, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct sk_msg_md, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct sk_msg_md, remote_ip6[0]) ... offsetof(struct sk_msg_md, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, local_ip6[0]) ... offsetof(struct sk_msg_md, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct sk_msg_md, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct sk_msg_md, size): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; case offsetof(struct sk_msg_md, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = cg_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_in_verifier_ops = { .get_func_proto = lwt_in_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_in_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_out_verifier_ops = { .get_func_proto = lwt_out_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, }; const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { .get_func_proto = lwt_seg6local_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_seg6local_prog_ops = { }; const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = bpf_sock_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_prog_ops = { }; const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { .get_func_proto = sock_addr_func_proto, .is_valid_access = sock_addr_is_valid_access, .convert_ctx_access = sock_addr_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_addr_prog_ops = { }; const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; const struct bpf_prog_ops sock_ops_prog_ops = { }; const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; const struct bpf_prog_ops sk_skb_prog_ops = { }; const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops sk_msg_prog_ops = { }; const struct bpf_verifier_ops flow_dissector_verifier_ops = { .get_func_proto = flow_dissector_func_proto, .is_valid_access = flow_dissector_is_valid_access, .convert_ctx_access = flow_dissector_convert_ctx_access, }; const struct bpf_prog_ops flow_dissector_prog_ops = { .test_run = bpf_prog_test_run_flow_dissector, }; int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; struct sk_filter *filter; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); ret = 0; } return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; int ret = 0; sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) goto out; /* We're copying the filter that has been originally attached, * so no conversion/decode needed anymore. eBPF programs that * have no original program cannot be dumped through this. */ ret = -EACCES; fprog = filter->prog->orig_prog; if (!fprog) goto out; ret = fprog->len; if (!len) /* User space only enquires number of filter blocks. */ goto out; ret = -EINVAL; if (len < fprog->len) goto out; ret = -EFAULT; if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number * of filter blocks. */ ret = fprog->len; out: sockopt_release_sock(sk); return ret; } #ifdef CONFIG_INET static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; reuse_kern->bind_inany = reuse->bind_inany; } struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; else return ERR_PTR(-ECONNREFUSED); } BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, struct bpf_map *, map, void *, key, u32, flags) { bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; selected_sk = map->ops->map_lookup_elem(map, key); if (!selected_sk) return -ENOENT; reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ if (sk_is_refcounted(selected_sk)) sock_put(selected_sk); /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. * * Other maps (e.g. sock_map) do not provide this guarantee and * the sk may never be in the reuseport group to begin with. */ return is_sockarray ? -ENOENT : -EINVAL; } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk = reuse_kern->sk; if (sk->sk_protocol != selected_sk->sk_protocol) return -EPROTOTYPE; else if (sk->sk_family != selected_sk->sk_family) return -EAFNOSUPPORT; /* Catch all. Likely bound to a different sockaddr. */ return -EBADFD; } reuse_kern->selected_sk = selected_sk; return 0; } static const struct bpf_func_proto sk_select_reuseport_proto = { .func = sk_select_reuseport, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(sk_reuseport_load_bytes, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len) { return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); } static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { .func = sk_reuseport_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(sk_reuseport_load_bytes_relative, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len, u32, start_header) { return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, len, start_header); } static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { .func = sk_reuseport_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_reuseport_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sk_select_reuseport: return &sk_select_reuseport_proto; case BPF_FUNC_skb_load_bytes: return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } } static bool sk_reuseport_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const u32 size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct sk_reuseport_md) || off % size || type != BPF_READ) return false; switch (off) { case offsetof(struct sk_reuseport_md, data): info->reg_type = PTR_TO_PACKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, data_end): info->reg_type = PTR_TO_PACKET_END; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, hash): return size == size_default; case offsetof(struct sk_reuseport_md, sk): info->reg_type = PTR_TO_SOCKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, migrating_sk): info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; return size == sizeof(__u64); /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) return false; fallthrough; case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): case bpf_ctx_range(struct sk_reuseport_md, bind_inany): case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); default: return false; } } #define SK_REUSEPORT_LOAD_FIELD(F) ({ \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ si->dst_reg, si->src_reg, \ bpf_target_off(struct sk_reuseport_kern, F, \ sizeof_field(struct sk_reuseport_kern, F), \ target_size)); \ }) #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ struct sk_buff, \ skb, \ SKB_FIELD) #define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ struct sock, \ sk, \ SK_FIELD) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct sk_reuseport_md, data): SK_REUSEPORT_LOAD_SKB_FIELD(data); break; case offsetof(struct sk_reuseport_md, len): SK_REUSEPORT_LOAD_SKB_FIELD(len); break; case offsetof(struct sk_reuseport_md, eth_protocol): SK_REUSEPORT_LOAD_SKB_FIELD(protocol); break; case offsetof(struct sk_reuseport_md, ip_protocol): SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); break; case offsetof(struct sk_reuseport_md, data_end): SK_REUSEPORT_LOAD_FIELD(data_end); break; case offsetof(struct sk_reuseport_md, hash): SK_REUSEPORT_LOAD_FIELD(hash); break; case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; case offsetof(struct sk_reuseport_md, sk): SK_REUSEPORT_LOAD_FIELD(sk); break; case offsetof(struct sk_reuseport_md, migrating_sk): SK_REUSEPORT_LOAD_FIELD(migrating_sk); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_reuseport_verifier_ops = { .get_func_proto = sk_reuseport_func_proto, .is_valid_access = sk_reuseport_is_valid_access, .convert_ctx_access = sk_reuseport_convert_ctx_access, }; const struct bpf_prog_ops sk_reuseport_prog_ops = { }; DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); EXPORT_SYMBOL(bpf_sk_lookup_enabled); BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, struct sock *, sk, u64, flags) { if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | BPF_SK_LOOKUP_F_NO_REUSEPORT))) return -EINVAL; if (unlikely(sk && sk_is_refcounted(sk))) return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ /* Check if socket is suitable for packet L3/L4 protocol */ if (sk && sk->sk_protocol != ctx->protocol) return -EPROTOTYPE; if (sk && sk->sk_family != ctx->family && (sk->sk_family == AF_INET || ipv6_only_sock(sk))) return -EAFNOSUPPORT; if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) return -EEXIST; /* Select socket as lookup result */ ctx->selected_sk = sk; ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; return 0; } static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { .func = bpf_sk_lookup_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, .arg3_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_assign: return &bpf_sk_lookup_assign_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static bool sk_lookup_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) return false; if (off % size != 0) return false; if (type != BPF_READ) return false; switch (off) { case offsetof(struct bpf_sk_lookup, sk): info->reg_type = PTR_TO_SOCKET_OR_NULL; return size == sizeof(__u64); case bpf_ctx_range(struct bpf_sk_lookup, family): case bpf_ctx_range(struct bpf_sk_lookup, protocol): case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct bpf_sk_lookup, local_port): case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); case bpf_ctx_range(struct bpf_sk_lookup, remote_port): /* Allow 4-byte access to 2-byte field for backward compatibility */ if (size == sizeof(__u32)) return true; bpf_ctx_record_field_size(info, sizeof(__be16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); case offsetofend(struct bpf_sk_lookup, remote_port) ... offsetof(struct bpf_sk_lookup, local_ip4) - 1: /* Allow access to zero padding for backward compatibility */ bpf_ctx_record_field_size(info, sizeof(__u16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); default: return false; } } static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sk_lookup, sk): *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, selected_sk)); break; case offsetof(struct bpf_sk_lookup, family): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, family, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, protocol, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, remote_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.saddr, 4, target_size)); break; case offsetof(struct bpf_sk_lookup, local_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.daddr, 4, target_size)); break; case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.saddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.daddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case offsetof(struct bpf_sk_lookup, remote_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, sport, 2, target_size)); break; case offsetofend(struct bpf_sk_lookup, remote_port): *target_size = 2; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); break; case offsetof(struct bpf_sk_lookup, local_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, dport, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, ingress_ifindex, 4, target_size)); break; } return insn - insn_buf; } const struct bpf_prog_ops sk_lookup_prog_ops = { .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { .get_func_proto = sk_lookup_func_proto, .is_valid_access = sk_lookup_is_valid_access, .convert_ctx_access = sk_lookup_convert_ctx_access, }; #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { /* tcp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .func = bpf_skc_to_tcp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { .func = bpf_skc_to_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], }; BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) { /* BTF types for tcp_timewait_sock and inet_timewait_sock are not * generated if CONFIG_INET=n. Trigger an explicit generation here. */ BTF_TYPE_EMIT(struct inet_timewait_sock); BTF_TYPE_EMIT(struct tcp_timewait_sock); #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif #if IS_BUILTIN(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { .func = bpf_skc_to_tcp_timewait_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], }; BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) { #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif #if IS_BUILTIN(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .func = bpf_skc_to_tcp_request_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) { /* udp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct udp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .func = bpf_skc_to_udp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) { /* unix_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct unix_sock); if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .func = bpf_skc_to_unix_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) { BTF_TYPE_EMIT(struct mptcp_sock); return (unsigned long)bpf_mptcp_sock_from_subflow(sk); } const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { .func = bpf_skc_to_mptcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], }; BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); } BTF_ID_LIST(bpf_sock_from_file_btf_ids) BTF_ID(struct, socket) BTF_ID(struct, file) const struct bpf_func_proto bpf_sock_from_file_proto = { .func = bpf_sock_from_file, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = &bpf_sock_from_file_btf_ids[0], .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], }; static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; switch (func_id) { case BPF_FUNC_skc_to_tcp6_sock: func = &bpf_skc_to_tcp6_sock_proto; break; case BPF_FUNC_skc_to_tcp_sock: func = &bpf_skc_to_tcp_sock_proto; break; case BPF_FUNC_skc_to_tcp_timewait_sock: func = &bpf_skc_to_tcp_timewait_sock_proto; break; case BPF_FUNC_skc_to_tcp_request_sock: func = &bpf_skc_to_tcp_request_sock_proto; break; case BPF_FUNC_skc_to_udp6_sock: func = &bpf_skc_to_udp6_sock_proto; break; case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; case BPF_FUNC_skc_to_mptcp_sock: func = &bpf_skc_to_mptcp_sock_proto; break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; } __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct sk_buff *skb = (struct sk_buff *)s; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); return 0; } __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct xdp_buff *xdp = (struct xdp_buff *)x; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); return 0; } __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, const u8 *sun_path, u32 sun_path__sz) { struct sockaddr_un *un; if (sa_kern->sk->sk_family != AF_UNIX) return -EINVAL; /* We do not allow changing the address to unnamed or larger than the * maximum allowed address size for a unix sockaddr. */ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) return -EINVAL; un = (struct sockaddr_un *)sa_kern->uaddr; memcpy(un->sun_path, sun_path, sun_path__sz); sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; return 0; } __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, struct bpf_tcp_req_attrs *attrs, int attrs__sz) { #if IS_ENABLED(CONFIG_SYN_COOKIES) struct sk_buff *skb = (struct sk_buff *)s; const struct request_sock_ops *ops; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; struct net *net; __u16 min_mss; u32 tsoff = 0; if (attrs__sz != sizeof(*attrs) || attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EINVAL; net = dev_net(skb->dev); if (net != sock_net(sk)) return -ENETUNREACH; switch (skb->protocol) { case htons(ETH_P_IP): ops = &tcp_request_sock_ops; min_mss = 536; break; #if IS_BUILTIN(CONFIG_IPV6) case htons(ETH_P_IPV6): ops = &tcp6_request_sock_ops; min_mss = IPV6_MIN_MTU - 60; break; #endif default: return -EINVAL; } if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || sk_is_mptcp(sk)) return -EINVAL; if (attrs->mss < min_mss) return -EINVAL; if (attrs->wscale_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) return -EINVAL; if (attrs->snd_wscale > TCP_MAX_WSCALE || attrs->rcv_wscale > TCP_MAX_WSCALE) return -EINVAL; } if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return -EINVAL; if (attrs->tstamp_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return -EINVAL; tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); } req = inet_reqsk_alloc(ops, sk, false); if (!req) return -ENOMEM; ireq = inet_rsk(req); treq = tcp_rsk(req); req->rsk_listener = sk; req->syncookie = 1; req->mss = attrs->mss; req->ts_recent = attrs->rcv_tsval; ireq->snd_wscale = attrs->snd_wscale; ireq->rcv_wscale = attrs->rcv_wscale; ireq->tstamp_ok = !!attrs->tstamp_ok; ireq->sack_ok = !!attrs->sack_ok; ireq->wscale_ok = !!attrs->wscale_ok; ireq->ecn_ok = !!attrs->ecn_ok; treq->req_usec_ts = !!attrs->usec_ts_ok; treq->ts_off = tsoff; skb_orphan(skb); skb->sk = req_to_sk(req); skb->destructor = sock_pfree; return 0; #else return -EOPNOTSUPP; #endif } __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; int err; err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); if (err) return err; bpf_dynptr_set_rdonly(ptr); return 0; } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, }; static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, }; static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_sock_addr, }; static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_tcp_reqsk, }; static int __init bpf_kfunc_init(void) { int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); } late_initcall(bpf_kfunc_init); __bpf_kfunc_start_defs(); /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. * * The function expects a non-NULL pointer to a socket, and invokes the * protocol specific socket destroy handlers. * * The helper can only be called from BPF contexts that have acquired the socket * locks. * * Parameters: * @sock: Pointer to socket to be destroyed * * Return: * On error, may return EPROTONOSUPPORT, EINVAL. * EPROTONOSUPPORT if protocol specific destroy handler is not supported. * 0 otherwise */ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) { struct sock *sk = (struct sock *)sock; /* The locking semantics that allow for synchronous execution of the * destroy handlers are only supported for TCP and UDP. * Supporting protocols will need to acquire sock lock in the BPF context * prior to invoking this kfunc. */ if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_UDP)) return -EOPNOTSUPP; return sk->sk_prot->diag_destroy(sk, ECONNABORTED); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && prog->expected_attach_type != BPF_TRACE_ITER) return -EACCES; return 0; } static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_sk_iter_kfunc_ids, .filter = tracing_iter_filter, }; static int init_subsystem(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); } late_initcall(init_subsystem);
9 24 4 85 122 6 2 2 4 109 81 82 80 77 5 27 49 2 1 77 3 74 103 35 71 239 2 11 226 2 1 1 1 51 27 25 19 10 4 5 5 1 5 5 5 19 16 2 3 11 6 1 2 1 3 10 1 2 1 7 7 42 7 7 21 21 16 5 8 2 5 1 5 1 1 3 10 3 2 5 3 1 39 1 1 38 13 7 2 1 2 2 25 1 2 15 3 4 6 4 1 2 9 8 3 6 6 8 2 3 3 2 1 4 6 40 40 5 5 5 5 13 1 12 5 5 5 2 2 3 1 2 2 4 2 2 4 4 2 2 2 2 2 8 3 2 4 2 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/filter.h> #include <linux/perf_event.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/btf_ids.h> #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { int i; for (i = 0; i < array->map.max_entries; i++) { free_percpu(array->pptrs[i]); cond_resched(); } } static int bpf_array_alloc_percpu(struct bpf_array *array) { void __percpu *ptr; int i; for (i = 0; i < array->map.max_entries; i++) { ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, GFP_USER | __GFP_NOWARN); if (!ptr) { bpf_array_free_percpu(array); return -ENOMEM; } array->pptrs[i] = ptr; cond_resched(); } return 0; } /* Called from syscall */ int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && attr->map_flags & BPF_F_PRESERVE_ELEMS) return -EINVAL; /* avoid overflow on round_up(map->value_size) */ if (attr->value_size > INT_MAX) return -E2BIG; return 0; } static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL); u64 array_size, mask64; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); max_entries = attr->max_entries; /* On 32 bit archs roundup_pow_of_two() with max_entries that has * upper most bit set in u32 space is undefined behavior due to * resulting 1U << 32, so do it manually here in u64 space. */ mask64 = fls_long(max_entries - 1); mask64 = 1ULL << mask64; mask64 -= 1; index_mask = mask64; if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ max_entries = index_mask + 1; /* Check for overflows. */ if (max_entries < attr->max_entries) return ERR_PTR(-E2BIG); } array_size = sizeof(*array); if (percpu) { array_size += (u64) max_entries * sizeof(void *); } else { /* rely on vmalloc() to return page-aligned memory and * ensure array->value is exactly page-aligned */ if (attr->map_flags & BPF_F_MMAPABLE) { array_size = PAGE_ALIGN(array_size); array_size += PAGE_ALIGN((u64) max_entries * elem_size); } else { array_size += (u64) max_entries * elem_size; } } /* allocate all map elements and zero-initialize them */ if (attr->map_flags & BPF_F_MMAPABLE) { void *data; /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ data = bpf_map_area_mmapable_alloc(array_size, numa_node); if (!data) return ERR_PTR(-ENOMEM); array = data + PAGE_ALIGN(sizeof(struct bpf_array)) - offsetof(struct bpf_array, value); } else { array = bpf_map_area_alloc(array_size, numa_node); } if (!array) return ERR_PTR(-ENOMEM); array->index_mask = index_mask; array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } return &array->map; } static void *array_map_elem_ptr(struct bpf_array* array, u32 index) { return array->value + (u64)array->elem_size * index; } /* Called from syscall or from eBPF program */ static void *array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + (u64)array->elem_size * (index & array->index_mask); } static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { struct bpf_array *array = container_of(map, struct bpf_array, map); if (map->max_entries != 1) return -ENOTSUPP; if (off >= map->value_size) return -EINVAL; *imm = (unsigned long)array->value; return 0; } static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, u32 *off) { struct bpf_array *array = container_of(map, struct bpf_array, map); u64 base = (unsigned long)array->value; u64 range = array->elem_size; if (map->max_entries != 1) return -ENOTSUPP; if (imm < base || imm >= base + range) return -ENOENT; *off = imm - base; return 0; } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = array->elem_size; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; if (map->map_flags & BPF_F_INNER_MAP) return -EOPNOTSUPP; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); } else { *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); } *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } /* Called from eBPF program */ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return this_cpu_ptr(array->pptrs[index & array->index_mask]); } /* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */ static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; if (!bpf_jit_supports_percpu_insn()) return -EOPNOTSUPP; if (map->map_flags & BPF_F_INNER_MAP) return -EOPNOTSUPP; BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs)); *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5); } *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0); return insn - insn_buf; } static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (cpu >= nr_cpu_ids) return NULL; if (unlikely(index >= array->map.max_entries)) return NULL; return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu, off = 0; u32 size; if (unlikely(index >= array->map.max_entries)) return -ENOENT; /* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } rcu_read_unlock(); return 0; } /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { *next = 0; return 0; } if (index == array->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Called from syscall or from eBPF program */ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; char *val; if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; if (unlikely(map_flags & BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); if (map_flags & BPF_F_LOCK) copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); bpf_obj_free_fields(array->map.record, val); } return 0; } int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu, off = 0; u32 size; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; /* the user space will provide round_up(value_size, 8) bytes that * will be copied into per-cpu area. bpf programs can only access * value_size of it. During lookup the same extra bytes will be * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); return 0; } /* Called from syscall or from eBPF program */ static long array_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } static void *array_map_vmalloc_addr(struct bpf_array *array) { return (void *)round_down((unsigned long)array, PAGE_SIZE); } static void array_map_free_timers_wq(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. */ if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { for (i = 0; i < array->map.max_entries; i++) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); } } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; if (!IS_ERR_OR_NULL(map->record)) { if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { for (i = 0; i < array->map.max_entries; i++) { void __percpu *pptr = array->pptrs[i & array->index_mask]; int cpu; for_each_possible_cpu(cpu) { bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } } else { for (i = 0; i < array->map.max_entries; i++) bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i)); } } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); if (array->map.map_flags & BPF_F_MMAPABLE) bpf_map_area_free(array_map_vmalloc_addr(array)); else bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void *value; rcu_read_lock(); value = array_map_lookup_elem(map, key); if (!value) { rcu_read_unlock(); return; } if (map->btf_key_type_id) seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); rcu_read_unlock(); } static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu; rcu_read_lock(); seq_printf(m, "%u: {\n", *(u32 *)key); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); seq_puts(m, "\n"); } seq_puts(m, "}\n"); rcu_read_unlock(); } static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { u32 int_data; /* One exception for keyless BTF: .bss/.data/.rodata map */ if (btf_type_is_void(key_type)) { if (map->map_type != BPF_MAP_TYPE_ARRAY || map->max_entries != 1) return -EINVAL; if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) return -EINVAL; return 0; } if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); /* bpf array can only take a u32 key. This check makes sure * that the btf matches the attr used during map_create. */ if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; } static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_array *array = container_of(map, struct bpf_array, map); pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; if (!(map->map_flags & BPF_F_MMAPABLE)) return -EINVAL; if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > PAGE_ALIGN((u64)array->map.max_entries * array->elem_size)) return -EINVAL; return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), vma->vm_pgoff + pgoff); } static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { if (!bpf_map_meta_equal(meta0, meta1)) return false; return meta0->map_flags & BPF_F_INNER_MAP ? true : meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { struct bpf_map *map; void *percpu_value_buf; u32 index; }; static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_map *map = info->map; struct bpf_array *array; u32 index; if (info->index >= map->max_entries) return NULL; if (*pos == 0) ++*pos; array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; return array_map_elem_ptr(array, index); } static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_map *map = info->map; struct bpf_array *array; u32 index; ++*pos; ++info->index; if (info->index >= map->max_entries) return NULL; array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) return array->pptrs[index]; return array_map_elem_ptr(array, index); } static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_iter_meta meta; struct bpf_prog *prog; int off = 0, cpu = 0; void __percpu **pptr; u32 size; meta.seq = seq; prog = bpf_iter_get_info(&meta, v == NULL); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; if (!info->percpu_value_buf) { ctx.value = v; } else { pptr = v; size = array->elem_size; for_each_possible_cpu(cpu) { copy_map_value_long(map, info->percpu_value_buf + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, info->percpu_value_buf + off); off += size; } ctx.value = info->percpu_value_buf; } } return bpf_iter_run_prog(prog, &ctx); } static int bpf_array_map_seq_show(struct seq_file *seq, void *v) { return __bpf_array_map_seq_show(seq, v); } static void bpf_array_map_seq_stop(struct seq_file *seq, void *v) { if (!v) (void)__bpf_array_map_seq_show(seq, NULL); } static int bpf_iter_init_array_map(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; struct bpf_map *map = aux->map; struct bpf_array *array = container_of(map, struct bpf_array, map); void *value_buf; u32 buf_size; if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { buf_size = array->elem_size * num_possible_cpus(); value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); if (!value_buf) return -ENOMEM; seq_info->percpu_value_buf = value_buf; } /* bpf_iter_attach_map() acquires a map uref, and the uref may be * released before or in the middle of iterating map elements, so * acquire an extra map uref for iterator. */ bpf_map_inc_with_uref(map); seq_info->map = map; return 0; } static void bpf_iter_fini_array_map(void *priv_data) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } static const struct seq_operations bpf_array_map_seq_ops = { .start = bpf_array_map_seq_start, .next = bpf_array_map_seq_next, .stop = bpf_array_map_seq_stop, .show = bpf_array_map_seq_show, }; static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_array_map_seq_ops, .init_seq_private = bpf_iter_init_array_map, .fini_seq_private = bpf_iter_fini_array_map, .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags) { u32 i, key, num_elems = 0; struct bpf_array *array; bool is_percpu; u64 ret = 0; void *val; if (flags != 0) return -EINVAL; is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; array = container_of(map, struct bpf_array, map); if (is_percpu) migrate_disable(); for (i = 0; i < map->max_entries; i++) { if (is_percpu) val = this_cpu_ptr(array->pptrs[i]); else val = array_map_elem_ptr(array, i); num_elems++; key = i; ret = callback_fn((u64)(long)map, (u64)(long)&key, (u64)(long)val, (u64)(long)callback_ctx, 0); /* return value: 0 - continue, 1 - stop and return */ if (ret) break; } if (is_percpu) migrate_enable(); return num_elems; } static u64 array_map_mem_usage(const struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; u32 elem_size = array->elem_size; u64 entries = map->max_entries; u64 usage = sizeof(*array); if (percpu) { usage += entries * sizeof(void *); usage += entries * elem_size * num_possible_cpus(); } else { if (map->map_flags & BPF_F_MMAPABLE) { usage = PAGE_ALIGN(usage); usage += PAGE_ALIGN(entries * elem_size); } else { usage += entries * elem_size; } } return usage; } BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array) const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_release_uref = array_map_free_timers_wq, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, .map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_meta = array_map_direct_value_meta, .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; const struct bpf_map_ops percpu_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; /* Program read-only/write-only not supported for special maps yet. */ if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) return -EINVAL; return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); bpf_map_area_free(array); } static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } /* only called from syscall */ int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) { void **elem, *ptr; int ret = 0; if (!map->ops->map_fd_sys_lookup_elem) return -ENOTSUPP; rcu_read_lock(); elem = array_map_lookup_elem(map, key); if (elem && (ptr = READ_ONCE(*elem))) *value = map->ops->map_fd_sys_lookup_elem(ptr); else ret = -ENOENT; rcu_read_unlock(); return ret; } /* only called from syscall */ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) return -EINVAL; if (index >= array->map.max_entries) return -E2BIG; ufd = *(u32 *)value; new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); if (map->ops->map_poke_run) { mutex_lock(&array->aux->poke_mutex); old_ptr = xchg(array->ptrs + index, new_ptr); map->ops->map_poke_run(map, index, old_ptr, new_ptr); mutex_unlock(&array->aux->poke_mutex); } else { old_ptr = xchg(array->ptrs + index, new_ptr); } if (old_ptr) map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; u32 index = *(u32 *)key; if (index >= array->map.max_entries) return -E2BIG; if (map->ops->map_poke_run) { mutex_lock(&array->aux->poke_mutex); old_ptr = xchg(array->ptrs + index, NULL); map->ops->map_poke_run(map, index, old_ptr, NULL); mutex_unlock(&array->aux->poke_mutex); } else { old_ptr = xchg(array->ptrs + index, NULL); } if (old_ptr) { map->ops->map_fd_put_ptr(map, old_ptr, need_defer); return 0; } else { return -ENOENT; } } static long fd_array_map_delete_elem(struct bpf_map *map, void *key) { return __fd_array_map_delete_elem(map, key, true); } static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_prog *prog = bpf_prog_get(fd); if (IS_ERR(prog)) return prog; if (!bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } return prog; } static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(ptr); } static u32 prog_fd_array_sys_lookup_elem(void *ptr) { return ((struct bpf_prog *)ptr)->aux->id; } /* decrement refcnt of all bpf_progs that are stored in this map */ static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) __fd_array_map_delete_elem(map, &i, need_defer); } static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void **elem, *ptr; u32 prog_id; rcu_read_lock(); elem = array_map_lookup_elem(map, key); if (elem) { ptr = READ_ONCE(*elem); if (ptr) { seq_printf(m, "%u: ", *(u32 *)key); prog_id = prog_fd_array_sys_lookup_elem(ptr); btf_type_seq_show(map->btf, map->btf_value_type_id, &prog_id, m); seq_puts(m, "\n"); } } rcu_read_unlock(); } struct prog_poke_elem { struct list_head list; struct bpf_prog_aux *aux; }; static int prog_array_map_poke_track(struct bpf_map *map, struct bpf_prog_aux *prog_aux) { struct prog_poke_elem *elem; struct bpf_array_aux *aux; int ret = 0; aux = container_of(map, struct bpf_array, map)->aux; mutex_lock(&aux->poke_mutex); list_for_each_entry(elem, &aux->poke_progs, list) { if (elem->aux == prog_aux) goto out; } elem = kmalloc(sizeof(*elem), GFP_KERNEL); if (!elem) { ret = -ENOMEM; goto out; } INIT_LIST_HEAD(&elem->list); /* We must track the program's aux info at this point in time * since the program pointer itself may not be stable yet, see * also comment in prog_array_map_poke_run(). */ elem->aux = prog_aux; list_add_tail(&elem->list, &aux->poke_progs); out: mutex_unlock(&aux->poke_mutex); return ret; } static void prog_array_map_poke_untrack(struct bpf_map *map, struct bpf_prog_aux *prog_aux) { struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; mutex_lock(&aux->poke_mutex); list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { if (elem->aux == prog_aux) { list_del_init(&elem->list); kfree(elem); break; } } mutex_unlock(&aux->poke_mutex); } void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old) { WARN_ON_ONCE(1); } static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { struct prog_poke_elem *elem; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; /* Few things to be aware of: * * 1) We can only ever access aux in this context, but * not aux->prog since it might not be stable yet and * there could be danger of use after free otherwise. * 2) Initially when we start tracking aux, the program * is not JITed yet and also does not have a kallsyms * entry. We skip these as poke->tailcall_target_stable * is not active yet. The JIT will do the final fixup * before setting it stable. The various * poke->tailcall_target_stable are successively * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; if (poke->tail_call.map != map || poke->tail_call.key != key) continue; bpf_arch_poke_desc_update(poke, new, old); } } } static void prog_array_map_clear_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_array_aux, work)->map; bpf_fd_array_map_clear(map, true); bpf_map_put(map); } static void prog_array_map_clear(struct bpf_map *map) { struct bpf_array_aux *aux = container_of(map, struct bpf_array, map)->aux; bpf_map_inc(map); schedule_work(&aux->work); } static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) { struct bpf_array_aux *aux; struct bpf_map *map; aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT); if (!aux) return ERR_PTR(-ENOMEM); INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); map = array_map_alloc(attr); if (IS_ERR(map)) { kfree(aux); return map; } container_of(map, struct bpf_array, map)->aux = aux; aux->map = map; return map; } static void prog_array_map_free(struct bpf_map *map) { struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { list_del_init(&elem->list); kfree(elem); } kfree(aux); fd_array_map_free(map); } /* prog_array->aux->{type,jited} is a runtime binding. * Doing static check alone in the verifier is not enough. * Thus, prog_array_map cannot be used as an inner_map * and map_meta_equal is not implemented. */ const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, .map_free = prog_array_map_free, .map_poke_track = prog_array_map_poke_track, .map_poke_untrack = prog_array_map_poke_untrack, .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, struct file *map_file) { struct bpf_event_entry *ee; ee = kzalloc(sizeof(*ee), GFP_KERNEL); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; ee->map_file = map_file; } return ee; } static void __bpf_event_entry_free(struct rcu_head *rcu) { struct bpf_event_entry *ee; ee = container_of(rcu, struct bpf_event_entry, rcu); fput(ee->perf_file); kfree(ee); } static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) { call_rcu(&ee->rcu, __bpf_event_entry_free); } static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_event_entry *ee; struct perf_event *event; struct file *perf_file; u64 value; perf_file = perf_event_get(fd); if (IS_ERR(perf_file)) return perf_file; ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); if (ee) return ee; ee = ERR_PTR(-ENOMEM); err_out: fput(perf_file); return ee; } static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } static void perf_event_fd_array_release(struct bpf_map *map, struct file *map_file) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_event_entry *ee; int i; if (map->map_flags & BPF_F_PRESERVE_ELEMS) return; rcu_read_lock(); for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) __fd_array_map_delete_elem(map, &i, true); } rcu_read_unlock(); } static void perf_event_fd_array_map_free(struct bpf_map *map) { if (map->map_flags & BPF_F_PRESERVE_ELEMS) bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = perf_event_fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; #ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int fd) { return cgroup_get_from_fd(fd); } static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); } static void cgroup_fd_array_free(struct bpf_map *map) { bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } const struct bpf_map_ops cgroup_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; #endif static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) { struct bpf_map *map, *inner_map_meta; inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); if (IS_ERR(inner_map_meta)) return inner_map_meta; map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; } map->inner_map_meta = inner_map_meta; return map; } static void array_of_map_free(struct bpf_map *map) { /* map->inner_map_meta is only accessed by syscall which * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_map **inner_map = array_map_lookup_elem(map, key); if (!inner_map) return NULL; return READ_ONCE(*inner_map); } static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = array->elem_size; struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], };
5146 506 142 477 9 431 47 4 45 47 46 467 9 468 501 501 146 1 1 472 231 5 61 501 23 35 29 30 225 82 3 123 9 1 459 446 10 9 1 9 9 449 394 357 359 11 11 11 348 303 56 13 125 125 124 69 66 8 11 19 19 19 35 36 26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 // SPDX-License-Identifier: GPL-2.0-only /* * mm/readahead.c - address_space-level file readahead. * * Copyright (C) 2002, Linus Torvalds * * 09Apr2002 Andrew Morton * Initial version. */ /** * DOC: Readahead Overview * * Readahead is used to read content into the page cache before it is * explicitly requested by the application. Readahead only ever * attempts to read folios that are not yet in the page cache. If a * folio is present but not up-to-date, readahead will not try to read * it. In that case a simple ->read_folio() will be requested. * * Readahead is triggered when an application read request (whether a * system call or a page fault) finds that the requested folio is not in * the page cache, or that it is in the page cache and has the * readahead flag set. This flag indicates that the folio was read * as part of a previous readahead request and now that it has been * accessed, it is time for the next readahead. * * Each readahead request is partly synchronous read, and partly async * readahead. This is reflected in the struct file_ra_state which * contains ->size being the total number of pages, and ->async_size * which is the number of pages in the async section. The readahead * flag will be set on the first folio in this async section to trigger * a subsequent readahead. Once a series of sequential reads has been * established, there should be no need for a synchronous component and * all readahead request will be fully asynchronous. * * When either of the triggers causes a readahead, three numbers need * to be determined: the start of the region to read, the size of the * region, and the size of the async tail. * * The start of the region is simply the first page address at or after * the accessed address, which is not currently populated in the page * cache. This is found with a simple search in the page cache. * * The size of the async tail is determined by subtracting the size that * was explicitly requested from the determined request size, unless * this would be less than zero - then zero is used. NOTE THIS * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY. * * The size of the region is normally determined from the size of the * previous readahead which loaded the preceding pages. This may be * discovered from the struct file_ra_state for simple sequential reads, * or from examining the state of the page cache when multiple * sequential reads are interleaved. Specifically: where the readahead * was triggered by the readahead flag, the size of the previous * readahead is assumed to be the number of pages from the triggering * page to the start of the new readahead. In these cases, the size of * the previous readahead is scaled, often doubled, for the new * readahead, though see get_next_ra_size() for details. * * If the size of the previous read cannot be determined, the number of * preceding pages in the page cache is used to estimate the size of * a previous read. This estimate could easily be misled by random * reads being coincidentally adjacent, so it is ignored unless it is * larger than the current request, and it is not scaled up, unless it * is at the start of file. * * In general readahead is accelerated at the start of the file, as * reads from there are often sequential. There are other minor * adjustments to the readahead size in various special cases and these * are best discovered by reading the code. * * The above calculation, based on the previous readahead size, * determines the size of the readahead, to which any requested read * size may be added. * * Readahead requests are sent to the filesystem using the ->readahead() * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * * ->readahead() will generally call readahead_folio() repeatedly to get * each folio from those prepared for readahead. It may fail to read a * folio by: * * * not calling readahead_folio() sufficiently many times, effectively * ignoring some folios, as might be appropriate if the path to * storage is congested. * * * failing to actually submit a read request for a given folio, * possibly due to insufficient resources, or * * * getting an error during subsequent processing of a request. * * In the last two cases, the folio should be unlocked by the filesystem * to indicate that the read attempt has failed. In the first case the * folio will be unlocked by the VFS. * * Those folios not in the final ``async_size`` of the request should be * considered to be important and ->readahead() should not fail them due * to congestion or temporary resource unavailability, but should wait * for necessary resources (e.g. memory or indexing information) to * become available. Folios in the final ``async_size`` may be * considered less urgent and failure to read them is more acceptable. * In this case it is best to use filemap_remove_folio() to remove the * folios from the page cache as is automatically done for folios that * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using * ->read_folio() which may be less efficient. */ #include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagemap.h> #include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> #include "internal.h" /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. */ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct folio *folio; struct blk_plug plug; if (!readahead_count(rac)) return; if (unlikely(rac->_workingset)) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { aops->readahead(rac); /* * Clean up the remaining folios. The sizes in ->ra * may be used to size the next readahead, so make sure * they accurately reflect what happened. */ while ((folio = readahead_folio(rac)) != NULL) { unsigned long nr = folio_nr_pages(folio); folio_get(folio); rac->ra->size -= nr; if (rac->ra->async_size >= nr) { rac->ra->async_size -= nr; filemap_remove_folio(folio); } folio_unlock(folio); folio_put(folio); } } else { while ((folio = readahead_folio(rac)) != NULL) aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); if (unlikely(rac->_workingset)) psi_memstall_leave(&rac->_pflags); rac->_workingset = false; BUG_ON(readahead_count(rac)); } /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * * This function is for filesystems to call when they want to start * readahead beyond a file's stated i_size. This is almost certainly * not the function you want to call. Use page_cache_async_readahead() * or page_cache_sync_readahead() instead. * * Context: File is referenced by caller. Mutexes may be held by caller. * May sleep, but will not reenter filesystem to reclaim memory. */ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long i; /* * Partway through the readahead operation, we will have added * locked pages to the page cache, but will not yet have submitted * them for I/O. Adding another page may need to allocate memory, * which can trigger memory reclaim. Telling the VM we're in * the middle of a filesystem operation will cause it to not * touch file-backed pages, preventing a deadlock. Most (all?) * filesystems already specify __GFP_NOFS in their mapping's * gfp_mask, but let's be explicit here. */ unsigned int nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); /* * Preallocate as many pages as we will need. */ for (i = 0; i < nr_to_read; i++) { struct folio *folio = xa_load(&mapping->i_pages, index + i); int ret; if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the * next batch. This page may be the one we would * have intended to mark as Readahead, but we don't * have a stable reference to this page, and it's * not worth getting one just for that. */ read_pages(ractl); ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; continue; } folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) break; ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); if (ret < 0) { folio_put(folio); if (ret == -ENOMEM) break; read_pages(ractl); ractl->_index++; i = ractl->_index + ractl->_nr_pages - index - 1; continue; } if (i == nr_to_read - lookahead_size) folio_set_readahead(folio); ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages++; } /* * Now start the IO. We ignore I/O errors - if the folio is not * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ if (isize == 0) return; end_index = (isize - 1) >> PAGE_SHIFT; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages; if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; do_page_cache_ra(ractl, this_chunk, 0); nr_to_read -= this_chunk; } } /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); if (newsize <= max / 32) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; else newsize = max; return newsize; } /* * Get the previous window size, ramp it up, and * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, unsigned long max) { unsigned long cur = ra->size; if (cur < max / 16) return 4 * cur; if (cur <= max / 2) return 2 * cur; return max; } /* * On-demand readahead design. * * The fields in struct file_ra_state represent the most-recently-executed * readahead attempt: * * |<----- async_size ---------| * |------------------- size -------------------->| * |==================#===========================| * ^start ^page marked with PG_readahead * * To overlap application thinking time and disk I/O time, we do * `readahead pipelining': Do not wait until the application consumed all * readahead pages and stalled on the missing page at readahead_index; * Instead, submit an asynchronous readahead I/O as soon as there are * only async_size pages left in the readahead window. Normally async_size * will be equal to size, for maximum pipelining. * * In interleaved sequential reads, concurrent streams on the same fd can * be invalidating each other's readahead state. So we flag the new readahead * page at (start+size-async_size) with PG_readahead, and use it as readahead * indicator. The flag won't be set on already cached pages, to avoid the * readahead-for-nothing fuss, saving pointless page cache lookups. * * prev_pos tracks the last visited byte in the _previous_ read request. * It should be maintained by the caller, and will be used for detecting * small random reads. Note that the readahead algorithm checks loosely * for sequential patterns. Hence interleaved reads might be served as * sequential ones. * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear * read is about to happen and the window is immediately set to the initial size * based on I/O request size and the max_readahead. * * The code ramps up the readahead size aggressively at first, but slow down as * it approaches max_readhead. */ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; struct folio *folio = filemap_alloc_folio(gfp, order); if (!folio) return -ENOMEM; mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) { folio_put(folio); return err; } ractl->_nr_pages += 1UL << order; ractl->_workingset |= folio_test_workingset(folio); return 0; } void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; pgoff_t start = readahead_index(ractl); pgoff_t index = start; pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); if (!mapping_large_folio_support(mapping) || ra->size < 4) goto fallback; limit = min(limit, index + ra->size - 1); if (new_order < MAX_PAGECACHE_ORDER) new_order += 2; new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); while (index <= limit) { unsigned int order = new_order; /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) order = __ffs(index); /* Don't allocate pages past EOF */ while (index + (1UL << order) - 1 > limit) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; index += 1UL << order; } read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this * situation. */ if (!err) return; fallback: do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); unsigned long max_pages = ractl->ra->ra_pages; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ if (req_size > max_pages && bdi->io_pages > max_pages) max_pages = min(req_size, bdi->io_pages); return max_pages; } void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { pgoff_t index = readahead_index(ractl); bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); struct file_ra_state *ra = ractl->ra; unsigned long max_pages, contig_count; pgoff_t prev_index, miss; /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced * readahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ if (!ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; do_forced_ra = true; } /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); return; } max_pages = ractl_max_pages(ractl, req_count); prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; /* * A start of file, oversized read, or sequential cache miss: * trivial case: (index - prev_index) == 1 * unaligned reads: (index - prev_index) == 0 */ if (!index || req_count > max_pages || index - prev_index <= 1UL) { ra->start = index; ra->size = get_init_ra_size(req_count, max_pages); ra->async_size = ra->size > req_count ? ra->size - req_count : ra->size >> 1; goto readit; } /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ rcu_read_lock(); miss = page_cache_prev_miss(ractl->mapping, index - 1, max_pages); rcu_read_unlock(); contig_count = index - miss - 1; /* * Standalone, small random read. Read as is, and do not pollute the * readahead state. */ if (contig_count <= req_count) { do_page_cache_ra(ractl, req_count, 0); return; } /* * File cached from the beginning: * it is a strong indication of long-run stream (or whole-file-read) */ if (miss == ULONG_MAX) contig_count *= 2; ra->start = index; ra->size = min(contig_count + req_count, max_pages); ra->async_size = 1; readit: ractl->_index = ra->start; page_cache_ra_order(ractl, ra, 0); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, struct folio *folio, unsigned long req_count) { unsigned long max_pages; struct file_ra_state *ra = ractl->ra; pgoff_t index = readahead_index(ractl); pgoff_t expected, start; unsigned int order = folio_order(folio); /* no readahead */ if (!ra->ra_pages) return; /* * Same bit is used for PG_readahead and PG_reclaim. */ if (folio_test_writeback(folio)) return; folio_clear_readahead(folio); if (blk_cgroup_congested()) return; max_pages = ractl_max_pages(ractl, req_count); /* * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, 1UL << order); if (index == expected) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } /* * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ rcu_read_lock(); start = page_cache_next_miss(ractl->mapping, index + 1, max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) return; ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_count; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; readit: ractl->_index = ra->start; page_cache_ra_order(ractl, ra, order); } EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct fd f; ret = -EBADF; f = fdget(fd); if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. */ ret = -EINVAL; if (!f.file->f_mapping || !f.file->f_mapping->a_ops || (!S_ISREG(file_inode(f.file)->i_mode) && !S_ISBLK(file_inode(f.file)->i_mode))) goto out; ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); out: fdput(f); return ret; } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD) COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count) { return ksys_readahead(fd, compat_arg_u64_glue(offset), count); } #endif /** * readahead_expand - Expand a readahead request * @ractl: The request to be expanded * @new_start: The revised start * @new_len: The revised size of the request * * Attempt to expand a readahead request outwards from the current size to the * specified size by inserting locked pages before and after the current window * to increase the size to the new window. This may involve the insertion of * THPs, in which case the window may get expanded even beyond what was * requested. * * The algorithm will stop if it encounters a conflicting page already in the * pagecache and leave a smaller expansion than requested. * * The caller must check for this by examining the revised @ractl object for a * different expansion than was requested. */ void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; pgoff_t new_index, new_nr_pages; gfp_t gfp_mask = readahead_gfp_mask(mapping); new_index = new_start / PAGE_SIZE; /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) return; if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = filemap_alloc_folio(gfp_mask, 0); if (!folio) return; if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages++; if (ra) { ra->size++; ra->async_size++; } } } EXPORT_SYMBOL(readahead_expand);
2 304 158 175 73 5 57 1 2 9 58 113 27 27 61 1 99 115 55 103 113 55 24 95 49 9 49 49 49 49 26 3 29 96 95 26 96 1 26 95 2 292 292 292 161 172 32 87 112 2 113 2 115 115 115 28 2 94 240 41 291 61 292 205 75 76 16 54 54 53 37 15 16 54 9 75 30 46 17 64 64 41 41 114 113 75 44 40 60 64 41 82 81 49 51 32 2 34 34 30 15 41 131 131 94 17 11 10 36 21 36 74 22 90 13 87 20 24 24 23 6 9 6 6 4 11 5 1 6 4 4 3 1 2 1 2 19 5 19 11 6 3 5 2 3 4 1 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/indirect.c * * from * * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Goal-directed block allocation by Stephen Tweedie * (sct@redhat.com), 1993, 1998 */ #include "ext4_jbd2.h" #include "truncate.h" #include <linux/dax.h> #include <linux/uio.h> #include <trace/events/ext4.h> typedef struct { __le32 *p; __le32 key; struct buffer_head *bh; } Indirect; static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) { p->key = *(p->p = v); p->bh = bh; } /** * ext4_block_to_path - parse the block number into array of offsets * @inode: inode in question (we are only interested in its superblock) * @i_block: block number to be parsed * @offsets: array to store the offsets in * @boundary: set this non-zero if the referred-to block is likely to be * followed (on disk) by an indirect block. * * To store the locations of file's data ext4 uses a data structure common * for UNIX filesystems - tree of pointers anchored in the inode, with * data blocks at leaves and indirect blocks in intermediate nodes. * This function translates the block number into path in that tree - * return value is the path length and @offsets[n] is the offset of * pointer to (n+1)th node in the nth one. If @block is out of range * (negative or too large) warning is printed and zero returned. * * Note: function doesn't find node addresses, so no IO is needed. All * we need to know is the capacity of indirect blocks (taken from the * inode->i_sb). */ /* * Portability note: the last comparison (check that we fit into triple * indirect block) is spelled differently, because otherwise on an * architecture with 32-bit longs and 8Kb pages we might get into trouble * if our filesystem had 8Kb blocks. We might use long long, but that would * kill us on x86. Oh, well, at least the sign propagation does not matter - * i_block would have to be negative in the very beginning, so we would not * get there at all. */ static int ext4_block_to_path(struct inode *inode, ext4_lblk_t i_block, ext4_lblk_t offsets[4], int *boundary) { int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); const long direct_blocks = EXT4_NDIR_BLOCKS, indirect_blocks = ptrs, double_blocks = (1 << (ptrs_bits * 2)); int n = 0; int final = 0; if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; } else if ((i_block -= direct_blocks) < indirect_blocks) { offsets[n++] = EXT4_IND_BLOCK; offsets[n++] = i_block; final = ptrs; } else if ((i_block -= indirect_blocks) < double_blocks) { offsets[n++] = EXT4_DIND_BLOCK; offsets[n++] = i_block >> ptrs_bits; offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { offsets[n++] = EXT4_TIND_BLOCK; offsets[n++] = i_block >> (ptrs_bits * 2); offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { ext4_warning(inode->i_sb, "block %lu > max in inode %lu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); return n; } /** * ext4_get_branch - read the chain of indirect blocks leading to data * @inode: inode in question * @depth: depth of the chain (1 - direct pointer, etc.) * @offsets: offsets of pointers in inode/indirect blocks * @chain: place to store the result * @err: here we store the error value * * Function fills the array of triples <key, p, bh> and returns %NULL * if everything went OK or the pointer to the last filled triple * (incomplete one) otherwise. Upon the return chain[i].key contains * the number of (i+1)-th block in the chain (as it is stored in memory, * i.e. little-endian 32-bit), chain[i].p contains the address of that * number (it points into struct inode for i==0 and into the bh->b_data * for i>0) and chain[i].bh points to the buffer_head of i-th indirect * block for i>0 and NULL for i==0. In other words, it holds the block * numbers of the chain, addresses they were taken from (and where we can * verify that chain did not change) and buffer_heads hosting these * numbers. * * Function stops when it stumbles upon zero pointer (absent block) * (pointer to last triple returned, *@err == 0) * or when it gets an IO error reading an indirect block * (ditto, *@err == -EIO) * or when it reads all @depth-1 indirect blocks successfully and finds * the whole chain, all way to the data (returns %NULL, *err == 0). * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) */ static Indirect *ext4_get_branch(struct inode *inode, int depth, ext4_lblk_t *offsets, Indirect chain[4], int *err) { struct super_block *sb = inode->i_sb; Indirect *p = chain; struct buffer_head *bh; unsigned int key; int ret = -EIO; *err = 0; /* i_data is not going away, no lock needed */ add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); if (!p->key) goto no_block; while (--depth) { key = le32_to_cpu(p->key); if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) { /* the block was out of range */ ret = -EFSCORRUPTED; goto failure; } bh = sb_getblk(sb, key); if (unlikely(!bh)) { ret = -ENOMEM; goto failure; } if (!bh_uptodate_or_lock(bh)) { if (ext4_read_bh(bh, 0, NULL) < 0) { put_bh(bh); goto failure; } /* validate block references */ if (ext4_check_indirect_blockref(inode, bh)) { put_bh(bh); goto failure; } } add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) goto no_block; } return NULL; failure: *err = ret; no_block: return p; } /** * ext4_find_near - find a place for allocation with sufficient locality * @inode: owner * @ind: descriptor of indirect block. * * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same * cylinder group. * * In the latter case we colour the starting block by the callers PID to * prevent it from clashing with concurrent allocations for a different inode * in the same block group. The PID is used here so that functionally related * files will be close-by on-disk. * * Caller must make sure that @ind is valid and will stay that way. */ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) { if (*p) return le32_to_cpu(*p); } /* No such thing, so let's try location of indirect block */ if (ind->bh) return ind->bh->b_blocknr; /* * It is going to be referred to from the inode itself? OK, just put it * into the same cylinder group then. */ return ext4_inode_to_goal_block(inode); } /** * ext4_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * * Normally this function find the preferred place for block allocation, * returns it. * Because this is only used for non-extent files, we limit the block nr * to 32 bits. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { ext4_fsblk_t goal; /* * XXX need to get goal block from mballoc's data structures */ goal = ext4_find_near(inode, partial); goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; return goal; } /** * ext4_blks_to_allocate - Look up the block map and count the number * of direct blocks need to be allocated for the given branch. * * @branch: chain of indirect blocks * @k: number of blocks need for indirect blocks * @blks: number of data blocks to be mapped. * @blocks_to_boundary: the offset in the indirect block * * return the total number of blocks to be allocate, including the * direct and indirect blocks. */ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, int blocks_to_boundary) { unsigned int count = 0; /* * Simple case, [t,d]Indirect block(s) has not allocated yet * then it's clear blocks on that path have not allocated */ if (k > 0) { /* right now we don't handle cross boundary allocation */ if (blks < blocks_to_boundary + 1) count += blks; else count += blocks_to_boundary + 1; return count; } count++; while (count < blks && count <= blocks_to_boundary && le32_to_cpu(*(branch[0].p + count)) == 0) { count++; } return count; } /** * ext4_alloc_branch() - allocate and set up a chain of blocks * @handle: handle for this transaction * @ar: structure describing the allocation request * @indirect_blks: number of allocated indirect blocks * @offsets: offsets (in the blocks) to store the pointers to next. * @branch: place to store the chain in. * * This function allocates blocks, zeroes out all but the last one, * links them into chain and (if we are synchronous) writes them to disk. * In other words, it prepares a branch that can be spliced onto the * inode. It stores the information about that chain in the branch[], in * the same format as ext4_get_branch() would do. We are calling it after * we had read the existing part of chain and partial points to the last * triple of that (one with zero ->key). Upon the exit we have the same * picture as after the successful ext4_get_block(), except that in one * place chain is disconnected - *branch->p is still zero (we did not * set the last link), but branch->key contains the number that should * be placed into *branch->p to fill that gap. * * If allocation fails we free all blocks we've allocated (and forget * their buffer_heads) and return the error value the from failed * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct ext4_allocation_request *ar, int indirect_blks, ext4_lblk_t *offsets, Indirect *branch) { struct buffer_head * bh; ext4_fsblk_t b, new_blocks[4]; __le32 *p; int i, j, err, len = 1; for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); } else { ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, ar->inode, ar->goal, ar->flags & EXT4_MB_DELALLOC_RESERVED, NULL, &err); /* Simplify error cleanup... */ branch[i+1].bh = NULL; } if (err) { i--; goto failed; } branch[i].key = cpu_to_le32(new_blocks[i]); if (i == 0) continue; bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, ar->inode->i_sb, bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto failed; } memset(bh->b_data, 0, bh->b_size); p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; b = new_blocks[i]; if (i == indirect_blks) len = ar->len; for (j = 0; j < len; j++) *p++ = cpu_to_le32(b++); BUFFER_TRACE(bh, "marking uptodate"); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, bh); if (err) goto failed; } return 0; failed: if (i == indirect_blks) { /* Free data blocks */ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], ar->len, 0); i--; } for (; i >= 0; i--) { /* * We want to ext4_forget() only freshly allocated indirect * blocks. Buffer for new_blocks[i] is at branch[i+1].bh * (buffer at branch[0].bh is indirect block / inode already * existing before ext4_alloc_branch() was called). Also * because blocks are freshly allocated, we don't need to * revoke them which is why we don't set * EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, branch[i+1].bh, new_blocks[i], 1, branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0); } return err; } /** * ext4_splice_branch() - splice the allocated branch onto inode. * @handle: handle for this transaction * @ar: structure describing the allocation request * @where: location of missing link * @num: number of indirect blocks we are adding * * This function fills the missing link and does all housekeeping needed in * inode (->i_blocks, etc.). In case of success we end up with the full * chain to new block and return 0. */ static int ext4_splice_branch(handle_t *handle, struct ext4_allocation_request *ar, Indirect *where, int num) { int i; int err = 0; ext4_fsblk_t current_block; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block * before the splice. */ if (where->bh) { BUFFER_TRACE(where->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, ar->inode->i_sb, where->bh, EXT4_JTR_NONE); if (err) goto err_out; } /* That's it */ *where->p = where->key; /* * Update the host buffer_head or inode to point to more just allocated * direct blocks blocks */ if (num == 0 && ar->len > 1) { current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < ar->len; i++) *(where->p + i) = cpu_to_le32(current_block++); } /* We are done with atomic stuff, now do the rest of housekeeping */ /* had we spliced it onto indirect block? */ if (where->bh) { /* * If we spliced it onto an indirect block, we haven't * altered the inode. Note however that if it is being spliced * onto an indirect block at the very end of the file (the * file is growing) then we *will* alter the inode to reflect * the new i_size. But that is not done here - it is done in * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ ext4_debug("splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) goto err_out; } else { /* * OK, we spliced it into the inode itself on a direct block. */ err = ext4_mark_inode_dirty(handle, ar->inode); if (unlikely(err)) goto err_out; ext4_debug("splicing direct\n"); } return err; err_out: for (i = 1; i <= num; i++) { /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), ar->len, 0); return err; } /* * The ext4_ind_map_blocks() function handles non-extents inodes * (i.e., using the traditional indirect/double-indirect i_blocks * scheme) for ext4_map_blocks(). * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is * required, recheck the path, free and repeat if check fails, otherwise * set the last missing link (that will protect us from any truncate-generated * removals - all blocks on the path are immune now) and possibly force the * write on the parent block. * That has a nice additional property: no special recovery from the failed * allocations is needed - we simply release blocks and do not touch anything * reachable from inode. * * `handle' can be NULL if create == 0. * * return > 0, # of blocks mapped or allocated. * return = 0, if plain lookup failed. * return < 0, error case. * * The ext4_ind_get_blocks() function should be called with * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system * blocks. */ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_allocation_request ar; int err = -EIO; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; int indirect_blks; int blocks_to_boundary = 0; int depth; int count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, &blocks_to_boundary); if (depth == 0) goto out; partial = ext4_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); count++; /*map more blocks*/ while (count < map->m_len && count <= blocks_to_boundary) { ext4_fsblk_t blk; blk = le32_to_cpu(*(chain[depth-1].p + count)); if (blk == first_block + count) count++; else break; } goto got_it; } /* Next simple case - plain lookup failed */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); int i; /* * Count number blocks in a subtree under 'partial'. At each * level we count number of complete empty subtrees beyond * current offset and then descend into the subtree only * partially beyond current offset. */ count = 0; for (i = partial - chain + 1; i < depth; i++) count = count * epb + (epb - offsets[i] - 1); count++; /* Fill in size of a hole we found */ map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, count); goto cleanup; } /* Failed read of indirect block */ if (err == -EIO) goto cleanup; /* * Okay, we need to do block allocation. */ if (ext4_has_feature_bigalloc(inode->i_sb)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); err = -EFSCORRUPTED; goto out; } /* Set up for the direct block allocation */ memset(&ar, 0, sizeof(ar)); ar.inode = inode; ar.logical = map->m_lblk; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; /* * Next look up the indirect map to count the totoal number of * direct blocks to allocate for this branch. */ ar.len = ext4_blks_to_allocate(partial, indirect_blks, map->m_len, blocks_to_boundary); /* * Block out ext4_truncate while we alter the tree */ err = ext4_alloc_branch(handle, &ar, indirect_blks, offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers * on the new chain if there is a failure, but that risks using * up transaction credits, especially for bitmaps where the * credits cannot be returned. Can we handle this somehow? We * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) err = ext4_splice_branch(handle, &ar, partial, indirect_blks); if (err) goto cleanup; map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); count = ar.len; /* * Update reserved blocks/metadata blocks after successful block * allocation which had been deferred till now. */ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ext4_da_update_reserve_space(inode, count, 1); got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); map->m_len = count; if (count > blocks_to_boundary) map->m_flags |= EXT4_MAP_BOUNDARY; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ cleanup: while (partial > chain) { BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } out: trace_ext4_ind_map_blocks_exit(inode, flags, map, err); return err; } /* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks */ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) { /* * With N contiguous data blocks, we need at most * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, * 2 dindirect blocks, and 1 tindirect block */ return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, int *dropped) { int err; if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) return err; } err = ext4_mark_inode_dirty(handle, inode); if (unlikely(err)) return err; /* * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this * moment, get_block can be called only for blocks inside i_size since * page cache has been already dropped and writes are blocked by * i_rwsem. So we can safely drop the i_data_sem here. */ BUG_ON(EXT4_JOURNAL(inode) == NULL); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); *dropped = 1; return 0; } /* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a convenient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If * extend fails, we restart transaction. */ static int ext4_ind_truncate_ensure_credits(handle_t *handle, struct inode *inode, struct buffer_head *bh, int revoke_creds) { int ret; int dropped = 0; ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_blocks_for_truncate(inode), revoke_creds, ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped)); if (dropped) down_write(&EXT4_I(inode)->i_data_sem); if (ret <= 0) return ret; if (bh) { BUFFER_TRACE(bh, "retaking write access"); ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(ret)) return ret; } return 0; } /* * Probably it should be a library function... search for first non-zero word * or memcmp with zero_page, whatever is better for particular architecture. * Linus? */ static inline int all_zeroes(__le32 *p, __le32 *q) { while (p < q) if (*p++) return 0; return 1; } /** * ext4_find_shared - find the indirect blocks for partial truncation. * @inode: inode in question * @depth: depth of the affected branch * @offsets: offsets of pointers in that branch (see ext4_block_to_path) * @chain: place to store the pointers to partial indirect blocks * @top: place to the (detached) top of branch * * This is a helper function used by ext4_truncate(). * * When we do truncate() we may have to clean the ends of several * indirect blocks but leave the blocks themselves alive. Block is * partially truncated if some data below the new i_size is referred * from it (and it is on the path to the first completely truncated * data block, indeed). We have to free the top of that path along * with everything to the right of the path. Since no allocation * past the truncation point is possible until ext4_truncate() * finishes, we may safely do the latter, but top of branch may * require special attention - pageout below the truncation point * might try to populate it. * * We atomically detach the top of branch from the tree, store the * block number of its root in *@top, pointers to buffer_heads of * partially truncated blocks - in @chain[].bh and pointers to * their last elements that should not be removed - in * @chain[].p. Return value is the pointer to last filled element * of @chain. * * The work left to caller to do the actual freeing of subtrees: * a) free the subtree starting from *@top * b) free the subtrees whose roots are stored in * (@chain[i].p+1 .. end of @chain[i].bh->b_data) * c) free the subtrees growing from the inode past the @chain[0]. * (no partially truncated stuff there). */ static Indirect *ext4_find_shared(struct inode *inode, int depth, ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) { Indirect *partial, *p; int k, err; *top = 0; /* Make k index the deepest non-null offset + 1 */ for (k = depth; k > 1 && !offsets[k-1]; k--) ; partial = ext4_get_branch(inode, k, offsets, chain, &err); /* Writer: pointers */ if (!partial) partial = chain + k-1; /* * If the branch acquired continuation since we've looked at it - * fine, it should all survive and (new) top doesn't belong to us. */ if (!partial->key && *partial->p) /* Writer: end */ goto no_top; for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) ; /* * OK, we've found the last block that must survive. The rest of our * branch should be detached before unlocking. However, if that rest * of branch is all ours and does not grow immediately from the inode * it's easier to cheat and just decrement partial->p. */ if (p == chain + k - 1 && p > chain) { p->p--; } else { *top = *p->p; /* Nope, don't do this in ext4. Must leave the tree intact */ #if 0 *p->p = 0; #endif } /* Writer: end */ while (partial > p) { brelse(partial->bh); partial--; } no_top: return partial; } /* * Zero a number of block pointers in either an inode or an indirect block. * If we restart the transaction we must again get write access to the * indirect block for further modification. * * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. * * Return 0 on success, 1 on invalid block range * and < 0 on fatal error. */ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block_to_free, unsigned long count, __le32 *first, __le32 *last) { __le32 *p; int flags = EXT4_FREE_BLOCKS_VALIDATED; int err; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET; if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); return 1; } err = ext4_ind_truncate_ensure_credits(handle, inode, bh, ext4_free_data_revoke_credits(inode, count)); if (err < 0) goto out_err; for (p = first; p < last; p++) *p = 0; ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); return 0; out_err: ext4_std_error(inode->i_sb, err); return err; } /** * ext4_free_data - free a list of data blocks * @handle: handle for this transaction * @inode: inode we are dealing with * @this_bh: indirect buffer_head which contains *@first and *@last * @first: array of block numbers * @last: points immediately past the end of array * * We are freeing all blocks referred from that array (numbers are stored as * little-endian 32-bit) and updating @inode->i_blocks appropriately. * * We accumulate contiguous runs of blocks to free. Conveniently, if these * blocks are contiguous then releasing them at one time will only affect one * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't * actually use a lot of journal space. * * @this_bh will be %NULL if @first and @last point into the inode's direct * block pointers. */ static void ext4_free_data(handle_t *handle, struct inode *inode, struct buffer_head *this_bh, __le32 *first, __le32 *last) { ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ unsigned long count = 0; /* Number of blocks in the run */ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ ext4_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ int err = 0; if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, this_bh, EXT4_JTR_NONE); /* Important: if we can't update the indirect pointers * to the blocks, we can't free them. */ if (err) return; } for (p = first; p < last; p++) { nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ if (count == 0) { block_to_free = nr; block_to_free_p = p; count = 1; } else if (nr == block_to_free + count) { count++; } else { err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err) break; block_to_free = nr; block_to_free_p = p; count = 1; } } } if (!err && count > 0) err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); if (err < 0) /* fatal error */ return; if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); /* * The buffer head should have an attached journal head at this * point. However, if the data is corrupted and an indirect * block pointed to itself, it would have been detached when * the block was cleared. Check for this instead of OOPSing. */ if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else EXT4_ERROR_INODE(inode, "circular indirect block detected at " "block %llu", (unsigned long long) this_bh->b_blocknr); } } /** * ext4_free_branches - free an array of branches * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @parent_bh: the buffer_head which contains *@first and *@last * @first: array of block numbers * @last: pointer immediately past the end of array * @depth: depth of the branches to free * * We are freeing all blocks referred from these branches (numbers are * stored as little-endian 32-bit) and updating @inode->i_blocks * appropriately. */ static void ext4_free_branches(handle_t *handle, struct inode *inode, struct buffer_head *parent_bh, __le32 *first, __le32 *last, int depth) { ext4_fsblk_t nr; __le32 *p; if (ext4_handle_is_aborted(handle)) return; if (depth--) { struct buffer_head *bh; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); p = last; while (--p >= first) { nr = le32_to_cpu(*p); if (!nr) continue; /* A hole */ if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", (unsigned long) nr, depth); break; } /* Go read the buffer for the next level down */ bh = ext4_sb_bread(inode->i_sb, nr, 0); /* * A read failure? Report error and clear slot * (should be rare). */ if (IS_ERR(bh)) { ext4_error_inode_block(inode, nr, -PTR_ERR(bh), "Read failure"); continue; } /* This zaps the entire block. Bottom up. */ BUFFER_TRACE(bh, "free child branches"); ext4_free_branches(handle, inode, bh, (__le32 *) bh->b_data, (__le32 *) bh->b_data + addr_per_block, depth); brelse(bh); /* * Everything below this pointer has been * released. Now let this top-of-subtree go. * * We want the freeing of this indirect block to be * atomic in the journal with the updating of the * bitmap block which owns it. So make some room in * the journal. * * We zero the parent pointer *after* freeing its * pointee in the bitmaps, so if extend_transaction() * for some reason fails to put the bitmap changes and * the release into the same transaction, recovery * will merely complain about releasing a free block, * rather than leaking blocks. */ if (ext4_handle_is_aborted(handle)) return; if (ext4_ind_truncate_ensure_credits(handle, inode, NULL, ext4_free_metadata_revoke_credits( inode->i_sb, 1)) < 0) return; /* * The forget flag here is critical because if * we are journaling (and not doing data * journaling), we have to make sure a revoke * record is written to prevent the journal * replay from overwriting the (former) * indirect block if it gets reallocated as a * data block. This must happen in the same * transaction where the data blocks are * actually freed. */ ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA| EXT4_FREE_BLOCKS_FORGET); if (parent_bh) { /* * The block which we have just freed is * pointed to by an indirect block: journal it */ BUFFER_TRACE(parent_bh, "get_write_access"); if (!ext4_journal_get_write_access(handle, inode->i_sb, parent_bh, EXT4_JTR_NONE)) { *p = 0; BUFFER_TRACE(parent_bh, "call ext4_handle_dirty_metadata"); ext4_handle_dirty_metadata(handle, inode, parent_bh); } } } } else { /* We have reached the bottom of the tree. */ BUFFER_TRACE(parent_bh, "free data blocks"); ext4_free_data(handle, inode, parent_bh, first, last); } } void ext4_ind_truncate(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; __le32 nr = 0; int n = 0; ext4_lblk_t last_block, max_block; unsigned blocksize = inode->i_sb->s_blocksize; last_block = (inode->i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (last_block != max_block) { n = ext4_block_to_path(inode, last_block, offsets, NULL); if (n == 0) return; } ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate * the new, shorter inode size (held for now in i_size) into the * on-disk inode. We do this via i_disksize, which is the value which * ext4 *really* writes onto the disk inode. */ ei->i_disksize = inode->i_size; if (last_block == max_block) { /* * It is unnecessary to free any data blocks if last_block is * equal to the indirect block limit. */ return; } else if (n == 1) { /* direct blocks */ ext4_free_data(handle, inode, NULL, i_data+offsets[0], i_data + EXT4_NDIR_BLOCKS); goto do_indirects; } partial = ext4_find_shared(inode, n, offsets, chain, &nr); /* Kill the top of shared branch (not detached) */ if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; /* * We mark the inode dirty prior to restart, * and prior to stop. No need for it here. */ } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* Clear the ends of indirect blocks on the shared branch */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32*)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); partial--; } do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } } /** * ext4_ind_remove_space - remove space from the range * @handle: JBD handle for this transaction * @inode: inode we are dealing with * @start: First block to remove * @end: One block after the last block to remove (exclusive) * * Free the blocks in the defined range (end is exclusive endpoint of * range). This is used by ext4_punch_hole(). */ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_inode_info *ei = EXT4_I(inode); __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; unsigned blocksize = inode->i_sb->s_blocksize; max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); if (end >= max_block) end = max_block; if ((start >= end) || (start > max_block)) return 0; n = ext4_block_to_path(inode, start, offsets, NULL); n2 = ext4_block_to_path(inode, end, offsets2, NULL); BUG_ON(n > n2); if ((n == 1) && (n == n2)) { /* We're punching only within direct block range */ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + offsets2[0]); return 0; } else if (n2 > n) { /* * Start and end are on a different levels so we're going to * free partial block at start, and partial block at end of * the range. If there are some levels in between then * do_indirects label will take care of that. */ if (n == 1) { /* * Start is at the direct block level, free * everything to the end of the level. */ ext4_free_data(handle, inode, NULL, i_data + offsets[0], i_data + EXT4_NDIR_BLOCKS); goto end_range; } partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } /* * Clear the ends of indirect blocks on the shared branch * at the start of the range */ while (partial > chain) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } end_range: partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* * Remember, end is exclusive so here we're at * the start of the next level we're not going * to free. Everything was covered by the start * of the range. */ goto do_indirects; } } else { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } /* * Clear the ends of indirect blocks on the shared branch * at the end of the range */ while (partial2 > chain2) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. */ if (nr) { int level = min(partial - chain, partial2 - chain2); int i; int subtree = 1; for (i = 0; i <= level; i++) { if (offsets[i] != offsets2[i]) { subtree = 0; break; } } if (!subtree) { if (partial == chain) { /* Shared branch grows from the inode */ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, (chain+n-1) - partial); *partial->p = 0; } else { /* Shared branch grows from an indirect block */ BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, partial->p, partial->p+1, (chain+n-1) - partial); } } } if (!nr2) { /* * ext4_find_shared returns Indirect structure which * points to the last element which should not be * removed by truncate. But this is end of the range * in punch_hole so we need to point to the next element */ partial2->p++; } while (partial > chain || partial2 > chain2) { int depth = (chain+n-1) - partial; int depth2 = (chain2+n2-1) - partial2; if (partial > chain && partial2 > chain2 && partial->bh->b_blocknr == partial2->bh->b_blocknr) { /* * We've converged on the same block. Clear the range, * then we're done. */ ext4_free_branches(handle, inode, partial->bh, partial->p + 1, partial2->p, (chain+n-1) - partial); goto cleanup; } /* * The start and end partial branches may not be at the same * level even though the punch happened within one level. So, we * give them a chance to arrive at the same level, then walk * them in step with each other until we converge on the same * block. */ if (partial > chain && depth <= depth2) { ext4_free_branches(handle, inode, partial->bh, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); partial--; } if (partial2 > chain2 && depth2 <= depth) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); partial2--; } } cleanup: while (p && p > chain) { BUFFER_TRACE(p->bh, "call brelse"); brelse(p->bh); p--; } while (p2 && p2 > chain2) { BUFFER_TRACE(p2->bh, "call brelse"); brelse(p2->bh); p2--; } return 0; do_indirects: /* Kill the remaining (whole) subtrees */ switch (offsets[0]) { default: if (++n >= n2) break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } fallthrough; case EXT4_IND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } fallthrough; case EXT4_DIND_BLOCK: if (++n >= n2) break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } fallthrough; case EXT4_TIND_BLOCK: ; } goto cleanup; }
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> #include <linux/signal.h> #include <linux/iversion.h> #include <linux/ktime.h> #include <linux/netfs.h> #include "super.h" #include "mds_client.h" #include "cache.h" #include "metric.h" #include "crypto.h" #include <linux/ceph/osd_client.h> #include <linux/ceph/striper.h> /* * Ceph address space ops. * * There are a few funny things going on here. * * The page->private field is used to reference a struct * ceph_snap_context for _every_ dirty page. This indicates which * snapshot the page was logically dirtied in, and thus which snap * context needs to be associated with the osd write during writeback. * * Similarly, struct ceph_inode_info maintains a set of counters to * count dirty pages on the inode. In the absence of snapshots, * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. * * When a snapshot is taken (that is, when the client receives * notification that a snapshot was taken), each inode with caps and * with dirty pages (dirty pages implies there is a cap) gets a new * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending * order, new snaps go to the tail). The i_wrbuffer_ref_head count is * moved to capsnap->dirty. (Unless a sync write is currently in * progress. In that case, the capsnap is said to be "pending", new * writes cannot start, and the capsnap isn't "finalized" until the * write completes (or fails) and a final size/mtime for the inode for * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. * * On writeback, we must submit writes to the osd IN SNAP ORDER. So, * we look for the first capsnap in i_cap_snaps and write out pages in * that snap context _only_. Then we move on to the next capsnap, * eventually reaching the "live" or "head" context (i.e., pages that * are not yet snapped) and are writing the most recently dirtied * pages. * * Invalidate and so forth must take care to ensure the dirty page * accounting is preserved. */ #define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) #define CONGESTION_OFF_THRESH(congestion_kb) \ (CONGESTION_ON_THRESH(congestion_kb) - \ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, struct folio **foliop, void **_fsdata); static inline struct ceph_snap_context *page_snap_context(struct page *page) { if (PagePrivate(page)) return (void *)page->private; return NULL; } /* * Dirty a page. Optimistically adjust accounting, on the assumption * that we won't race with invalidate. If we do, readjust. */ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci; struct ceph_snap_context *snapc; if (folio_test_dirty(folio)) { doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n", ceph_vinop(inode), folio, folio->index); VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); return false; } ci = ceph_inode(inode); /* dirty the head */ spin_lock(&ci->i_ceph_lock); BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference if (__ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); snapc = ceph_get_snap_context(capsnap->context); capsnap->dirty_pages++; } else { BUG_ON(!ci->i_head_snapc); snapc = ceph_get_snap_context(ci->i_head_snapc); ++ci->i_wrbuffer_ref_head; } if (ci->i_wrbuffer_ref == 0) ihold(inode); ++ci->i_wrbuffer_ref; doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d " "snapc %p seq %lld (%d snaps)\n", ceph_vinop(inode), folio, folio->index, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, snapc, snapc->seq, snapc->num_snaps); spin_unlock(&ci->i_ceph_lock); /* * Reference snap context in folio->private. Also set * PagePrivate so that we get invalidate_folio callback. */ VM_WARN_ON_FOLIO(folio->private, folio); folio_attach_private(folio, snapc); return ceph_fscache_dirty_folio(mapping, folio); } /* * If we are truncating the full folio (i.e. offset == 0), adjust the * dirty folio counters appropriately. Only called if there is private * data on the folio. */ static void ceph_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct inode *inode = folio->mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; if (offset != 0 || length != folio_size(folio)) { doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n", ceph_vinop(inode), folio->index, offset, length); return; } WARN_ON(!folio_test_locked(folio)); if (folio_test_private(folio)) { doutc(cl, "%llx.%llx idx %lu full dirty page\n", ceph_vinop(inode), folio->index); snapc = folio_detach_private(folio); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); } netfs_invalidate_folio(folio, offset, length); } static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) { struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_layout *lo = &ci->i_layout; unsigned long max_pages = inode->i_sb->s_bdi->ra_pages; loff_t end = rreq->start + rreq->len, new_end; struct ceph_netfs_request_data *priv = rreq->netfs_priv; unsigned long max_len; u32 blockoff; if (priv) { /* Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM */ if (priv->file_ra_disabled) max_pages = 0; else max_pages = priv->file_ra_pages; } /* Readahead is disabled */ if (!max_pages) return; max_len = max_pages << PAGE_SHIFT; /* * Try to expand the length forward by rounding up it to the next * block, but do not exceed the file size, unless the original * request already exceeds it. */ new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size); if (new_end > end && new_end <= rreq->start + max_len) rreq->len = new_end - rreq->start; /* Try to expand the start downward */ div_u64_rem(rreq->start, lo->stripe_unit, &blockoff); if (rreq->len + blockoff <= max_len) { rreq->start -= blockoff; rreq->len += blockoff; } } static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) { struct inode *inode = subreq->rreq->inode; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 objno, objoff; u32 xlen; /* Truncate the extent at the end of the current block */ ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, &objno, &objoff, &xlen); subreq->len = min(xlen, fsc->mount_options->rsize); return true; } static void finish_netfs_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); struct netfs_io_subrequest *subreq = req->r_priv; struct ceph_osd_req_op *op = &req->r_ops[0]; int err = req->r_result; bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, osd_data->length, err); doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result, subreq->len, i_size_read(req->r_inode)); /* no object means success but no data */ if (err == -ENOENT) err = 0; else if (err == -EBLOCKLISTED) fsc->blocklisted = true; if (err >= 0) { if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); if (err < subreq->len && subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { err = ceph_fscrypt_decrypt_extents(inode, osd_data->pages, subreq->start, op->extent.sparse_ext, op->extent.sparse_ext_cnt); if (err > subreq->len) err = subreq->len; } } if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { ceph_put_page_vector(osd_data->pages, calc_pages_for(osd_data->alignment, osd_data->length), false); } netfs_subreq_terminated(subreq, err, false); iput(req->r_inode); ceph_dec_osd_stopping_blocker(fsc->mdsc); } static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct inode *inode = rreq->inode; struct ceph_mds_reply_info_parsed *rinfo; struct ceph_mds_reply_info_in *iinfo; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci = ceph_inode(inode); struct iov_iter iter; ssize_t err = 0; size_t len; int mode; if (rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); if (subreq->start >= inode->i_size) goto out; /* We need to fetch the inline data. */ mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } req->r_ino1 = ci->i_vino; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) goto out; rinfo = &req->r_reply_info; iinfo = &rinfo->targeti; if (iinfo->inline_version == CEPH_INLINE_NONE) { /* The data got uninlined */ ceph_mdsc_put_request(req); return false; } len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); if (err == 0) err = -EFAULT; ceph_mdsc_put_request(req); out: netfs_subreq_terminated(subreq, err, false); return true; } static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; int err = 0; u64 len = subreq->len; bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); u64 off = subreq->start; int extent_cnt; if (ceph_inode_is_shutdown(inode)) { err = -EIO; goto out; } if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; ceph_fscrypt_adjust_off_and_len(inode, &off, &len); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; goto out; } if (sparse) { extent_cnt = __ceph_sparse_read_ext_count(inode, len); err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt); if (err) goto out; } doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n", ceph_vinop(inode), subreq->start, subreq->len, len); iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); /* * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for * encrypted inodes. We'd need infrastructure that handles an iov_iter * instead of page arrays, and we don't have that as of yet. Once the * dust settles on the write helpers and encrypt/decrypt routines for * netfs, we should be able to rework this. */ if (IS_ENCRYPTED(inode)) { struct page **pages; size_t page_off; err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); if (err < 0) { doutc(cl, "%llx.%llx failed to allocate pages, %d\n", ceph_vinop(inode), err); goto out; } /* should always give us a page-aligned read */ WARN_ON_ONCE(page_off); len = err; err = 0; osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); } else { osd_req_op_extent_osd_iter(req, 0, &iter); } if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { err = -EIO; goto out; } req->r_callback = finish_netfs_read; req->r_priv = subreq; req->r_inode = inode; ihold(inode); ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); if (err) netfs_subreq_terminated(subreq, err, false); doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); } static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) { struct inode *inode = rreq->inode; struct ceph_client *cl = ceph_inode_to_client(inode); int got = 0, want = CEPH_CAP_FILE_CACHE; struct ceph_netfs_request_data *priv; int ret = 0; /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); if (rreq->origin != NETFS_READAHEAD) return 0; priv = kzalloc(sizeof(*priv), GFP_NOFS); if (!priv) return -ENOMEM; if (file) { struct ceph_rw_context *rw_ctx; struct ceph_file_info *fi = file->private_data; priv->file_ra_pages = file->f_ra.ra_pages; priv->file_ra_disabled = file->f_mode & FMODE_RANDOM; rw_ctx = ceph_find_rw_context(fi); if (rw_ctx) { rreq->netfs_priv = priv; return 0; } } /* * readahead callers do not necessarily hold Fcb caps * (e.g. fadvise, madvise). */ ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); if (ret < 0) { doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode)); goto out; } if (!(got & want)) { doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode)); ret = -EACCES; goto out; } if (ret == 0) { ret = -EACCES; goto out; } priv->caps = got; rreq->netfs_priv = priv; out: if (ret < 0) kfree(priv); return ret; } static void ceph_netfs_free_request(struct netfs_io_request *rreq) { struct ceph_netfs_request_data *priv = rreq->netfs_priv; if (!priv) return; if (priv->caps) ceph_put_cap_refs(ceph_inode(rreq->inode), priv->caps); kfree(priv); rreq->netfs_priv = NULL; } const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, .free_request = ceph_netfs_free_request, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, }; #ifdef CONFIG_CEPH_FSCACHE static void ceph_set_page_fscache(struct page *page) { folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ } static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) { struct inode *inode = priv; if (IS_ERR_VALUE(error) && error != -ENOBUFS) ceph_fscache_invalidate(inode, false); } static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) { struct ceph_inode_info *ci = ceph_inode(inode); struct fscache_cookie *cookie = ceph_fscache_cookie(ci); fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode), ceph_fscache_write_terminated, inode, true, caching); } #else static inline void ceph_set_page_fscache(struct page *page) { } static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) { } #endif /* CONFIG_CEPH_FSCACHE */ struct ceph_writeback_ctl { loff_t i_size; u64 truncate_size; u32 truncate_seq; bool size_stable; bool head_snapc; }; /* * Get ref for the oldest snapc for an inode with dirty data... that is, the * only snap context we are allowed to write back. */ static struct ceph_snap_context * get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, struct ceph_snap_context *page_snapc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_snap_context *snapc = NULL; struct ceph_cap_snap *capsnap = NULL; spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { doutc(cl, " capsnap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); if (!capsnap->dirty_pages) continue; /* get i_size, truncate_{seq,size} for page_snapc? */ if (snapc && capsnap->context != page_snapc) continue; if (ctl) { if (capsnap->writing) { ctl->i_size = i_size_read(inode); ctl->size_stable = false; } else { ctl->i_size = capsnap->size; ctl->size_stable = true; } ctl->truncate_size = capsnap->truncate_size; ctl->truncate_seq = capsnap->truncate_seq; ctl->head_snapc = false; } if (snapc) break; snapc = ceph_get_snap_context(capsnap->context); if (!page_snapc || page_snapc == snapc || page_snapc->seq > snapc->seq) break; } if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); doutc(cl, " head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); if (ctl) { ctl->i_size = i_size_read(inode); ctl->truncate_size = ci->i_truncate_size; ctl->truncate_seq = ci->i_truncate_seq; ctl->size_stable = false; ctl->head_snapc = true; } } spin_unlock(&ci->i_ceph_lock); return snapc; } static u64 get_writepages_data_length(struct inode *inode, struct page *page, u64 start) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; struct ceph_cap_snap *capsnap = NULL; u64 end = i_size_read(inode); u64 ret; snapc = page_snap_context(ceph_fscrypt_pagecache_page(page)); if (snapc != ci->i_head_snapc) { bool found = false; spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->context == snapc) { if (!capsnap->writing) end = capsnap->size; found = true; break; } } spin_unlock(&ci->i_ceph_lock); WARN_ON(!found); } if (end > ceph_fscrypt_page_offset(page) + thp_size(page)) end = ceph_fscrypt_page_offset(page) + thp_size(page); ret = end > start ? end - start : 0; if (ret && fscrypt_is_bounce_page(page)) ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE); return ret; } /* * Write a single page, but leave the page locked. * * If we get a write error, mark the mapping for error, but still adjust the * dirty page accounting (i.e., page is no longer dirty). */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); int err; loff_t len = thp_size(page); loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; bool caching = ceph_is_cache_enabled(inode); struct page *bounce_page = NULL; doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page, page->index); if (ceph_inode_is_shutdown(inode)) return -EIO; /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode), page); return 0; } oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n", ceph_vinop(inode), page, snapc); /* we should only noop if called by kswapd */ WARN_ON(!(current->flags & PF_MEMALLOC)); ceph_put_snap_context(oldest); redirty_page_for_writepage(wbc, page); return 0; } ceph_put_snap_context(oldest); /* is this a partial page at end of file? */ if (page_off >= ceph_wbc.i_size) { doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n", ceph_vinop(inode), folio->index, ceph_wbc.i_size); folio_invalidate(folio, 0, folio_size(folio)); return 0; } if (ceph_wbc.i_size < page_off + len) len = ceph_wbc.i_size - page_off; wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n", ceph_vinop(inode), page, page->index, page_off, wlen, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) fsc->write_congested = true; req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); if (IS_ERR(req)) { redirty_page_for_writepage(wbc, page); return PTR_ERR(req); } if (wlen < len) len = wlen; set_page_writeback(page); if (caching) ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { bounce_page = fscrypt_encrypt_pagecache_blocks(page, CEPH_FSCRYPT_BLOCK_SIZE, 0, GFP_NOFS); if (IS_ERR(bounce_page)) { redirty_page_for_writepage(wbc, page); end_page_writeback(page); ceph_osdc_put_request(req); return PTR_ERR(bounce_page); } } /* it may be a short write due to an object boundary */ WARN_ON_ONCE(len > thp_size(page)); osd_req_op_extent_osd_data_pages(req, 0, bounce_page ? &bounce_page : &page, wlen, 0, false, false); doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n", ceph_vinop(inode), page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not "); req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(osdc, req); err = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); fscrypt_free_bounce_page(bounce_page); ceph_osdc_put_request(req); if (err == 0) err = len; if (err < 0) { struct writeback_control tmp_wbc; if (!wbc) wbc = &tmp_wbc; if (err == -ERESTARTSYS) { /* killed by SIGKILL */ doutc(cl, "%llx.%llx interrupted page %p\n", ceph_vinop(inode), page); redirty_page_for_writepage(wbc, page); end_page_writeback(page); return err; } if (err == -EBLOCKLISTED) fsc->blocklisted = true; doutc(cl, "%llx.%llx setting page/mapping error %d %p\n", ceph_vinop(inode), err, page); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { doutc(cl, "%llx.%llx cleaned page %p\n", ceph_vinop(inode), page); err = 0; /* vfs expects us to return 0 */ } oldest = detach_page_private(page); WARN_ON_ONCE(oldest != snapc); end_page_writeback(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) fsc->write_congested = false; return err; } static int ceph_writepage(struct page *page, struct writeback_control *wbc) { int err; struct inode *inode = page->mapping->host; BUG_ON(!inode); ihold(inode); if (wbc->sync_mode == WB_SYNC_NONE && ceph_inode_to_fs_client(inode)->write_congested) { redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. return 0 * to prevent caller from setting mapping/page error */ err = 0; } unlock_page(page); iput(inode); return err; } /* * async writeback completion handler. * * If we get an error, set the mapping error bit, but not the individual * page error bits. */ static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data; struct page *page; int num_pages, total_pages = 0; int i, j; int rc = req->r_result; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); unsigned int len = 0; bool remove_page; doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc); if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); if (rc == -EBLOCKLISTED) fsc->blocklisted = true; } else { ceph_clear_error_write(ci); } /* * We lost the cache cap, need to truncate the page before * it is unlocked, otherwise we'd truncate it later in the * page truncation thread, possibly losing some data that * raced its way in */ remove_page = !(ceph_caps_issued(ci) & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); /* clean all pages */ for (i = 0; i < req->r_num_ops; i++) { if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) { pr_warn_client(cl, "%llx.%llx incorrect op %d req %p index %d tid %llu\n", ceph_vinop(inode), req->r_ops[i].op, req, i, req->r_tid); break; } osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); len += osd_data->length; num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); total_pages += num_pages; for (j = 0; j < num_pages; j++) { page = osd_data->pages[j]; if (fscrypt_is_bounce_page(page)) { page = fscrypt_pagecache_page(page); fscrypt_free_bounce_page(osd_data->pages[j]); osd_data->pages[j] = page; } BUG_ON(!page); WARN_ON(!PageUptodate(page)); if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH( fsc->mount_options->congestion_kb)) fsc->write_congested = false; ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); doutc(cl, "unlocking %p\n", page); if (remove_page) generic_error_remove_folio(inode->i_mapping, page_folio(page)); unlock_page(page); } doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n", ceph_vinop(inode), osd_data->length, rc >= 0 ? num_pages : 0); release_pages(osd_data->pages, num_pages); } ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, rc); ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); osd_data = osd_req_op_extent_osd_data(req, 0); if (osd_data->pages_from_pool) mempool_free(osd_data->pages, ceph_wb_pagevec_pool); else kfree(osd_data->pages); ceph_osdc_put_request(req); ceph_dec_osd_stopping_blocker(fsc->mdsc); } /* * initiate async writeback */ static int ceph_writepages_start(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; struct folio_batch fbatch; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; struct ceph_writeback_ctl ceph_wbc; bool should_loop, range_whole = false; bool done = false; bool caching = ceph_is_cache_enabled(inode); xa_mark_t tag; if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) return 0; doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ceph_inode_is_shutdown(inode)) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited_client(cl, "%llx.%llx %lld forced umount\n", ceph_vinop(inode), ceph_ino(inode)); } mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; folio_batch_init(&fbatch); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { tag = PAGECACHE_TAG_TOWRITE; } else { tag = PAGECACHE_TAG_DIRTY; } retry: /* find oldest snap context with dirty data */ snapc = get_oldest_context(inode, &ceph_wbc, NULL); if (!snapc) { /* hmm, why does writepages get called when there is no dirty data? */ doutc(cl, " no snap context with dirty data?\n"); goto out; } doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); should_loop = false; if (ceph_wbc.head_snapc && snapc != last_snapc) { /* where to start/end? */ if (wbc->range_cyclic) { index = start_index; end = -1; if (index > 0) should_loop = true; doutc(cl, " cyclic, start at %lu\n", index); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = true; doutc(cl, " not cyclic, %lu to %lu\n", index, end); } } else if (!ceph_wbc.head_snapc) { /* Do not respect wbc->range_{start,end}. Dirty pages * in that range can be associated with newer snapc. * They are not writeable until we write all dirty pages * associated with 'snapc' get written */ if (index > 0) should_loop = true; doutc(cl, " non-head snapc, range whole\n"); } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); ceph_put_snap_context(last_snapc); last_snapc = snapc; while (!done && index <= end) { int num_ops = 0, op_idx; unsigned i, nr_folios, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; struct page *page; pgoff_t strip_unit_end = 0; u64 offset = 0, len = 0; bool from_pool = false; max_pages = wsize >> PAGE_SHIFT; get_more_pages: nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios); if (!nr_folios && !locked_pages) break; for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { page = &fbatch.folios[i]->page; doutc(cl, "? %p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ else if (!trylock_page(page)) break; /* only dirty pages, or our accounting breaks */ if (unlikely(!PageDirty(page)) || unlikely(page->mapping != mapping)) { doutc(cl, "!dirty or !mapping %p\n", page); unlock_page(page); continue; } /* only if matching snap context */ pgsnapc = page_snap_context(page); if (pgsnapc != snapc) { doutc(cl, "page snapc %p %lld != oldest %p %lld\n", pgsnapc, pgsnapc->seq, snapc, snapc->seq); if (!should_loop && !ceph_wbc.head_snapc && wbc->sync_mode != WB_SYNC_NONE) should_loop = true; unlock_page(page); continue; } if (page_offset(page) >= ceph_wbc.i_size) { struct folio *folio = page_folio(page); doutc(cl, "folio at %lu beyond eof %llu\n", folio->index, ceph_wbc.i_size); if ((ceph_wbc.size_stable || folio_pos(folio) >= i_size_read(inode)) && folio_clear_dirty_for_io(folio)) folio_invalidate(folio, 0, folio_size(folio)); folio_unlock(folio); continue; } if (strip_unit_end && (page->index > strip_unit_end)) { doutc(cl, "end of strip unit %p\n", page); unlock_page(page); break; } if (PageWriteback(page) || PagePrivate2(page) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { doutc(cl, "%p under writeback\n", page); unlock_page(page); continue; } doutc(cl, "waiting on writeback %p\n", page); wait_on_page_writeback(page); folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ } if (!clear_page_dirty_for_io(page)) { doutc(cl, "%p !clear_page_dirty_for_io\n", page); unlock_page(page); continue; } /* * We have something to write. If this is * the first locked page this time through, * calculate max possinle write size and * allocate a page array */ if (locked_pages == 0) { u64 objnum; u64 objoff; u32 xlen; /* prepare async write request */ offset = (u64)page_offset(page); ceph_calc_file_object_mapping(&ci->i_layout, offset, wsize, &objnum, &objoff, &xlen); len = xlen; num_ops = 1; strip_unit_end = page->index + ((len - 1) >> PAGE_SHIFT); BUG_ON(pages); max_pages = calc_pages_for(0, (u64)len); pages = kmalloc_array(max_pages, sizeof(*pages), GFP_NOFS); if (!pages) { from_pool = true; pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); BUG_ON(!pages); } len = 0; } else if (page->index != (offset + len) >> PAGE_SHIFT) { if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS)) { redirty_page_for_writepage(wbc, page); unlock_page(page); break; } num_ops++; offset = (u64)page_offset(page); len = 0; } /* note position of first page in fbatch */ doutc(cl, "%llx.%llx will write page %p idx %lu\n", ceph_vinop(inode), page, page->index); if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH( fsc->mount_options->congestion_kb)) fsc->write_congested = true; if (IS_ENCRYPTED(inode)) { pages[locked_pages] = fscrypt_encrypt_pagecache_blocks(page, PAGE_SIZE, 0, locked_pages ? GFP_NOWAIT : GFP_NOFS); if (IS_ERR(pages[locked_pages])) { if (PTR_ERR(pages[locked_pages]) == -EINVAL) pr_err_client(cl, "inode->i_blkbits=%hhu\n", inode->i_blkbits); /* better not fail on first page! */ BUG_ON(locked_pages == 0); pages[locked_pages] = NULL; redirty_page_for_writepage(wbc, page); unlock_page(page); break; } ++locked_pages; } else { pages[locked_pages++] = page; } fbatch.folios[i] = NULL; len += thp_size(page); } /* did we get anything? */ if (!locked_pages) goto release_folios; if (i) { unsigned j, n = 0; /* shift unused page to beginning of fbatch */ for (j = 0; j < nr_folios; j++) { if (!fbatch.folios[j]) continue; if (n < j) fbatch.folios[n] = fbatch.folios[j]; n++; } fbatch.nr = n; if (nr_folios && i == nr_folios && locked_pages < max_pages) { doutc(cl, "reached end fbatch, trying for more\n"); folio_batch_release(&fbatch); goto get_more_pages; } } new_request: offset = ceph_fscrypt_page_offset(pages[0]); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, offset, &len, 0, num_ops, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, false); if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, offset, &len, 0, min(num_ops, CEPH_OSD_SLAB_OPS), CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + thp_size(pages[locked_pages - 1]) - offset); if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { rc = -EIO; goto release_folios; } req->r_callback = writepages_finish; req->r_inode = inode; /* Format the osd request message and submit the write */ len = 0; data_pages = pages; op_idx = 0; for (i = 0; i < locked_pages; i++) { struct page *page = ceph_fscrypt_pagecache_page(pages[i]); u64 cur_offset = page_offset(page); /* * Discontinuity in page range? Ceph can handle that by just passing * multiple extents in the write op. */ if (offset + len != cur_offset) { /* If it's full, stop here */ if (op_idx + 1 == req->r_num_ops) break; /* Kick off an fscache write with what we have so far. */ ceph_fscache_write_to_cache(inode, offset, len, caching); /* Start a new extent */ osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); doutc(cl, "got pages at %llu~%llu\n", offset, len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, from_pool, false); osd_req_op_extent_update(req, op_idx, len); len = 0; offset = cur_offset; data_pages = pages + i; op_idx++; } set_page_writeback(page); if (caching) ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); if (ceph_wbc.size_stable) { len = min(len, ceph_wbc.i_size - offset); } else if (i == locked_pages) { /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ u64 min_len = len + 1 - thp_size(page); len = get_writepages_data_length(inode, pages[i - 1], offset); len = max(len, min_len); } if (IS_ENCRYPTED(inode)) len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); doutc(cl, "got pages at %llu~%llu\n", offset, len); if (IS_ENCRYPTED(inode) && ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) pr_warn_client(cl, "bad encrypted write offset=%lld len=%llu\n", offset, len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, from_pool, false); osd_req_op_extent_update(req, op_idx, len); BUG_ON(op_idx + 1 != req->r_num_ops); from_pool = false; if (i < locked_pages) { BUG_ON(num_ops <= req->r_num_ops); num_ops -= req->r_num_ops; locked_pages -= i; /* allocate new pages array for next request */ data_pages = pages; pages = kmalloc_array(locked_pages, sizeof(*pages), GFP_NOFS); if (!pages) { from_pool = true; pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); BUG_ON(!pages); } memcpy(pages, data_pages + i, locked_pages * sizeof(*pages)); memset(data_pages + i, 0, locked_pages * sizeof(*pages)); } else { BUG_ON(num_ops != req->r_num_ops); index = pages[i - 1]->index + 1; /* request message now owns the pages array */ pages = NULL; } req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); req = NULL; wbc->nr_to_write -= i; if (pages) goto new_request; /* * We stop writing back only if we are not doing * integrity sync. In case of integrity sync we have to * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) done = true; release_folios: doutc(cl, "folio_batch release on %d folios (%p)\n", (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL); folio_batch_release(&fbatch); } if (should_loop && !done) { /* more to do; loop back to beginning of file */ doutc(cl, "looping back to beginning of file\n"); end = start_index - 1; /* OK even when start_index == 0 */ /* to write dirty pages associated with next snapc, * we need to wait until current writes complete */ if (wbc->sync_mode != WB_SYNC_NONE && start_index == 0 && /* all dirty pages were checked */ !ceph_wbc.head_snapc) { struct page *page; unsigned i, nr; index = 0; while ((index <= end) && (nr = filemap_get_folios_tag(mapping, &index, (pgoff_t)-1, PAGECACHE_TAG_WRITEBACK, &fbatch))) { for (i = 0; i < nr; i++) { page = &fbatch.folios[i]->page; if (page_snap_context(page) != snapc) continue; wait_on_page_writeback(page); } folio_batch_release(&fbatch); cond_resched(); } } start_index = 0; index = 0; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; out: ceph_osdc_put_request(req); ceph_put_snap_context(last_snapc); doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode), rc); return rc; } /* * See if a given @snapc is either writeable, or already written. */ static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL); int ret = !oldest || snapc->seq <= oldest->seq; ceph_put_snap_context(oldest); return ret; } /** * ceph_find_incompatible - find an incompatible context and return it * @page: page being dirtied * * We are only allowed to write into/dirty a page if the page is * clean, or already dirty within the same snap context. Returns a * conflicting context if there is one, NULL if there isn't, or a * negative error code on other errors. * * Must be called with page lock held. */ static struct ceph_snap_context * ceph_find_incompatible(struct page *page) { struct inode *inode = page->mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); if (ceph_inode_is_shutdown(inode)) { doutc(cl, " %llx.%llx page %p is shutdown\n", ceph_vinop(inode), page); return ERR_PTR(-ESTALE); } for (;;) { struct ceph_snap_context *snapc, *oldest; wait_on_page_writeback(page); snapc = page_snap_context(page); if (!snapc || snapc == ci->i_head_snapc) break; /* * this page is already dirty in another (older) snap * context! is it writeable now? */ oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { /* not writeable -- return it for the caller to deal with */ ceph_put_snap_context(oldest); doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n", ceph_vinop(inode), page, snapc); return ceph_get_snap_context(snapc); } ceph_put_snap_context(oldest); /* yay, writeable, do it now (without dropping page lock) */ doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n", ceph_vinop(inode), page, snapc); if (clear_page_dirty_for_io(page)) { int r = writepage_nounlock(page, NULL); if (r < 0) return ERR_PTR(r); } } return NULL; } static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, struct folio **foliop, void **_fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; snapc = ceph_find_incompatible(folio_page(*foliop, 0)); if (snapc) { int r; folio_unlock(*foliop); folio_put(*foliop); *foliop = NULL; if (IS_ERR(snapc)) return PTR_ERR(snapc); ceph_queue_writeback(inode); r = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); return r == 0 ? -EAGAIN : r; } return 0; } /* * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. */ static int ceph_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; int r; r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); if (r < 0) return r; folio_wait_private_2(folio); /* [DEPRECATED] */ WARN_ON_ONCE(!folio_test_locked(folio)); *pagep = &folio->page; return 0; } /* * we don't do anything in here that simple_write_end doesn't do * except adjust dirty page accounting */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *subpage, void *fsdata) { struct folio *folio = page_folio(subpage); struct inode *inode = file_inode(file); struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode), file, folio, (int)pos, (int)copied, (int)len); if (!folio_test_uptodate(folio)) { /* just return that nothing was copied on a short copy */ if (copied < len) { copied = 0; goto out; } folio_mark_uptodate(folio); } /* did file size increase? */ if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); folio_mark_dirty(folio); out: folio_unlock(folio); folio_put(folio); if (check_cap) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY); return copied; } const struct address_space_operations ceph_aops = { .read_folio = netfs_read_folio, .readahead = netfs_readahead, .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, .write_end = ceph_write_end, .dirty_folio = ceph_dirty_folio, .invalidate_folio = ceph_invalidate_folio, .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, }; static void ceph_block_sigs(sigset_t *oldset) { sigset_t mask; siginitsetinv(&mask, sigmask(SIGKILL)); sigprocmask(SIG_BLOCK, &mask, oldset); } static void ceph_restore_sigs(sigset_t *oldset) { sigprocmask(SIG_SETMASK, oldset, NULL); } /* * vm ops */ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_file_info *fi = vma->vm_file->private_data; loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; int want, got, err; sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS; if (ceph_inode_is_shutdown(inode)) return ret; ceph_block_sigs(&oldset); doutc(cl, "%llx.%llx %llu trying to get caps\n", ceph_vinop(inode), off); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; got = 0; err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got); if (err < 0) goto out_restore; doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode), off, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || !ceph_has_inline_data(ci)) { CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); ceph_del_rw_context(fi, &rw_ctx); doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n", ceph_vinop(inode), off, ceph_cap_string(got), ret); } else err = -EAGAIN; ceph_put_cap_refs(ci, got); if (err != -EAGAIN) goto out_restore; /* read inline data */ if (off >= PAGE_SIZE) { /* does not support inline data > PAGE_SIZE */ ret = VM_FAULT_SIGBUS; } else { struct address_space *mapping = inode->i_mapping; struct page *page; filemap_invalidate_lock_shared(mapping); page = find_or_create_page(mapping, 0, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; goto out_inline; } err = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true); if (err < 0 || off >= i_size_read(inode)) { unlock_page(page); put_page(page); ret = vmf_error(err); goto out_inline; } if (err < PAGE_SIZE) zero_user_segment(page, err, PAGE_SIZE); else flush_dcache_page(page); SetPageUptodate(page); vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: filemap_invalidate_unlock_shared(mapping); doutc(cl, "%llx.%llx %llu read inline data ret %x\n", ceph_vinop(inode), off, ret); } out_restore: ceph_restore_sigs(&oldset); if (err < 0) ret = vmf_error(err); return ret; } static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_cap_flush *prealloc_cf; struct page *page = vmf->page; loff_t off = page_offset(page); loff_t size = i_size_read(inode); size_t len; int want, got, err; sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS; if (ceph_inode_is_shutdown(inode)) return ret; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return VM_FAULT_OOM; sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); if (off + thp_size(page) <= size) len = thp_size(page); else len = offset_in_thp(page, size); doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n", ceph_vinop(inode), off, len, size); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_BUFFER; got = 0; err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got); if (err < 0) goto out_free; doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode), off, len, ceph_cap_string(got)); /* Update time before taking page lock */ file_update_time(vma->vm_file); inode_inc_iversion_raw(inode); do { struct ceph_snap_context *snapc; lock_page(page); if (page_mkwrite_check_truncate(page, inode) < 0) { unlock_page(page); ret = VM_FAULT_NOPAGE; break; } snapc = ceph_find_incompatible(page); if (!snapc) { /* success. we'll keep the page locked. */ set_page_dirty(page); ret = VM_FAULT_LOCKED; break; } unlock_page(page); if (IS_ERR(snapc)) { ret = VM_FAULT_SIGBUS; break; } ceph_queue_writeback(inode); err = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); } while (err == 0); if (ret == VM_FAULT_LOCKED) { int dirty; spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); } doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n", ceph_vinop(inode), off, len, ceph_cap_string(got), ret); ceph_put_cap_refs_async(ci, got); out_free: ceph_restore_sigs(&oldset); sb_end_pagefault(inode->i_sb); ceph_free_cap_flush(prealloc_cf); if (err < 0) ret = vmf_error(err); return ret; } void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len) { struct ceph_client *cl = ceph_inode_to_client(inode); struct address_space *mapping = inode->i_mapping; struct page *page; if (locked_page) { page = locked_page; } else { if (i_size_read(inode) == 0) return; page = find_or_create_page(mapping, 0, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) return; if (PageUptodate(page)) { unlock_page(page); put_page(page); return; } } doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode, ceph_vinop(inode), len, locked_page); if (len > 0) { void *kaddr = kmap_atomic(page); memcpy(kaddr, data, len); kunmap_atomic(kaddr); } if (page != locked_page) { if (len < PAGE_SIZE) zero_user_segment(page, len, PAGE_SIZE); else flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); put_page(page); } } int ceph_uninline_data(struct file *file) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_osd_request *req = NULL; struct ceph_cap_flush *prealloc_cf = NULL; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; struct page *pages[1]; int err = 0; u64 len; spin_lock(&ci->i_ceph_lock); inline_version = ci->i_inline_version; spin_unlock(&ci->i_ceph_lock); doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode), inline_version); if (ceph_inode_is_shutdown(inode)) { err = -EIO; goto out; } if (inline_version == CEPH_INLINE_NONE) return 0; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; if (inline_version == 1) /* initial version, no data */ goto out_uninline; folio = read_mapping_folio(inode->i_mapping, 0, file); if (IS_ERR(folio)) { err = PTR_ERR(folio); goto out; } folio_lock(folio); len = i_size_read(inode); if (len > folio_size(folio)) len = folio_size(folio); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out_unlock; } req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); if (err < 0) goto out_unlock; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out_unlock; } pages[0] = folio_page(folio, 0); osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false); { __le64 xattr_buf = cpu_to_le64(inline_version); err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, "inline_version", &xattr_buf, sizeof(xattr_buf), CEPH_OSD_CMPXATTR_OP_GT, CEPH_OSD_CMPXATTR_MODE_U64); if (err) goto out_put_req; } { char xattr_buf[32]; int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf), "%llu", inline_version); err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, "inline_version", xattr_buf, xattr_len, 0, 0); if (err) goto out_put_req; } req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); out_uninline: if (!err) { int dirty; /* Set to CAP_INLINE_NONE and dirty the caps */ down_read(&fsc->mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); up_read(&fsc->mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); } out_put_req: ceph_osdc_put_request(req); if (err == -ECANCELED) err = 0; out_unlock: if (folio) { folio_unlock(folio); folio_put(folio); } out: ceph_free_cap_flush(prealloc_cf); doutc(cl, "%llx.%llx inline_version %llu = %d\n", ceph_vinop(inode), inline_version, err); return err; } static const struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; vma->vm_ops = &ceph_vmops; return 0; } enum { POOL_READ = 1, POOL_WRITE = 2, }; static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool, struct ceph_string *pool_ns) { struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client *cl = fsc->client; struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; struct rb_node **p, *parent; struct ceph_pool_perm *perm; struct page **pages; size_t pool_ns_len; int err = 0, err2 = 0, have = 0; down_read(&mdsc->pool_perm_rwsem); p = &mdsc->pool_perm_tree.rb_node; while (*p) { perm = rb_entry(*p, struct ceph_pool_perm, node); if (pool < perm->pool) p = &(*p)->rb_left; else if (pool > perm->pool) p = &(*p)->rb_right; else { int ret = ceph_compare_string(pool_ns, perm->pool_ns, perm->pool_ns_len); if (ret < 0) p = &(*p)->rb_left; else if (ret > 0) p = &(*p)->rb_right; else { have = perm->perm; break; } } } up_read(&mdsc->pool_perm_rwsem); if (*p) goto out; if (pool_ns) doutc(cl, "pool %lld ns %.*s no perm cached\n", pool, (int)pool_ns->len, pool_ns->str); else doutc(cl, "pool %lld no perm cached\n", pool); down_write(&mdsc->pool_perm_rwsem); p = &mdsc->pool_perm_tree.rb_node; parent = NULL; while (*p) { parent = *p; perm = rb_entry(parent, struct ceph_pool_perm, node); if (pool < perm->pool) p = &(*p)->rb_left; else if (pool > perm->pool) p = &(*p)->rb_right; else { int ret = ceph_compare_string(pool_ns, perm->pool_ns, perm->pool_ns_len); if (ret < 0) p = &(*p)->rb_left; else if (ret > 0) p = &(*p)->rb_right; else { have = perm->perm; break; } } } if (*p) { up_write(&mdsc->pool_perm_rwsem); goto out; } rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); if (!rd_req) { err = -ENOMEM; goto out_unlock; } rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; if (pool_ns) rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns); ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); if (err) goto out_unlock; wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); if (!wr_req) { err = -ENOMEM; goto out_unlock; } wr_req->r_flags = CEPH_OSD_FLAG_WRITE; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); if (err) goto out_unlock; /* one page should be large enough for STAT data */ pages = ceph_alloc_page_vector(1, GFP_KERNEL); if (IS_ERR(pages)) { err = PTR_ERR(pages); goto out_unlock; } osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); ceph_osdc_start_request(&fsc->client->osdc, rd_req); wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode); ceph_osdc_start_request(&fsc->client->osdc, wr_req); err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); if (err >= 0 || err == -ENOENT) have |= POOL_READ; else if (err != -EPERM) { if (err == -EBLOCKLISTED) fsc->blocklisted = true; goto out_unlock; } if (err2 == 0 || err2 == -EEXIST) have |= POOL_WRITE; else if (err2 != -EPERM) { if (err2 == -EBLOCKLISTED) fsc->blocklisted = true; err = err2; goto out_unlock; } pool_ns_len = pool_ns ? pool_ns->len : 0; perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS); if (!perm) { err = -ENOMEM; goto out_unlock; } perm->pool = pool; perm->perm = have; perm->pool_ns_len = pool_ns_len; if (pool_ns_len > 0) memcpy(perm->pool_ns, pool_ns->str, pool_ns_len); perm->pool_ns[pool_ns_len] = 0; rb_link_node(&perm->node, parent, p); rb_insert_color(&perm->node, &mdsc->pool_perm_tree); err = 0; out_unlock: up_write(&mdsc->pool_perm_rwsem); ceph_osdc_put_request(rd_req); ceph_osdc_put_request(wr_req); out: if (!err) err = have; if (pool_ns) doutc(cl, "pool %lld ns %.*s result = %d\n", pool, (int)pool_ns->len, pool_ns->str, err); else doutc(cl, "pool %lld result = %d\n", pool, err); return err; } int ceph_pool_perm_check(struct inode *inode, int need) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_string *pool_ns; s64 pool; int ret, flags; /* Only need to do this for regular files */ if (!S_ISREG(inode->i_mode)) return 0; if (ci->i_vino.snap != CEPH_NOSNAP) { /* * Pool permission check needs to write to the first object. * But for snapshot, head of the first object may have alread * been deleted. Skip check to avoid creating orphan object. */ return 0; } if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode), NOPOOLPERM)) return 0; spin_lock(&ci->i_ceph_lock); flags = ci->i_ceph_flags; pool = ci->i_layout.pool_id; spin_unlock(&ci->i_ceph_lock); check: if (flags & CEPH_I_POOL_PERM) { if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { doutc(cl, "pool %lld no read perm\n", pool); return -EPERM; } if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { doutc(cl, "pool %lld no write perm\n", pool); return -EPERM; } return 0; } pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); ret = __ceph_pool_perm_get(ci, pool, pool_ns); ceph_put_string(pool_ns); if (ret < 0) return ret; flags = CEPH_I_POOL_PERM; if (ret & POOL_READ) flags |= CEPH_I_POOL_RD; if (ret & POOL_WRITE) flags |= CEPH_I_POOL_WR; spin_lock(&ci->i_ceph_lock); if (pool == ci->i_layout.pool_id && pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) { ci->i_ceph_flags |= flags; } else { pool = ci->i_layout.pool_id; flags = ci->i_ceph_flags; } spin_unlock(&ci->i_ceph_lock); goto check; } void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc) { struct ceph_pool_perm *perm; struct rb_node *n; while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) { n = rb_first(&mdsc->pool_perm_tree); perm = rb_entry(n, struct ceph_pool_perm, node); rb_erase(n, &mdsc->pool_perm_tree); kfree(perm); } }
22 22 977 978 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 // SPDX-License-Identifier: GPL-2.0 /* * blk-integrity.c - Block layer data integrity extensions * * Copyright (C) 2007, 2008 Oracle Corporation * Written by: Martin K. Petersen <martin.petersen@oracle.com> */ #include <linux/blk-integrity.h> #include <linux/backing-dev.h> #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> #include <linux/export.h> #include <linux/slab.h> #include "blk.h" /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @q: request queue * @bio: bio with integrity metadata attached * * Description: Returns the number of elements required in a * scatterlist corresponding to the integrity metadata in a bio. */ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) { struct bio_vec iv, ivprv = { NULL }; unsigned int segments = 0; unsigned int seg_size = 0; struct bvec_iter iter; int prev = 0; bio_for_each_integrity_vec(iv, bio, iter) { if (prev) { if (!biovec_phys_mergeable(q, &ivprv, &iv)) goto new_segment; if (seg_size + iv.bv_len > queue_max_segment_size(q)) goto new_segment; seg_size += iv.bv_len; } else { new_segment: segments++; seg_size = iv.bv_len; } prev = 1; ivprv = iv; } return segments; } EXPORT_SYMBOL(blk_rq_count_integrity_sg); /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist * @q: request queue * @bio: bio with integrity metadata attached * @sglist: target scatterlist * * Description: Map the integrity vectors in request into a * scatterlist. The scatterlist must be big enough to hold all * elements. I.e. sized using blk_rq_count_integrity_sg(). */ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist) { struct bio_vec iv, ivprv = { NULL }; struct scatterlist *sg = NULL; unsigned int segments = 0; struct bvec_iter iter; int prev = 0; bio_for_each_integrity_vec(iv, bio, iter) { if (prev) { if (!biovec_phys_mergeable(q, &ivprv, &iv)) goto new_segment; if (sg->length + iv.bv_len > queue_max_segment_size(q)) goto new_segment; sg->length += iv.bv_len; } else { new_segment: if (!sg) sg = sglist; else { sg_unmark_end(sg); sg = sg_next(sg); } sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset); segments++; } prev = 1; ivprv = iv; } if (sg) sg_mark_end(sg); return segments; } EXPORT_SYMBOL(blk_rq_map_integrity_sg); bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, struct request *next) { if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0) return true; if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0) return false; if (bio_integrity(req->bio)->bip_flags != bio_integrity(next->bio)->bip_flags) return false; if (req->nr_integrity_segments + next->nr_integrity_segments > q->limits.max_integrity_segments) return false; if (integrity_req_gap_back_merge(req, next->bio)) return false; return true; } bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, struct bio *bio) { int nr_integrity_segs; struct bio *next = bio->bi_next; if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL) return true; if (blk_integrity_rq(req) == 0 || bio_integrity(bio) == NULL) return false; if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags) return false; bio->bi_next = NULL; nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); bio->bi_next = next; if (req->nr_integrity_segments + nr_integrity_segs > q->limits.max_integrity_segments) return false; req->nr_integrity_segments += nr_integrity_segs; return true; } static inline struct blk_integrity *dev_to_bi(struct device *dev) { return &dev_to_disk(dev)->queue->limits.integrity; } const char *blk_integrity_profile_name(struct blk_integrity *bi) { switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_IP: if (bi->flags & BLK_INTEGRITY_REF_TAG) return "T10-DIF-TYPE1-IP"; return "T10-DIF-TYPE3-IP"; case BLK_INTEGRITY_CSUM_CRC: if (bi->flags & BLK_INTEGRITY_REF_TAG) return "T10-DIF-TYPE1-CRC"; return "T10-DIF-TYPE3-CRC"; case BLK_INTEGRITY_CSUM_CRC64: if (bi->flags & BLK_INTEGRITY_REF_TAG) return "EXT-DIF-TYPE1-CRC64"; return "EXT-DIF-TYPE3-CRC64"; case BLK_INTEGRITY_CSUM_NONE: break; } return "nop"; } EXPORT_SYMBOL_GPL(blk_integrity_profile_name); static ssize_t flag_store(struct device *dev, const char *page, size_t count, unsigned char flag) { struct request_queue *q = dev_to_disk(dev)->queue; struct queue_limits lim; unsigned long val; int err; err = kstrtoul(page, 10, &val); if (err) return err; /* note that the flags are inverted vs the values in the sysfs files */ lim = queue_limits_start_update(q); if (val) lim.integrity.flags &= ~flag; else lim.integrity.flags |= flag; blk_mq_freeze_queue(q); err = queue_limits_commit_update(q, &lim); blk_mq_unfreeze_queue(q); if (err) return err; return count; } static ssize_t flag_show(struct device *dev, char *page, unsigned char flag) { struct blk_integrity *bi = dev_to_bi(dev); return sysfs_emit(page, "%d\n", !(bi->flags & flag)); } static ssize_t format_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); if (!bi->tuple_size) return sysfs_emit(page, "none\n"); return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi)); } static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); return sysfs_emit(page, "%u\n", bi->tag_size); } static ssize_t protection_interval_bytes_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); return sysfs_emit(page, "%u\n", bi->interval_exp ? 1 << bi->interval_exp : 0); } static ssize_t read_verify_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { return flag_store(dev, page, count, BLK_INTEGRITY_NOVERIFY); } static ssize_t read_verify_show(struct device *dev, struct device_attribute *attr, char *page) { return flag_show(dev, page, BLK_INTEGRITY_NOVERIFY); } static ssize_t write_generate_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { return flag_store(dev, page, count, BLK_INTEGRITY_NOGENERATE); } static ssize_t write_generate_show(struct device *dev, struct device_attribute *attr, char *page) { return flag_show(dev, page, BLK_INTEGRITY_NOGENERATE); } static ssize_t device_is_integrity_capable_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); return sysfs_emit(page, "%u\n", !!(bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)); } static DEVICE_ATTR_RO(format); static DEVICE_ATTR_RO(tag_size); static DEVICE_ATTR_RO(protection_interval_bytes); static DEVICE_ATTR_RW(read_verify); static DEVICE_ATTR_RW(write_generate); static DEVICE_ATTR_RO(device_is_integrity_capable); static struct attribute *integrity_attrs[] = { &dev_attr_format.attr, &dev_attr_tag_size.attr, &dev_attr_protection_interval_bytes.attr, &dev_attr_read_verify.attr, &dev_attr_write_generate.attr, &dev_attr_device_is_integrity_capable.attr, NULL }; const struct attribute_group blk_integrity_attr_group = { .name = "integrity", .attrs = integrity_attrs, };
10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_BUFFER_H #define _LINUX_TTY_BUFFER_H #include <linux/atomic.h> #include <linux/llist.h> #include <linux/mutex.h> #include <linux/workqueue.h> struct tty_buffer { union { struct tty_buffer *next; struct llist_node free; }; unsigned int used; unsigned int size; unsigned int commit; unsigned int lookahead; /* Lazy update on recv, can become less than "read" */ unsigned int read; bool flags; /* Data points here */ u8 data[] __aligned(sizeof(unsigned long)); }; static inline u8 *char_buf_ptr(struct tty_buffer *b, unsigned int ofs) { return b->data + ofs; } static inline u8 *flag_buf_ptr(struct tty_buffer *b, unsigned int ofs) { return char_buf_ptr(b, ofs) + b->size; } struct tty_bufhead { struct tty_buffer *head; /* Queue head */ struct work_struct work; struct mutex lock; atomic_t priority; struct tty_buffer sentinel; struct llist_head free; /* Free queue head */ atomic_t mem_used; /* In-use buffers excluding free list */ int mem_limit; struct tty_buffer *tail; /* Active buffer */ }; /* * When a break, frame error, or parity error happens, these codes are * stuffed into the flags buffer. */ #define TTY_NORMAL 0 #define TTY_BREAK 1 #define TTY_FRAME 2 #define TTY_PARITY 3 #define TTY_OVERRUN 4 #endif
13 9 8 14 13 5 5 11 6 5 5 1 4 5 1 4 5 2 2 2 9 2 8 8 8 8 5 3 2 1 3 1 2 5 2 2 3 4 5 14 8 8 12 8 5 12 12 12 3 1 2 2 8 7 8 2 6 2 2 8 8 11 11 2 8 4 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 // SPDX-License-Identifier: GPL-2.0 /* * RTC subsystem, interface functions * * Copyright (C) 2005 Tower Technologies * Author: Alessandro Zummo <a.zummo@towertech.it> * * based on arch/arm/common/rtctime.c */ #include <linux/rtc.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/log2.h> #include <linux/workqueue.h> #define CREATE_TRACE_POINTS #include <trace/events/rtc.h> static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer); static void rtc_add_offset(struct rtc_device *rtc, struct rtc_time *tm) { time64_t secs; if (!rtc->offset_secs) return; secs = rtc_tm_to_time64(tm); /* * Since the reading time values from RTC device are always in the RTC * original valid range, but we need to skip the overlapped region * between expanded range and original range, which is no need to add * the offset. */ if ((rtc->start_secs > rtc->range_min && secs >= rtc->start_secs) || (rtc->start_secs < rtc->range_min && secs <= (rtc->start_secs + rtc->range_max - rtc->range_min))) return; rtc_time64_to_tm(secs + rtc->offset_secs, tm); } static void rtc_subtract_offset(struct rtc_device *rtc, struct rtc_time *tm) { time64_t secs; if (!rtc->offset_secs) return; secs = rtc_tm_to_time64(tm); /* * If the setting time values are in the valid range of RTC hardware * device, then no need to subtract the offset when setting time to RTC * device. Otherwise we need to subtract the offset to make the time * values are valid for RTC hardware device. */ if (secs >= rtc->range_min && secs <= rtc->range_max) return; rtc_time64_to_tm(secs - rtc->offset_secs, tm); } static int rtc_valid_range(struct rtc_device *rtc, struct rtc_time *tm) { if (rtc->range_min != rtc->range_max) { time64_t time = rtc_tm_to_time64(tm); time64_t range_min = rtc->set_start_time ? rtc->start_secs : rtc->range_min; timeu64_t range_max = rtc->set_start_time ? (rtc->start_secs + rtc->range_max - rtc->range_min) : rtc->range_max; if (time < range_min || time > range_max) return -ERANGE; } return 0; } static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) { int err; if (!rtc->ops) { err = -ENODEV; } else if (!rtc->ops->read_time) { err = -EINVAL; } else { memset(tm, 0, sizeof(struct rtc_time)); err = rtc->ops->read_time(rtc->dev.parent, tm); if (err < 0) { dev_dbg(&rtc->dev, "read_time: fail to read: %d\n", err); return err; } rtc_add_offset(rtc, tm); err = rtc_valid_tm(tm); if (err < 0) dev_dbg(&rtc->dev, "read_time: rtc_time isn't valid\n"); } return err; } int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; err = __rtc_read_time(rtc, tm); mutex_unlock(&rtc->ops_lock); trace_rtc_read_time(rtc_tm_to_time64(tm), err); return err; } EXPORT_SYMBOL_GPL(rtc_read_time); int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm) { int err, uie; err = rtc_valid_tm(tm); if (err != 0) return err; err = rtc_valid_range(rtc, tm); if (err) return err; rtc_subtract_offset(rtc, tm); #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL uie = rtc->uie_rtctimer.enabled || rtc->uie_irq_active; #else uie = rtc->uie_rtctimer.enabled; #endif if (uie) { err = rtc_update_irq_enable(rtc, 0); if (err) return err; } err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (!rtc->ops) err = -ENODEV; else if (rtc->ops->set_time) err = rtc->ops->set_time(rtc->dev.parent, tm); else err = -EINVAL; pm_stay_awake(rtc->dev.parent); mutex_unlock(&rtc->ops_lock); /* A timer might have just expired */ schedule_work(&rtc->irqwork); if (uie) { err = rtc_update_irq_enable(rtc, 1); if (err) return err; } trace_rtc_set_time(rtc_tm_to_time64(tm), err); return err; } EXPORT_SYMBOL_GPL(rtc_set_time); static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (!rtc->ops) { err = -ENODEV; } else if (!test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->read_alarm) { err = -EINVAL; } else { alarm->enabled = 0; alarm->pending = 0; alarm->time.tm_sec = -1; alarm->time.tm_min = -1; alarm->time.tm_hour = -1; alarm->time.tm_mday = -1; alarm->time.tm_mon = -1; alarm->time.tm_year = -1; alarm->time.tm_wday = -1; alarm->time.tm_yday = -1; alarm->time.tm_isdst = -1; err = rtc->ops->read_alarm(rtc->dev.parent, alarm); } mutex_unlock(&rtc->ops_lock); trace_rtc_read_alarm(rtc_tm_to_time64(&alarm->time), err); return err; } int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; struct rtc_time before, now; int first_time = 1; time64_t t_now, t_alm; enum { none, day, month, year } missing = none; unsigned int days; /* The lower level RTC driver may return -1 in some fields, * creating invalid alarm->time values, for reasons like: * * - The hardware may not be capable of filling them in; * many alarms match only on time-of-day fields, not * day/month/year calendar data. * * - Some hardware uses illegal values as "wildcard" match * values, which non-Linux firmware (like a BIOS) may try * to set up as e.g. "alarm 15 minutes after each hour". * Linux uses only oneshot alarms. * * When we see that here, we deal with it by using values from * a current RTC timestamp for any missing (-1) values. The * RTC driver prevents "periodic alarm" modes. * * But this can be racey, because some fields of the RTC timestamp * may have wrapped in the interval since we read the RTC alarm, * which would lead to us inserting inconsistent values in place * of the -1 fields. * * Reading the alarm and timestamp in the reverse sequence * would have the same race condition, and not solve the issue. * * So, we must first read the RTC timestamp, * then read the RTC alarm value, * and then read a second RTC timestamp. * * If any fields of the second timestamp have changed * when compared with the first timestamp, then we know * our timestamp may be inconsistent with that used by * the low-level rtc_read_alarm_internal() function. * * So, when the two timestamps disagree, we just loop and do * the process again to get a fully consistent set of values. * * This could all instead be done in the lower level driver, * but since more than one lower level RTC implementation needs it, * then it's probably best to do it here instead of there.. */ /* Get the "before" timestamp */ err = rtc_read_time(rtc, &before); if (err < 0) return err; do { if (!first_time) memcpy(&before, &now, sizeof(struct rtc_time)); first_time = 0; /* get the RTC alarm values, which may be incomplete */ err = rtc_read_alarm_internal(rtc, alarm); if (err) return err; /* full-function RTCs won't have such missing fields */ err = rtc_valid_tm(&alarm->time); if (!err) goto done; /* get the "after" timestamp, to detect wrapped fields */ err = rtc_read_time(rtc, &now); if (err < 0) return err; /* note that tm_sec is a "don't care" value here: */ } while (before.tm_min != now.tm_min || before.tm_hour != now.tm_hour || before.tm_mon != now.tm_mon || before.tm_year != now.tm_year); /* Fill in the missing alarm fields using the timestamp; we * know there's at least one since alarm->time is invalid. */ if (alarm->time.tm_sec == -1) alarm->time.tm_sec = now.tm_sec; if (alarm->time.tm_min == -1) alarm->time.tm_min = now.tm_min; if (alarm->time.tm_hour == -1) alarm->time.tm_hour = now.tm_hour; /* For simplicity, only support date rollover for now */ if (alarm->time.tm_mday < 1 || alarm->time.tm_mday > 31) { alarm->time.tm_mday = now.tm_mday; missing = day; } if ((unsigned int)alarm->time.tm_mon >= 12) { alarm->time.tm_mon = now.tm_mon; if (missing == none) missing = month; } if (alarm->time.tm_year == -1) { alarm->time.tm_year = now.tm_year; if (missing == none) missing = year; } /* Can't proceed if alarm is still invalid after replacing * missing fields. */ err = rtc_valid_tm(&alarm->time); if (err) goto done; /* with luck, no rollover is needed */ t_now = rtc_tm_to_time64(&now); t_alm = rtc_tm_to_time64(&alarm->time); if (t_now < t_alm) goto done; switch (missing) { /* 24 hour rollover ... if it's now 10am Monday, an alarm that * that will trigger at 5am will do so at 5am Tuesday, which * could also be in the next month or year. This is a common * case, especially for PCs. */ case day: dev_dbg(&rtc->dev, "alarm rollover: %s\n", "day"); t_alm += 24 * 60 * 60; rtc_time64_to_tm(t_alm, &alarm->time); break; /* Month rollover ... if it's the 31th, an alarm on the 3rd will * be next month. An alarm matching on the 30th, 29th, or 28th * may end up in the month after that! Many newer PCs support * this type of alarm. */ case month: dev_dbg(&rtc->dev, "alarm rollover: %s\n", "month"); do { if (alarm->time.tm_mon < 11) { alarm->time.tm_mon++; } else { alarm->time.tm_mon = 0; alarm->time.tm_year++; } days = rtc_month_days(alarm->time.tm_mon, alarm->time.tm_year); } while (days < alarm->time.tm_mday); break; /* Year rollover ... easy except for leap years! */ case year: dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year"); do { alarm->time.tm_year++; } while (!is_leap_year(alarm->time.tm_year + 1900) && rtc_valid_tm(&alarm->time) != 0); break; default: dev_warn(&rtc->dev, "alarm rollover not handled\n"); } err = rtc_valid_tm(&alarm->time); done: if (err && alarm->enabled) dev_warn(&rtc->dev, "invalid alarm value: %ptR\n", &alarm->time); else rtc_add_offset(rtc, &alarm->time); return err; } int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (!rtc->ops) { err = -ENODEV; } else if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) { err = -EINVAL; } else { memset(alarm, 0, sizeof(struct rtc_wkalrm)); alarm->enabled = rtc->aie_timer.enabled; alarm->time = rtc_ktime_to_tm(rtc->aie_timer.node.expires); } mutex_unlock(&rtc->ops_lock); trace_rtc_read_alarm(rtc_tm_to_time64(&alarm->time), err); return err; } EXPORT_SYMBOL_GPL(rtc_read_alarm); static int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { struct rtc_time tm; time64_t now, scheduled; int err; err = rtc_valid_tm(&alarm->time); if (err) return err; scheduled = rtc_tm_to_time64(&alarm->time); /* Make sure we're not setting alarms in the past */ err = __rtc_read_time(rtc, &tm); if (err) return err; now = rtc_tm_to_time64(&tm); if (scheduled <= now) return -ETIME; /* * XXX - We just checked to make sure the alarm time is not * in the past, but there is still a race window where if * the is alarm set for the next second and the second ticks * over right here, before we set the alarm. */ rtc_subtract_offset(rtc, &alarm->time); if (!rtc->ops) err = -ENODEV; else if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) err = -EINVAL; else err = rtc->ops->set_alarm(rtc->dev.parent, alarm); trace_rtc_set_alarm(rtc_tm_to_time64(&alarm->time), err); return err; } int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { ktime_t alarm_time; int err; if (!rtc->ops) return -ENODEV; else if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) return -EINVAL; err = rtc_valid_tm(&alarm->time); if (err != 0) return err; err = rtc_valid_range(rtc, &alarm->time); if (err) return err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (rtc->aie_timer.enabled) rtc_timer_remove(rtc, &rtc->aie_timer); alarm_time = rtc_tm_to_ktime(alarm->time); /* * Round down so we never miss a deadline, checking for past deadline is * done in __rtc_set_alarm */ if (test_bit(RTC_FEATURE_ALARM_RES_MINUTE, rtc->features)) alarm_time = ktime_sub_ns(alarm_time, (u64)alarm->time.tm_sec * NSEC_PER_SEC); rtc->aie_timer.node.expires = alarm_time; rtc->aie_timer.period = 0; if (alarm->enabled) err = rtc_timer_enqueue(rtc, &rtc->aie_timer); mutex_unlock(&rtc->ops_lock); return err; } EXPORT_SYMBOL_GPL(rtc_set_alarm); /* Called once per device from rtc_device_register */ int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; struct rtc_time now; err = rtc_valid_tm(&alarm->time); if (err != 0) return err; err = rtc_read_time(rtc, &now); if (err) return err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); rtc->aie_timer.period = 0; /* Alarm has to be enabled & in the future for us to enqueue it */ if (alarm->enabled && (rtc_tm_to_ktime(now) < rtc->aie_timer.node.expires)) { rtc->aie_timer.enabled = 1; timerqueue_add(&rtc->timerqueue, &rtc->aie_timer.node); trace_rtc_timer_enqueue(&rtc->aie_timer); } mutex_unlock(&rtc->ops_lock); return err; } EXPORT_SYMBOL_GPL(rtc_initialize_alarm); int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (rtc->aie_timer.enabled != enabled) { if (enabled) err = rtc_timer_enqueue(rtc, &rtc->aie_timer); else rtc_timer_remove(rtc, &rtc->aie_timer); } if (err) /* nothing */; else if (!rtc->ops) err = -ENODEV; else if (!test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->alarm_irq_enable) err = -EINVAL; else err = rtc->ops->alarm_irq_enable(rtc->dev.parent, enabled); mutex_unlock(&rtc->ops_lock); trace_rtc_alarm_irq_enable(enabled, err); return err; } EXPORT_SYMBOL_GPL(rtc_alarm_irq_enable); int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL if (enabled == 0 && rtc->uie_irq_active) { mutex_unlock(&rtc->ops_lock); return rtc_dev_update_irq_enable_emul(rtc, 0); } #endif /* make sure we're changing state */ if (rtc->uie_rtctimer.enabled == enabled) goto out; if (!test_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc->features) || !test_bit(RTC_FEATURE_ALARM, rtc->features)) { mutex_unlock(&rtc->ops_lock); #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL return rtc_dev_update_irq_enable_emul(rtc, enabled); #else return -EINVAL; #endif } if (enabled) { struct rtc_time tm; ktime_t now, onesec; err = __rtc_read_time(rtc, &tm); if (err) goto out; onesec = ktime_set(1, 0); now = rtc_tm_to_ktime(tm); rtc->uie_rtctimer.node.expires = ktime_add(now, onesec); rtc->uie_rtctimer.period = ktime_set(1, 0); err = rtc_timer_enqueue(rtc, &rtc->uie_rtctimer); } else { rtc_timer_remove(rtc, &rtc->uie_rtctimer); } out: mutex_unlock(&rtc->ops_lock); return err; } EXPORT_SYMBOL_GPL(rtc_update_irq_enable); /** * rtc_handle_legacy_irq - AIE, UIE and PIE event hook * @rtc: pointer to the rtc device * @num: number of occurence of the event * @mode: type of the event, RTC_AF, RTC_UF of RTC_PF * * This function is called when an AIE, UIE or PIE mode interrupt * has occurred (or been emulated). * */ void rtc_handle_legacy_irq(struct rtc_device *rtc, int num, int mode) { unsigned long flags; /* mark one irq of the appropriate mode */ spin_lock_irqsave(&rtc->irq_lock, flags); rtc->irq_data = (rtc->irq_data + (num << 8)) | (RTC_IRQF | mode); spin_unlock_irqrestore(&rtc->irq_lock, flags); wake_up_interruptible(&rtc->irq_queue); kill_fasync(&rtc->async_queue, SIGIO, POLL_IN); } /** * rtc_aie_update_irq - AIE mode rtctimer hook * @rtc: pointer to the rtc_device * * This functions is called when the aie_timer expires. */ void rtc_aie_update_irq(struct rtc_device *rtc) { rtc_handle_legacy_irq(rtc, 1, RTC_AF); } /** * rtc_uie_update_irq - UIE mode rtctimer hook * @rtc: pointer to the rtc_device * * This functions is called when the uie_timer expires. */ void rtc_uie_update_irq(struct rtc_device *rtc) { rtc_handle_legacy_irq(rtc, 1, RTC_UF); } /** * rtc_pie_update_irq - PIE mode hrtimer hook * @timer: pointer to the pie mode hrtimer * * This function is used to emulate PIE mode interrupts * using an hrtimer. This function is called when the periodic * hrtimer expires. */ enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer) { struct rtc_device *rtc; ktime_t period; u64 count; rtc = container_of(timer, struct rtc_device, pie_timer); period = NSEC_PER_SEC / rtc->irq_freq; count = hrtimer_forward_now(timer, period); rtc_handle_legacy_irq(rtc, count, RTC_PF); return HRTIMER_RESTART; } /** * rtc_update_irq - Triggered when a RTC interrupt occurs. * @rtc: the rtc device * @num: how many irqs are being reported (usually one) * @events: mask of RTC_IRQF with one or more of RTC_PF, RTC_AF, RTC_UF * Context: any */ void rtc_update_irq(struct rtc_device *rtc, unsigned long num, unsigned long events) { if (IS_ERR_OR_NULL(rtc)) return; pm_stay_awake(rtc->dev.parent); schedule_work(&rtc->irqwork); } EXPORT_SYMBOL_GPL(rtc_update_irq); struct rtc_device *rtc_class_open(const char *name) { struct device *dev; struct rtc_device *rtc = NULL; dev = class_find_device_by_name(&rtc_class, name); if (dev) rtc = to_rtc_device(dev); if (rtc) { if (!try_module_get(rtc->owner)) { put_device(dev); rtc = NULL; } } return rtc; } EXPORT_SYMBOL_GPL(rtc_class_open); void rtc_class_close(struct rtc_device *rtc) { module_put(rtc->owner); put_device(&rtc->dev); } EXPORT_SYMBOL_GPL(rtc_class_close); static int rtc_update_hrtimer(struct rtc_device *rtc, int enabled) { /* * We always cancel the timer here first, because otherwise * we could run into BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); * when we manage to start the timer before the callback * returns HRTIMER_RESTART. * * We cannot use hrtimer_cancel() here as a running callback * could be blocked on rtc->irq_task_lock and hrtimer_cancel() * would spin forever. */ if (hrtimer_try_to_cancel(&rtc->pie_timer) < 0) return -1; if (enabled) { ktime_t period = NSEC_PER_SEC / rtc->irq_freq; hrtimer_start(&rtc->pie_timer, period, HRTIMER_MODE_REL); } return 0; } /** * rtc_irq_set_state - enable/disable 2^N Hz periodic IRQs * @rtc: the rtc device * @enabled: true to enable periodic IRQs * Context: any * * Note that rtc_irq_set_freq() should previously have been used to * specify the desired frequency of periodic IRQ. */ int rtc_irq_set_state(struct rtc_device *rtc, int enabled) { int err = 0; while (rtc_update_hrtimer(rtc, enabled) < 0) cpu_relax(); rtc->pie_enabled = enabled; trace_rtc_irq_set_state(enabled, err); return err; } /** * rtc_irq_set_freq - set 2^N Hz periodic IRQ frequency for IRQ * @rtc: the rtc device * @freq: positive frequency * Context: any * * Note that rtc_irq_set_state() is used to enable or disable the * periodic IRQs. */ int rtc_irq_set_freq(struct rtc_device *rtc, int freq) { int err = 0; if (freq <= 0 || freq > RTC_MAX_FREQ) return -EINVAL; rtc->irq_freq = freq; while (rtc->pie_enabled && rtc_update_hrtimer(rtc, 1) < 0) cpu_relax(); trace_rtc_irq_set_freq(freq, err); return err; } /** * rtc_timer_enqueue - Adds a rtc_timer to the rtc_device timerqueue * @rtc: rtc device * @timer: timer being added. * * Enqueues a timer onto the rtc devices timerqueue and sets * the next alarm event appropriately. * * Sets the enabled bit on the added timer. * * Must hold ops_lock for proper serialization of timerqueue */ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) { struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); struct rtc_time tm; ktime_t now; int err; err = __rtc_read_time(rtc, &tm); if (err) return err; timer->enabled = 1; now = rtc_tm_to_ktime(tm); /* Skip over expired timers */ while (next) { if (next->expires >= now) break; next = timerqueue_iterate_next(next); } timerqueue_add(&rtc->timerqueue, &timer->node); trace_rtc_timer_enqueue(timer); if (!next || ktime_before(timer->node.expires, next->expires)) { struct rtc_wkalrm alarm; alarm.time = rtc_ktime_to_tm(timer->node.expires); alarm.enabled = 1; err = __rtc_set_alarm(rtc, &alarm); if (err == -ETIME) { pm_stay_awake(rtc->dev.parent); schedule_work(&rtc->irqwork); } else if (err) { timerqueue_del(&rtc->timerqueue, &timer->node); trace_rtc_timer_dequeue(timer); timer->enabled = 0; return err; } } return 0; } static void rtc_alarm_disable(struct rtc_device *rtc) { if (!rtc->ops || !test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->alarm_irq_enable) return; rtc->ops->alarm_irq_enable(rtc->dev.parent, false); trace_rtc_alarm_irq_enable(0, 0); } /** * rtc_timer_remove - Removes a rtc_timer from the rtc_device timerqueue * @rtc: rtc device * @timer: timer being removed. * * Removes a timer onto the rtc devices timerqueue and sets * the next alarm event appropriately. * * Clears the enabled bit on the removed timer. * * Must hold ops_lock for proper serialization of timerqueue */ static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) { struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); timerqueue_del(&rtc->timerqueue, &timer->node); trace_rtc_timer_dequeue(timer); timer->enabled = 0; if (next == &timer->node) { struct rtc_wkalrm alarm; int err; next = timerqueue_getnext(&rtc->timerqueue); if (!next) { rtc_alarm_disable(rtc); return; } alarm.time = rtc_ktime_to_tm(next->expires); alarm.enabled = 1; err = __rtc_set_alarm(rtc, &alarm); if (err == -ETIME) { pm_stay_awake(rtc->dev.parent); schedule_work(&rtc->irqwork); } } } /** * rtc_timer_do_work - Expires rtc timers * @work: work item * * Expires rtc timers. Reprograms next alarm event if needed. * Called via worktask. * * Serializes access to timerqueue via ops_lock mutex */ void rtc_timer_do_work(struct work_struct *work) { struct rtc_timer *timer; struct timerqueue_node *next; ktime_t now; struct rtc_time tm; struct rtc_device *rtc = container_of(work, struct rtc_device, irqwork); mutex_lock(&rtc->ops_lock); again: __rtc_read_time(rtc, &tm); now = rtc_tm_to_ktime(tm); while ((next = timerqueue_getnext(&rtc->timerqueue))) { if (next->expires > now) break; /* expire timer */ timer = container_of(next, struct rtc_timer, node); timerqueue_del(&rtc->timerqueue, &timer->node); trace_rtc_timer_dequeue(timer); timer->enabled = 0; if (timer->func) timer->func(timer->rtc); trace_rtc_timer_fired(timer); /* Re-add/fwd periodic timers */ if (ktime_to_ns(timer->period)) { timer->node.expires = ktime_add(timer->node.expires, timer->period); timer->enabled = 1; timerqueue_add(&rtc->timerqueue, &timer->node); trace_rtc_timer_enqueue(timer); } } /* Set next alarm */ if (next) { struct rtc_wkalrm alarm; int err; int retry = 3; alarm.time = rtc_ktime_to_tm(next->expires); alarm.enabled = 1; reprogram: err = __rtc_set_alarm(rtc, &alarm); if (err == -ETIME) { goto again; } else if (err) { if (retry-- > 0) goto reprogram; timer = container_of(next, struct rtc_timer, node); timerqueue_del(&rtc->timerqueue, &timer->node); trace_rtc_timer_dequeue(timer); timer->enabled = 0; dev_err(&rtc->dev, "__rtc_set_alarm: err=%d\n", err); goto again; } } else { rtc_alarm_disable(rtc); } pm_relax(rtc->dev.parent); mutex_unlock(&rtc->ops_lock); } /* rtc_timer_init - Initializes an rtc_timer * @timer: timer to be intiialized * @f: function pointer to be called when timer fires * @rtc: pointer to the rtc_device * * Kernel interface to initializing an rtc_timer. */ void rtc_timer_init(struct rtc_timer *timer, void (*f)(struct rtc_device *r), struct rtc_device *rtc) { timerqueue_init(&timer->node); timer->enabled = 0; timer->func = f; timer->rtc = rtc; } /* rtc_timer_start - Sets an rtc_timer to fire in the future * @ rtc: rtc device to be used * @ timer: timer being set * @ expires: time at which to expire the timer * @ period: period that the timer will recur * * Kernel interface to set an rtc_timer */ int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer, ktime_t expires, ktime_t period) { int ret = 0; mutex_lock(&rtc->ops_lock); if (timer->enabled) rtc_timer_remove(rtc, timer); timer->node.expires = expires; timer->period = period; ret = rtc_timer_enqueue(rtc, timer); mutex_unlock(&rtc->ops_lock); return ret; } /* rtc_timer_cancel - Stops an rtc_timer * @ rtc: rtc device to be used * @ timer: timer being set * * Kernel interface to cancel an rtc_timer */ void rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer) { mutex_lock(&rtc->ops_lock); if (timer->enabled) rtc_timer_remove(rtc, timer); mutex_unlock(&rtc->ops_lock); } /** * rtc_read_offset - Read the amount of rtc offset in parts per billion * @rtc: rtc device to be used * @offset: the offset in parts per billion * * see below for details. * * Kernel interface to read rtc clock offset * Returns 0 on success, or a negative number on error. * If read_offset() is not implemented for the rtc, return -EINVAL */ int rtc_read_offset(struct rtc_device *rtc, long *offset) { int ret; if (!rtc->ops) return -ENODEV; if (!rtc->ops->read_offset) return -EINVAL; mutex_lock(&rtc->ops_lock); ret = rtc->ops->read_offset(rtc->dev.parent, offset); mutex_unlock(&rtc->ops_lock); trace_rtc_read_offset(*offset, ret); return ret; } /** * rtc_set_offset - Adjusts the duration of the average second * @rtc: rtc device to be used * @offset: the offset in parts per billion * * Some rtc's allow an adjustment to the average duration of a second * to compensate for differences in the actual clock rate due to temperature, * the crystal, capacitor, etc. * * The adjustment applied is as follows: * t = t0 * (1 + offset * 1e-9) * where t0 is the measured length of 1 RTC second with offset = 0 * * Kernel interface to adjust an rtc clock offset. * Return 0 on success, or a negative number on error. * If the rtc offset is not setable (or not implemented), return -EINVAL */ int rtc_set_offset(struct rtc_device *rtc, long offset) { int ret; if (!rtc->ops) return -ENODEV; if (!rtc->ops->set_offset) return -EINVAL; mutex_lock(&rtc->ops_lock); ret = rtc->ops->set_offset(rtc->dev.parent, offset); mutex_unlock(&rtc->ops_lock); trace_rtc_set_offset(offset, ret); return ret; }
1103 444 663 662 508 508 916 28 1977 1959 1100 509 916 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018, Intel Corporation. */ /* A common module to handle registrations and notifications for paravirtual * drivers to enable accelerated datapath and support VF live migration. * * The notifier and event handling code is based on netvsc driver. */ #include <linux/module.h> #include <linux/etherdevice.h> #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/failover.h> static LIST_HEAD(failover_list); static DEFINE_SPINLOCK(failover_lock); static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) { struct net_device *failover_dev; struct failover *failover; spin_lock(&failover_lock); list_for_each_entry(failover, &failover_list, list) { failover_dev = rtnl_dereference(failover->failover_dev); if (ether_addr_equal(failover_dev->perm_addr, mac)) { *ops = rtnl_dereference(failover->ops); spin_unlock(&failover_lock); return failover_dev; } } spin_unlock(&failover_lock); return NULL; } /** * failover_slave_register - Register a slave netdev * * @slave_dev: slave netdev that is being registered * * Registers a slave device to a failover instance. Only ethernet devices * are supported. */ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: return failover_slave_link_change(event_dev); case NETDEV_CHANGENAME: return failover_slave_name_change(event_dev); default: return NOTIFY_DONE; } } static struct notifier_block failover_notifier = { .notifier_call = failover_event, }; static void failover_existing_slave_register(struct net_device *failover_dev) { struct net *net = dev_net(failover_dev); struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) failover_slave_register(dev); } rtnl_unlock(); } /** * failover_register - Register a failover instance * * @dev: failover netdev * @ops: failover ops * * Allocate and register a failover instance for a failover netdev. ops * provides handlers for slave device register/unregister/link change/ * name change events. * * Return: pointer to failover instance */ struct failover *failover_register(struct net_device *dev, struct failover_ops *ops) { struct failover *failover; if (dev->type != ARPHRD_ETHER) return ERR_PTR(-EINVAL); failover = kzalloc(sizeof(*failover), GFP_KERNEL); if (!failover) return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); spin_lock(&failover_lock); list_add_tail(&failover->list, &failover_list); spin_unlock(&failover_lock); netdev_info(dev, "failover master:%s registered\n", dev->name); failover_existing_slave_register(dev); return failover; } EXPORT_SYMBOL_GPL(failover_register); /** * failover_unregister - Unregister a failover instance * * @failover: pointer to failover instance * * Unregisters and frees a failover instance. */ void failover_unregister(struct failover *failover) { struct net_device *failover_dev; failover_dev = rcu_dereference(failover->failover_dev); netdev_info(failover_dev, "failover master:%s unregistered\n", failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; netdev_put(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); spin_unlock(&failover_lock); kfree(failover); } EXPORT_SYMBOL_GPL(failover_unregister); static __init int failover_init(void) { register_netdevice_notifier(&failover_notifier); return 0; } module_init(failover_init); static __exit void failover_exit(void) { unregister_netdevice_notifier(&failover_notifier); } module_exit(failover_exit); MODULE_DESCRIPTION("Generic failover infrastructure/interface"); MODULE_LICENSE("GPL v2");
135 17 121 121 138 54 63 16 3 63 71 78 11 3 76 62 139 139 53 52 192 102 26 37 203 119 42 191 85 128 192 85 127 61 70 189 82 123 31 31 31 31 1 189 174 22 84 84 113 113 185 8 184 9 158 21 22 40 72 1 128 7 184 62 39 25 62 39 39 39 13 39 3 37 25 25 25 7 7 7 15 13 2 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 /* * PCM Plug-In shared (kernel/library) code * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz> * Copyright (c) 2000 by Abramo Bagnara <abramo@alsa-project.org> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #if 0 #define PLUGIN_DEBUG #endif #include <linux/slab.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <sound/core.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include "pcm_plugin.h" #define snd_pcm_plug_first(plug) ((plug)->runtime->oss.plugin_first) #define snd_pcm_plug_last(plug) ((plug)->runtime->oss.plugin_last) /* * because some cards might have rates "very close", we ignore * all "resampling" requests within +-5% */ static int rate_match(unsigned int src_rate, unsigned int dst_rate) { unsigned int low = (src_rate * 95) / 100; unsigned int high = (src_rate * 105) / 100; return dst_rate >= low && dst_rate <= high; } static int snd_pcm_plugin_alloc(struct snd_pcm_plugin *plugin, snd_pcm_uframes_t frames) { struct snd_pcm_plugin_format *format; ssize_t width; size_t size; unsigned int channel; struct snd_pcm_plugin_channel *c; if (plugin->stream == SNDRV_PCM_STREAM_PLAYBACK) { format = &plugin->src_format; } else { format = &plugin->dst_format; } width = snd_pcm_format_physical_width(format->format); if (width < 0) return width; size = array3_size(frames, format->channels, width); /* check for too large period size once again */ if (size > 1024 * 1024) return -ENOMEM; if (snd_BUG_ON(size % 8)) return -ENXIO; size /= 8; if (plugin->buf_frames < frames) { kvfree(plugin->buf); plugin->buf = kvzalloc(size, GFP_KERNEL); plugin->buf_frames = frames; } if (!plugin->buf) { plugin->buf_frames = 0; return -ENOMEM; } c = plugin->buf_channels; if (plugin->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED) { for (channel = 0; channel < format->channels; channel++, c++) { c->frames = frames; c->enabled = 1; c->wanted = 0; c->area.addr = plugin->buf; c->area.first = channel * width; c->area.step = format->channels * width; } } else if (plugin->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED) { if (snd_BUG_ON(size % format->channels)) return -EINVAL; size /= format->channels; for (channel = 0; channel < format->channels; channel++, c++) { c->frames = frames; c->enabled = 1; c->wanted = 0; c->area.addr = plugin->buf + (channel * size); c->area.first = 0; c->area.step = width; } } else return -EINVAL; return 0; } int snd_pcm_plug_alloc(struct snd_pcm_substream *plug, snd_pcm_uframes_t frames) { int err; if (snd_BUG_ON(!snd_pcm_plug_first(plug))) return -ENXIO; if (snd_pcm_plug_stream(plug) == SNDRV_PCM_STREAM_PLAYBACK) { struct snd_pcm_plugin *plugin = snd_pcm_plug_first(plug); while (plugin->next) { if (plugin->dst_frames) frames = plugin->dst_frames(plugin, frames); if ((snd_pcm_sframes_t)frames <= 0) return -ENXIO; plugin = plugin->next; err = snd_pcm_plugin_alloc(plugin, frames); if (err < 0) return err; } } else { struct snd_pcm_plugin *plugin = snd_pcm_plug_last(plug); while (plugin->prev) { if (plugin->src_frames) frames = plugin->src_frames(plugin, frames); if ((snd_pcm_sframes_t)frames <= 0) return -ENXIO; plugin = plugin->prev; err = snd_pcm_plugin_alloc(plugin, frames); if (err < 0) return err; } } return 0; } snd_pcm_sframes_t snd_pcm_plugin_client_channels(struct snd_pcm_plugin *plugin, snd_pcm_uframes_t frames, struct snd_pcm_plugin_channel **channels) { *channels = plugin->buf_channels; return frames; } int snd_pcm_plugin_build(struct snd_pcm_substream *plug, const char *name, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, size_t extra, struct snd_pcm_plugin **ret) { struct snd_pcm_plugin *plugin; unsigned int channels; if (snd_BUG_ON(!plug)) return -ENXIO; if (snd_BUG_ON(!src_format || !dst_format)) return -ENXIO; plugin = kzalloc(sizeof(*plugin) + extra, GFP_KERNEL); if (plugin == NULL) return -ENOMEM; plugin->name = name; plugin->plug = plug; plugin->stream = snd_pcm_plug_stream(plug); plugin->access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; plugin->src_format = *src_format; plugin->src_width = snd_pcm_format_physical_width(src_format->format); snd_BUG_ON(plugin->src_width <= 0); plugin->dst_format = *dst_format; plugin->dst_width = snd_pcm_format_physical_width(dst_format->format); snd_BUG_ON(plugin->dst_width <= 0); if (plugin->stream == SNDRV_PCM_STREAM_PLAYBACK) channels = src_format->channels; else channels = dst_format->channels; plugin->buf_channels = kcalloc(channels, sizeof(*plugin->buf_channels), GFP_KERNEL); if (plugin->buf_channels == NULL) { snd_pcm_plugin_free(plugin); return -ENOMEM; } plugin->client_channels = snd_pcm_plugin_client_channels; *ret = plugin; return 0; } int snd_pcm_plugin_free(struct snd_pcm_plugin *plugin) { if (! plugin) return 0; if (plugin->private_free) plugin->private_free(plugin); kfree(plugin->buf_channels); kvfree(plugin->buf); kfree(plugin); return 0; } static snd_pcm_sframes_t calc_dst_frames(struct snd_pcm_substream *plug, snd_pcm_sframes_t frames, bool check_size) { struct snd_pcm_plugin *plugin, *plugin_next; plugin = snd_pcm_plug_first(plug); while (plugin && frames > 0) { plugin_next = plugin->next; if (check_size && plugin->buf_frames && frames > plugin->buf_frames) frames = plugin->buf_frames; if (plugin->dst_frames) { frames = plugin->dst_frames(plugin, frames); if (frames < 0) return frames; } plugin = plugin_next; } return frames; } static snd_pcm_sframes_t calc_src_frames(struct snd_pcm_substream *plug, snd_pcm_sframes_t frames, bool check_size) { struct snd_pcm_plugin *plugin, *plugin_prev; plugin = snd_pcm_plug_last(plug); while (plugin && frames > 0) { plugin_prev = plugin->prev; if (plugin->src_frames) { frames = plugin->src_frames(plugin, frames); if (frames < 0) return frames; } if (check_size && plugin->buf_frames && frames > plugin->buf_frames) frames = plugin->buf_frames; plugin = plugin_prev; } return frames; } snd_pcm_sframes_t snd_pcm_plug_client_size(struct snd_pcm_substream *plug, snd_pcm_uframes_t drv_frames) { if (snd_BUG_ON(!plug)) return -ENXIO; switch (snd_pcm_plug_stream(plug)) { case SNDRV_PCM_STREAM_PLAYBACK: return calc_src_frames(plug, drv_frames, false); case SNDRV_PCM_STREAM_CAPTURE: return calc_dst_frames(plug, drv_frames, false); default: snd_BUG(); return -EINVAL; } } snd_pcm_sframes_t snd_pcm_plug_slave_size(struct snd_pcm_substream *plug, snd_pcm_uframes_t clt_frames) { if (snd_BUG_ON(!plug)) return -ENXIO; switch (snd_pcm_plug_stream(plug)) { case SNDRV_PCM_STREAM_PLAYBACK: return calc_dst_frames(plug, clt_frames, false); case SNDRV_PCM_STREAM_CAPTURE: return calc_src_frames(plug, clt_frames, false); default: snd_BUG(); return -EINVAL; } } static int snd_pcm_plug_formats(const struct snd_mask *mask, snd_pcm_format_t format) { struct snd_mask formats = *mask; u64 linfmts = (SNDRV_PCM_FMTBIT_U8 | SNDRV_PCM_FMTBIT_S8 | SNDRV_PCM_FMTBIT_U16_LE | SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_U16_BE | SNDRV_PCM_FMTBIT_S16_BE | SNDRV_PCM_FMTBIT_U24_LE | SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_U24_BE | SNDRV_PCM_FMTBIT_S24_BE | SNDRV_PCM_FMTBIT_U24_3LE | SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_U24_3BE | SNDRV_PCM_FMTBIT_S24_3BE | SNDRV_PCM_FMTBIT_U32_LE | SNDRV_PCM_FMTBIT_S32_LE | SNDRV_PCM_FMTBIT_U32_BE | SNDRV_PCM_FMTBIT_S32_BE); snd_mask_set(&formats, (__force int)SNDRV_PCM_FORMAT_MU_LAW); if (formats.bits[0] & lower_32_bits(linfmts)) formats.bits[0] |= lower_32_bits(linfmts); if (formats.bits[1] & upper_32_bits(linfmts)) formats.bits[1] |= upper_32_bits(linfmts); return snd_mask_test(&formats, (__force int)format); } static const snd_pcm_format_t preferred_formats[] = { SNDRV_PCM_FORMAT_S16_LE, SNDRV_PCM_FORMAT_S16_BE, SNDRV_PCM_FORMAT_U16_LE, SNDRV_PCM_FORMAT_U16_BE, SNDRV_PCM_FORMAT_S24_3LE, SNDRV_PCM_FORMAT_S24_3BE, SNDRV_PCM_FORMAT_U24_3LE, SNDRV_PCM_FORMAT_U24_3BE, SNDRV_PCM_FORMAT_S24_LE, SNDRV_PCM_FORMAT_S24_BE, SNDRV_PCM_FORMAT_U24_LE, SNDRV_PCM_FORMAT_U24_BE, SNDRV_PCM_FORMAT_S32_LE, SNDRV_PCM_FORMAT_S32_BE, SNDRV_PCM_FORMAT_U32_LE, SNDRV_PCM_FORMAT_U32_BE, SNDRV_PCM_FORMAT_S8, SNDRV_PCM_FORMAT_U8 }; snd_pcm_format_t snd_pcm_plug_slave_format(snd_pcm_format_t format, const struct snd_mask *format_mask) { int i; if (snd_mask_test(format_mask, (__force int)format)) return format; if (!snd_pcm_plug_formats(format_mask, format)) return (__force snd_pcm_format_t)-EINVAL; if (snd_pcm_format_linear(format)) { unsigned int width = snd_pcm_format_width(format); int unsignd = snd_pcm_format_unsigned(format) > 0; int big = snd_pcm_format_big_endian(format) > 0; unsigned int badness, best = -1; snd_pcm_format_t best_format = (__force snd_pcm_format_t)-1; for (i = 0; i < ARRAY_SIZE(preferred_formats); i++) { snd_pcm_format_t f = preferred_formats[i]; unsigned int w; if (!snd_mask_test(format_mask, (__force int)f)) continue; w = snd_pcm_format_width(f); if (w >= width) badness = w - width; else badness = width - w + 32; badness += snd_pcm_format_unsigned(f) != unsignd; badness += snd_pcm_format_big_endian(f) != big; if (badness < best) { best_format = f; best = badness; } } if ((__force int)best_format >= 0) return best_format; else return (__force snd_pcm_format_t)-EINVAL; } else { switch (format) { case SNDRV_PCM_FORMAT_MU_LAW: for (i = 0; i < ARRAY_SIZE(preferred_formats); ++i) { snd_pcm_format_t format1 = preferred_formats[i]; if (snd_mask_test(format_mask, (__force int)format1)) return format1; } fallthrough; default: return (__force snd_pcm_format_t)-EINVAL; } } } int snd_pcm_plug_format_plugins(struct snd_pcm_substream *plug, struct snd_pcm_hw_params *params, struct snd_pcm_hw_params *slave_params) { struct snd_pcm_plugin_format tmpformat; struct snd_pcm_plugin_format dstformat; struct snd_pcm_plugin_format srcformat; snd_pcm_access_t src_access, dst_access; struct snd_pcm_plugin *plugin = NULL; int err; int stream = snd_pcm_plug_stream(plug); int slave_interleaved = (params_channels(slave_params) == 1 || params_access(slave_params) == SNDRV_PCM_ACCESS_RW_INTERLEAVED); switch (stream) { case SNDRV_PCM_STREAM_PLAYBACK: dstformat.format = params_format(slave_params); dstformat.rate = params_rate(slave_params); dstformat.channels = params_channels(slave_params); srcformat.format = params_format(params); srcformat.rate = params_rate(params); srcformat.channels = params_channels(params); src_access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; dst_access = (slave_interleaved ? SNDRV_PCM_ACCESS_RW_INTERLEAVED : SNDRV_PCM_ACCESS_RW_NONINTERLEAVED); break; case SNDRV_PCM_STREAM_CAPTURE: dstformat.format = params_format(params); dstformat.rate = params_rate(params); dstformat.channels = params_channels(params); srcformat.format = params_format(slave_params); srcformat.rate = params_rate(slave_params); srcformat.channels = params_channels(slave_params); src_access = (slave_interleaved ? SNDRV_PCM_ACCESS_RW_INTERLEAVED : SNDRV_PCM_ACCESS_RW_NONINTERLEAVED); dst_access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; break; default: snd_BUG(); return -EINVAL; } tmpformat = srcformat; pdprintf("srcformat: format=%i, rate=%i, channels=%i\n", srcformat.format, srcformat.rate, srcformat.channels); pdprintf("dstformat: format=%i, rate=%i, channels=%i\n", dstformat.format, dstformat.rate, dstformat.channels); /* Format change (linearization) */ if (! rate_match(srcformat.rate, dstformat.rate) && ! snd_pcm_format_linear(srcformat.format)) { if (srcformat.format != SNDRV_PCM_FORMAT_MU_LAW) return -EINVAL; tmpformat.format = SNDRV_PCM_FORMAT_S16; err = snd_pcm_plugin_build_mulaw(plug, &srcformat, &tmpformat, &plugin); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* channels reduction */ if (srcformat.channels > dstformat.channels) { tmpformat.channels = dstformat.channels; err = snd_pcm_plugin_build_route(plug, &srcformat, &tmpformat, &plugin); pdprintf("channels reduction: src=%i, dst=%i returns %i\n", srcformat.channels, tmpformat.channels, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* rate resampling */ if (!rate_match(srcformat.rate, dstformat.rate)) { if (srcformat.format != SNDRV_PCM_FORMAT_S16) { /* convert to S16 for resampling */ tmpformat.format = SNDRV_PCM_FORMAT_S16; err = snd_pcm_plugin_build_linear(plug, &srcformat, &tmpformat, &plugin); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } tmpformat.rate = dstformat.rate; err = snd_pcm_plugin_build_rate(plug, &srcformat, &tmpformat, &plugin); pdprintf("rate down resampling: src=%i, dst=%i returns %i\n", srcformat.rate, tmpformat.rate, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* format change */ if (srcformat.format != dstformat.format) { tmpformat.format = dstformat.format; if (srcformat.format == SNDRV_PCM_FORMAT_MU_LAW || tmpformat.format == SNDRV_PCM_FORMAT_MU_LAW) { err = snd_pcm_plugin_build_mulaw(plug, &srcformat, &tmpformat, &plugin); } else if (snd_pcm_format_linear(srcformat.format) && snd_pcm_format_linear(tmpformat.format)) { err = snd_pcm_plugin_build_linear(plug, &srcformat, &tmpformat, &plugin); } else return -EINVAL; pdprintf("format change: src=%i, dst=%i returns %i\n", srcformat.format, tmpformat.format, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* channels extension */ if (srcformat.channels < dstformat.channels) { tmpformat.channels = dstformat.channels; err = snd_pcm_plugin_build_route(plug, &srcformat, &tmpformat, &plugin); pdprintf("channels extension: src=%i, dst=%i returns %i\n", srcformat.channels, tmpformat.channels, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* de-interleave */ if (src_access != dst_access) { err = snd_pcm_plugin_build_copy(plug, &srcformat, &tmpformat, &plugin); pdprintf("interleave change (copy: returns %i)\n", err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } } return 0; } snd_pcm_sframes_t snd_pcm_plug_client_channels_buf(struct snd_pcm_substream *plug, char *buf, snd_pcm_uframes_t count, struct snd_pcm_plugin_channel **channels) { struct snd_pcm_plugin *plugin; struct snd_pcm_plugin_channel *v; struct snd_pcm_plugin_format *format; int width, nchannels, channel; int stream = snd_pcm_plug_stream(plug); if (snd_BUG_ON(!buf)) return -ENXIO; if (stream == SNDRV_PCM_STREAM_PLAYBACK) { plugin = snd_pcm_plug_first(plug); format = &plugin->src_format; } else { plugin = snd_pcm_plug_last(plug); format = &plugin->dst_format; } v = plugin->buf_channels; *channels = v; width = snd_pcm_format_physical_width(format->format); if (width < 0) return width; nchannels = format->channels; if (snd_BUG_ON(plugin->access != SNDRV_PCM_ACCESS_RW_INTERLEAVED && format->channels > 1)) return -ENXIO; for (channel = 0; channel < nchannels; channel++, v++) { v->frames = count; v->enabled = 1; v->wanted = (stream == SNDRV_PCM_STREAM_CAPTURE); v->area.addr = buf; v->area.first = channel * width; v->area.step = nchannels * width; } return count; } snd_pcm_sframes_t snd_pcm_plug_write_transfer(struct snd_pcm_substream *plug, struct snd_pcm_plugin_channel *src_channels, snd_pcm_uframes_t size) { struct snd_pcm_plugin *plugin, *next; struct snd_pcm_plugin_channel *dst_channels; int err; snd_pcm_sframes_t frames = size; plugin = snd_pcm_plug_first(plug); while (plugin) { if (frames <= 0) return frames; next = plugin->next; if (next) { snd_pcm_sframes_t frames1 = frames; if (plugin->dst_frames) { frames1 = plugin->dst_frames(plugin, frames); if (frames1 <= 0) return frames1; } err = next->client_channels(next, frames1, &dst_channels); if (err < 0) return err; if (err != frames1) { frames = err; if (plugin->src_frames) { frames = plugin->src_frames(plugin, frames1); if (frames <= 0) return frames; } } } else dst_channels = NULL; pdprintf("write plugin: %s, %li\n", plugin->name, frames); frames = plugin->transfer(plugin, src_channels, dst_channels, frames); if (frames < 0) return frames; src_channels = dst_channels; plugin = next; } return calc_src_frames(plug, frames, true); } snd_pcm_sframes_t snd_pcm_plug_read_transfer(struct snd_pcm_substream *plug, struct snd_pcm_plugin_channel *dst_channels_final, snd_pcm_uframes_t size) { struct snd_pcm_plugin *plugin, *next; struct snd_pcm_plugin_channel *src_channels, *dst_channels; snd_pcm_sframes_t frames = size; int err; frames = calc_src_frames(plug, frames, true); if (frames < 0) return frames; src_channels = NULL; plugin = snd_pcm_plug_first(plug); while (plugin && frames > 0) { next = plugin->next; if (next) { err = plugin->client_channels(plugin, frames, &dst_channels); if (err < 0) return err; frames = err; } else { dst_channels = dst_channels_final; } pdprintf("read plugin: %s, %li\n", plugin->name, frames); frames = plugin->transfer(plugin, src_channels, dst_channels, frames); if (frames < 0) return frames; plugin = next; src_channels = dst_channels; } return frames; } int snd_pcm_area_silence(const struct snd_pcm_channel_area *dst_area, size_t dst_offset, size_t samples, snd_pcm_format_t format) { /* FIXME: sub byte resolution and odd dst_offset */ unsigned char *dst; unsigned int dst_step; int width; const unsigned char *silence; if (!dst_area->addr) return 0; dst = dst_area->addr + (dst_area->first + dst_area->step * dst_offset) / 8; width = snd_pcm_format_physical_width(format); if (width <= 0) return -EINVAL; if (dst_area->step == (unsigned int) width && width >= 8) return snd_pcm_format_set_silence(format, dst, samples); silence = snd_pcm_format_silence_64(format); if (! silence) return -EINVAL; dst_step = dst_area->step / 8; if (width == 4) { /* Ima ADPCM */ int dstbit = dst_area->first % 8; int dstbit_step = dst_area->step % 8; while (samples-- > 0) { if (dstbit) *dst &= 0xf0; else *dst &= 0x0f; dst += dst_step; dstbit += dstbit_step; if (dstbit == 8) { dst++; dstbit = 0; } } } else { width /= 8; while (samples-- > 0) { memcpy(dst, silence, width); dst += dst_step; } } return 0; } int snd_pcm_area_copy(const struct snd_pcm_channel_area *src_area, size_t src_offset, const struct snd_pcm_channel_area *dst_area, size_t dst_offset, size_t samples, snd_pcm_format_t format) { /* FIXME: sub byte resolution and odd dst_offset */ char *src, *dst; int width; int src_step, dst_step; src = src_area->addr + (src_area->first + src_area->step * src_offset) / 8; if (!src_area->addr) return snd_pcm_area_silence(dst_area, dst_offset, samples, format); dst = dst_area->addr + (dst_area->first + dst_area->step * dst_offset) / 8; if (!dst_area->addr) return 0; width = snd_pcm_format_physical_width(format); if (width <= 0) return -EINVAL; if (src_area->step == (unsigned int) width && dst_area->step == (unsigned int) width && width >= 8) { size_t bytes = samples * width / 8; memcpy(dst, src, bytes); return 0; } src_step = src_area->step / 8; dst_step = dst_area->step / 8; if (width == 4) { /* Ima ADPCM */ int srcbit = src_area->first % 8; int srcbit_step = src_area->step % 8; int dstbit = dst_area->first % 8; int dstbit_step = dst_area->step % 8; while (samples-- > 0) { unsigned char srcval; if (srcbit) srcval = *src & 0x0f; else srcval = (*src & 0xf0) >> 4; if (dstbit) *dst = (*dst & 0xf0) | srcval; else *dst = (*dst & 0x0f) | (srcval << 4); src += src_step; srcbit += srcbit_step; if (srcbit == 8) { src++; srcbit = 0; } dst += dst_step; dstbit += dstbit_step; if (dstbit == 8) { dst++; dstbit = 0; } } } else { width /= 8; while (samples-- > 0) { memcpy(dst, src, width); src += src_step; dst += dst_step; } } return 0; }
2 2 1 1 1 5 5 5 5 5 5 21 20 21 18 3 20 8 22 12 13 2 22 22 6 15 2 11 2 15 15 4 4 4 5 5 5 1976 1959 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2007-2012 Siemens AG * * Written by: * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Sergey Lapin <slapin@ossfans.org> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/netdevice.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/ieee802154.h> #include <net/nl802154.h> #include <net/mac802154.h> #include <net/ieee802154_netdev.h> #include <net/cfg802154.h> #include "ieee802154_i.h" #include "driver-ops.h" int mac802154_wpan_update_llsec(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; int rc = 0; if (ops->llsec) { struct ieee802154_llsec_params params; int changed = 0; params.pan_id = wpan_dev->pan_id; changed |= IEEE802154_LLSEC_PARAM_PAN_ID; params.hwaddr = wpan_dev->extended_addr; changed |= IEEE802154_LLSEC_PARAM_HWADDR; rc = ops->llsec->set_params(dev, &params, changed); } return rc; } static int mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct sockaddr_ieee802154 *sa = (struct sockaddr_ieee802154 *)&ifr->ifr_addr; int err = -ENOIOCTLCMD; if (cmd != SIOCGIFADDR && cmd != SIOCSIFADDR) return err; rtnl_lock(); switch (cmd) { case SIOCGIFADDR: { u16 pan_id, short_addr; pan_id = le16_to_cpu(wpan_dev->pan_id); short_addr = le16_to_cpu(wpan_dev->short_addr); if (pan_id == IEEE802154_PANID_BROADCAST || short_addr == IEEE802154_ADDR_BROADCAST) { err = -EADDRNOTAVAIL; break; } sa->family = AF_IEEE802154; sa->addr.addr_type = IEEE802154_ADDR_SHORT; sa->addr.pan_id = pan_id; sa->addr.short_addr = short_addr; err = 0; break; } case SIOCSIFADDR: if (netif_running(dev)) { rtnl_unlock(); return -EBUSY; } dev_warn(&dev->dev, "Using DEBUGing ioctl SIOCSIFADDR isn't recommended!\n"); if (sa->family != AF_IEEE802154 || sa->addr.addr_type != IEEE802154_ADDR_SHORT || sa->addr.pan_id == IEEE802154_PANID_BROADCAST || sa->addr.short_addr == IEEE802154_ADDR_BROADCAST || sa->addr.short_addr == IEEE802154_ADDR_UNDEF) { err = -EINVAL; break; } wpan_dev->pan_id = cpu_to_le16(sa->addr.pan_id); wpan_dev->short_addr = cpu_to_le16(sa->addr.short_addr); err = mac802154_wpan_update_llsec(dev); break; } rtnl_unlock(); return err; } static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct sockaddr *addr = p; __le64 extended_addr; if (netif_running(dev)) return -EBUSY; /* lowpan need to be down for update * SLAAC address after ifup */ if (sdata->wpan_dev.lowpan_dev) { if (netif_running(sdata->wpan_dev.lowpan_dev)) return -EBUSY; } ieee802154_be64_to_le64(&extended_addr, addr->sa_data); if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return -EINVAL; dev_addr_set(dev, addr->sa_data); sdata->wpan_dev.extended_addr = extended_addr; /* update lowpan interface mac address when * wpan mac has been changed */ if (sdata->wpan_dev.lowpan_dev) dev_addr_set(sdata->wpan_dev.lowpan_dev, dev->dev_addr); return mac802154_wpan_update_llsec(dev); } static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) { struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; sdata->required_filtering = sdata->iface_default_filtering; if (local->hw.flags & IEEE802154_HW_AFILT) { local->addr_filt.pan_id = wpan_dev->pan_id; local->addr_filt.ieee_addr = wpan_dev->extended_addr; local->addr_filt.short_addr = wpan_dev->short_addr; } if (local->hw.flags & IEEE802154_HW_LBT) { ret = drv_set_lbt_mode(local, wpan_dev->lbt); if (ret < 0) return ret; } if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { ret = drv_set_csma_params(local, wpan_dev->min_be, wpan_dev->max_be, wpan_dev->csma_retries); if (ret < 0) return ret; } if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { ret = drv_set_max_frame_retries(local, wpan_dev->frame_retries); if (ret < 0) return ret; } return 0; } static int mac802154_slave_open(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; int res; ASSERT_RTNL(); set_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) { res = ieee802154_setup_hw(sdata); if (res) goto err; res = drv_start(local, sdata->required_filtering, &local->addr_filt); if (res) goto err; } local->open_count++; netif_start_queue(dev); return 0; err: /* might already be clear but that doesn't matter */ clear_bit(SDATA_STATE_RUNNING, &sdata->state); return res; } static int ieee802154_check_mac_settings(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct ieee802154_sub_if_data *nsdata) { struct wpan_dev *nwpan_dev = &nsdata->wpan_dev; struct wpan_dev *wpan_dev = &sdata->wpan_dev; ASSERT_RTNL(); if (sdata->iface_default_filtering != nsdata->iface_default_filtering) return -EBUSY; if (local->hw.flags & IEEE802154_HW_AFILT) { if (wpan_dev->pan_id != nwpan_dev->pan_id || wpan_dev->short_addr != nwpan_dev->short_addr || wpan_dev->extended_addr != nwpan_dev->extended_addr) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { if (wpan_dev->min_be != nwpan_dev->min_be || wpan_dev->max_be != nwpan_dev->max_be || wpan_dev->csma_retries != nwpan_dev->csma_retries) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { if (wpan_dev->frame_retries != nwpan_dev->frame_retries) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_LBT) { if (wpan_dev->lbt != nwpan_dev->lbt) return -EBUSY; } return 0; } static int ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype iftype) { struct ieee802154_local *local = sdata->local; struct ieee802154_sub_if_data *nsdata; /* we hold the RTNL here so can safely walk the list */ list_for_each_entry(nsdata, &local->interfaces, list) { if (nsdata != sdata && ieee802154_sdata_running(nsdata)) { int ret; /* TODO currently we don't support multiple node/coord * types we need to run skb_clone at rx path. Check if * there exist really an use case if we need to support * multiple node/coord types at the same time. */ if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR && nsdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR) return -EBUSY; /* check all phy mac sublayer settings are the same. * We have only one phy, different values makes trouble. */ ret = ieee802154_check_mac_settings(local, sdata, nsdata); if (ret < 0) return ret; } } return 0; } static int mac802154_wpan_open(struct net_device *dev) { int rc; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; rc = ieee802154_check_concurrent_iface(sdata, wpan_dev->iftype); if (rc < 0) return rc; return mac802154_slave_open(dev); } static int mac802154_slave_close(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; ASSERT_RTNL(); if (mac802154_is_scanning(local)) mac802154_abort_scan_locked(local, sdata); if (mac802154_is_beaconing(local)) mac802154_stop_beacons_locked(local, sdata); netif_stop_queue(dev); local->open_count--; clear_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) ieee802154_stop_device(local); return 0; } static int mac802154_set_header_security(struct ieee802154_sub_if_data *sdata, struct ieee802154_hdr *hdr, const struct ieee802154_mac_cb *cb) { struct ieee802154_llsec_params params; u8 level; mac802154_llsec_get_params(&sdata->sec, &params); if (!params.enabled && cb->secen_override && cb->secen) return -EINVAL; if (!params.enabled || (cb->secen_override && !cb->secen) || !params.out_level) return 0; if (cb->seclevel_override && !cb->seclevel) return -EINVAL; level = cb->seclevel_override ? cb->seclevel : params.out_level; hdr->fc.security_enabled = 1; hdr->sec.level = level; hdr->sec.key_id_mode = params.out_key.mode; if (params.out_key.mode == IEEE802154_SCF_KEY_SHORT_INDEX) hdr->sec.short_src = params.out_key.short_source; else if (params.out_key.mode == IEEE802154_SCF_KEY_HW_INDEX) hdr->sec.extended_src = params.out_key.extended_source; hdr->sec.key_id = params.out_key.id; return 0; } static int ieee802154_header_create(struct sk_buff *skb, struct net_device *dev, const struct ieee802154_addr *daddr, const struct ieee802154_addr *saddr, unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct ieee802154_mac_cb *cb = mac_cb(skb); int hlen; if (!daddr) return -EINVAL; memset(&hdr.fc, 0, sizeof(hdr.fc)); hdr.fc.type = cb->type; hdr.fc.security_enabled = cb->secen; hdr.fc.ack_request = cb->ackreq; hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; if (mac802154_set_header_security(sdata, &hdr, cb) < 0) return -EINVAL; if (!saddr) { if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) || wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { hdr.source.mode = IEEE802154_ADDR_LONG; hdr.source.extended_addr = wpan_dev->extended_addr; } else { hdr.source.mode = IEEE802154_ADDR_SHORT; hdr.source.short_addr = wpan_dev->short_addr; } hdr.source.pan_id = wpan_dev->pan_id; } else { hdr.source = *(const struct ieee802154_addr *)saddr; } hdr.dest = *(const struct ieee802154_addr *)daddr; hlen = ieee802154_hdr_push(skb, &hdr); if (hlen < 0) return -EINVAL; skb_reset_mac_header(skb); skb->mac_len = hlen; if (len > ieee802154_max_payload(&hdr)) return -EMSGSIZE; return hlen; } static const struct wpan_dev_header_ops ieee802154_header_ops = { .create = ieee802154_header_create, }; /* This header create functionality assumes a 8 byte array for * source and destination pointer at maximum. To adapt this for * the 802.15.4 dataframe header we use extended address handling * here only and intra pan connection. fc fields are mostly fallback * handling. For provide dev_hard_header for dgram sockets. */ static int mac802154_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct ieee802154_mac_cb cb = { }; int hlen; if (!daddr) return -EINVAL; memset(&hdr.fc, 0, sizeof(hdr.fc)); hdr.fc.type = IEEE802154_FC_TYPE_DATA; hdr.fc.ack_request = wpan_dev->ackreq; hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; /* TODO currently a workaround to give zero cb block to set * security parameters defaults according MIB. */ if (mac802154_set_header_security(sdata, &hdr, &cb) < 0) return -EINVAL; hdr.dest.pan_id = wpan_dev->pan_id; hdr.dest.mode = IEEE802154_ADDR_LONG; ieee802154_be64_to_le64(&hdr.dest.extended_addr, daddr); hdr.source.pan_id = hdr.dest.pan_id; hdr.source.mode = IEEE802154_ADDR_LONG; if (!saddr) hdr.source.extended_addr = wpan_dev->extended_addr; else ieee802154_be64_to_le64(&hdr.source.extended_addr, saddr); hlen = ieee802154_hdr_push(skb, &hdr); if (hlen < 0) return -EINVAL; skb_reset_mac_header(skb); skb->mac_len = hlen; if (len > ieee802154_max_payload(&hdr)) return -EMSGSIZE; return hlen; } static int mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) { struct ieee802154_hdr hdr; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) { pr_debug("malformed packet\n"); return 0; } if (hdr.source.mode == IEEE802154_ADDR_LONG) { ieee802154_le64_to_be64(haddr, &hdr.source.extended_addr); return IEEE802154_EXTENDED_ADDR_LEN; } return 0; } static const struct header_ops mac802154_header_ops = { .create = mac802154_header_create, .parse = mac802154_header_parse, }; static const struct net_device_ops mac802154_wpan_ops = { .ndo_open = mac802154_wpan_open, .ndo_stop = mac802154_slave_close, .ndo_start_xmit = ieee802154_subif_start_xmit, .ndo_do_ioctl = mac802154_wpan_ioctl, .ndo_set_mac_address = mac802154_wpan_mac_addr, }; static const struct net_device_ops mac802154_monitor_ops = { .ndo_open = mac802154_wpan_open, .ndo_stop = mac802154_slave_close, .ndo_start_xmit = ieee802154_monitor_start_xmit, }; static void mac802154_wpan_free(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); mac802154_llsec_destroy(&sdata->sec); } static void ieee802154_if_setup(struct net_device *dev) { dev->addr_len = IEEE802154_EXTENDED_ADDR_LEN; memset(dev->broadcast, 0xff, IEEE802154_EXTENDED_ADDR_LEN); /* Let hard_header_len set to IEEE802154_MIN_HEADER_LEN. AF_PACKET * will not send frames without any payload, but ack frames * has no payload, so substract one that we can send a 3 bytes * frame. The xmit callback assumes at least a hard header where two * bytes fc and sequence field are set. */ dev->hard_header_len = IEEE802154_MIN_HEADER_LEN - 1; /* The auth_tag header is for security and places in private payload * room of mac frame which stucks between payload and FCS field. */ dev->needed_tailroom = IEEE802154_MAX_AUTH_TAG_LEN + IEEE802154_FCS_LEN; /* The mtu size is the payload without mac header in this case. * We have a dynamic length header with a minimum header length * which is hard_header_len. In this case we let mtu to the size * of maximum payload which is IEEE802154_MTU - IEEE802154_FCS_LEN - * hard_header_len. The FCS which is set by hardware or ndo_start_xmit * and the minimum mac header which can be evaluated inside driver * layer. The rest of mac header will be part of payload if greater * than hard_header_len. */ dev->mtu = IEEE802154_MTU - IEEE802154_FCS_LEN - dev->hard_header_len; dev->tx_queue_len = 300; dev->flags = IFF_NOARP | IFF_BROADCAST; } static int ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype type) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; u8 tmp; /* set some type-dependent values */ sdata->wpan_dev.iftype = type; get_random_bytes(&tmp, sizeof(tmp)); atomic_set(&wpan_dev->bsn, tmp); get_random_bytes(&tmp, sizeof(tmp)); atomic_set(&wpan_dev->dsn, tmp); /* defaults per 802.15.4-2011 */ wpan_dev->min_be = 3; wpan_dev->max_be = 5; wpan_dev->csma_retries = 4; wpan_dev->frame_retries = 3; wpan_dev->pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); switch (type) { case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ieee802154_be64_to_le64(&wpan_dev->extended_addr, sdata->dev->dev_addr); sdata->dev->header_ops = &mac802154_header_ops; sdata->dev->needs_free_netdev = true; sdata->dev->priv_destructor = mac802154_wpan_free; sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; sdata->iface_default_filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; wpan_dev->header_ops = &ieee802154_header_ops; mutex_init(&sdata->sec_mtx); mac802154_llsec_init(&sdata->sec); ret = mac802154_wpan_update_llsec(sdata->dev); if (ret < 0) return ret; break; case NL802154_IFTYPE_MONITOR: sdata->dev->needs_free_netdev = true; sdata->dev->netdev_ops = &mac802154_monitor_ops; sdata->iface_default_filtering = IEEE802154_FILTERING_NONE; break; default: BUG(); } return 0; } struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { u8 addr[IEEE802154_EXTENDED_ADDR_LEN]; struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; int ret; ASSERT_RTNL(); ndev = alloc_netdev(sizeof(*sdata), name, name_assign_type, ieee802154_if_setup); if (!ndev) return ERR_PTR(-ENOMEM); ndev->needed_headroom = local->hw.extra_tx_headroom + IEEE802154_MAX_HEADER_LEN; ret = dev_alloc_name(ndev, ndev->name); if (ret < 0) goto err; ieee802154_le64_to_be64(ndev->perm_addr, &local->hw.phy->perm_extended_addr); switch (type) { case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) { ieee802154_le64_to_be64(addr, &extended_addr); dev_addr_set(ndev, addr); } else { dev_addr_set(ndev, ndev->perm_addr); } break; case NL802154_IFTYPE_MONITOR: ndev->type = ARPHRD_IEEE802154_MONITOR; break; default: ret = -EINVAL; goto err; } /* TODO check this */ SET_NETDEV_DEV(ndev, &local->phy->dev); dev_net_set(ndev, wpan_phy_net(local->hw.phy)); sdata = netdev_priv(ndev); ndev->ieee802154_ptr = &sdata->wpan_dev; memcpy(sdata->name, ndev->name, IFNAMSIZ); sdata->dev = ndev; sdata->wpan_dev.wpan_phy = local->hw.phy; sdata->local = local; INIT_LIST_HEAD(&sdata->wpan_dev.list); /* setup type-dependent data */ ret = ieee802154_setup_sdata(sdata, type); if (ret) goto err; ret = register_netdevice(ndev); if (ret < 0) goto err; mutex_lock(&local->iflist_mtx); list_add_tail_rcu(&sdata->list, &local->interfaces); mutex_unlock(&local->iflist_mtx); return ndev; err: free_netdev(ndev); return ERR_PTR(ret); } void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) { ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); synchronize_rcu(); unregister_netdevice(sdata->dev); } void ieee802154_remove_interfaces(struct ieee802154_local *local) { struct ieee802154_sub_if_data *sdata, *tmp; mutex_lock(&local->iflist_mtx); list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { list_del(&sdata->list); unregister_netdevice(sdata->dev); } mutex_unlock(&local->iflist_mtx); } static int netdev_notify(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ieee802154_sub_if_data *sdata; if (state != NETDEV_CHANGENAME) return NOTIFY_DONE; if (!dev->ieee802154_ptr || !dev->ieee802154_ptr->wpan_phy) return NOTIFY_DONE; if (dev->ieee802154_ptr->wpan_phy->privid != mac802154_wpan_phy_privid) return NOTIFY_DONE; sdata = IEEE802154_DEV_TO_SUB_IF(dev); memcpy(sdata->name, dev->name, IFNAMSIZ); return NOTIFY_OK; } static struct notifier_block mac802154_netdev_notifier = { .notifier_call = netdev_notify, }; int ieee802154_iface_init(void) { return register_netdevice_notifier(&mac802154_netdev_notifier); } void ieee802154_iface_exit(void) { unregister_netdevice_notifier(&mac802154_netdev_notifier); }
67 2 3 61 1 1 2 32 4 6 16 48 4 38 10 1 2 2 28 10 7 13 2 9 2 33 11 44 42 2 33 11 43 44 14 2 27 18 21 12 9 14 7 19 2 21 21 21 21 39 33 2 3 2 2 39 3 31 4 62 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_police.c Input police filter * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * J Hadi Salim (action changes) */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <net/act_api.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_police.h> #include <net/tc_wrapper.h> /* Each policer is serialized by its individual spinlock */ static struct tc_action_ops act_police_ops; static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE }, [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, [TCA_POLICE_AVRATE] = { .type = NLA_U32 }, [TCA_POLICE_RESULT] = { .type = NLA_U32 }, [TCA_POLICE_RATE64] = { .type = NLA_U64 }, [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 }, [TCA_POLICE_PKTRATE64] = { .type = NLA_U64, .min = 1 }, [TCA_POLICE_PKTBURST64] = { .type = NLA_U64, .min = 1 }, }; static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; struct tc_action_net *tn = net_generic(net, act_police_ops.net_id); struct tcf_police_params *new; bool exists = false; u32 index; u64 rate64, prate64; u64 pps, ppsburst; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_POLICE_MAX, nla, police_policy, NULL); if (err < 0) return err; if (tb[TCA_POLICE_TBF] == NULL) return -EINVAL; size = nla_len(tb[TCA_POLICE_TBF]); if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat)) return -EINVAL; parm = nla_data(tb[TCA_POLICE_TBF]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (!exists) { ret = tcf_idr_create(tn, index, NULL, a, &act_police_ops, bind, true, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; spin_lock_init(&(to_police(*a)->tcfp_lock)); } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; police = to_police(*a); if (parm->rate.rate) { err = -ENOMEM; R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE], NULL); if (R_tab == NULL) goto failure; if (parm->peakrate.rate) { P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE], NULL); if (P_tab == NULL) goto failure; } } if (est) { err = gen_replace_estimator(&police->tcf_bstats, police->common.cpu_bstats, &police->tcf_rate_est, &police->tcf_lock, false, est); if (err) goto failure; } else if (tb[TCA_POLICE_AVRATE] && (ret == ACT_P_CREATED || !gen_estimator_active(&police->tcf_rate_est))) { err = -EINVAL; goto failure; } if (tb[TCA_POLICE_RESULT]) { tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); if (TC_ACT_EXT_CMP(tcfp_result, TC_ACT_GOTO_CHAIN)) { NL_SET_ERR_MSG(extack, "goto chain not allowed on fallback"); err = -EINVAL; goto failure; } } if ((tb[TCA_POLICE_PKTRATE64] && !tb[TCA_POLICE_PKTBURST64]) || (!tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64])) { NL_SET_ERR_MSG(extack, "Both or neither packet-per-second burst and rate must be provided"); err = -EINVAL; goto failure; } if (tb[TCA_POLICE_PKTRATE64] && R_tab) { NL_SET_ERR_MSG(extack, "packet-per-second and byte-per-second rate limits not allowed in same action"); err = -EINVAL; goto failure; } new = kzalloc(sizeof(*new), GFP_KERNEL); if (unlikely(!new)) { err = -ENOMEM; goto failure; } /* No failure allowed after this point */ new->tcfp_result = tcfp_result; new->tcfp_mtu = parm->mtu; if (!new->tcfp_mtu) { new->tcfp_mtu = ~0; if (R_tab) new->tcfp_mtu = 255 << R_tab->rate.cell_log; } if (R_tab) { new->rate_present = true; rate64 = tb[TCA_POLICE_RATE64] ? nla_get_u64(tb[TCA_POLICE_RATE64]) : 0; psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64); qdisc_put_rtab(R_tab); } else { new->rate_present = false; } if (P_tab) { new->peak_present = true; prate64 = tb[TCA_POLICE_PEAKRATE64] ? nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0; psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64); qdisc_put_rtab(P_tab); } else { new->peak_present = false; } new->tcfp_burst = PSCHED_TICKS2NS(parm->burst); if (new->peak_present) new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak, new->tcfp_mtu); if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); if (tb[TCA_POLICE_PKTRATE64]) { pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]); ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]); new->pps_present = true; new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst); psched_ppscfg_precompute(&new->ppsrate, pps); } spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); police->tcfp_toks = new->tcfp_burst; if (new->peak_present) police->tcfp_ptoks = new->tcfp_mtu_ptoks; spin_unlock_bh(&police->tcfp_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); new = rcu_replace_pointer(police->params, new, lockdep_is_held(&police->tcf_lock)); spin_unlock_bh(&police->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (new) kfree_rcu(new, rcu); return ret; failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static bool tcf_police_mtu_check(struct sk_buff *skb, u32 limit) { u32 len; if (skb_is_gso(skb)) return skb_gso_validate_mac_len(skb, limit); len = qdisc_pkt_len(skb); if (skb_at_tc_ingress(skb)) len += skb->mac_len; return len <= limit; } TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = to_police(a); s64 now, toks, ppstoks = 0, ptoks = 0; struct tcf_police_params *p; int ret; tcf_lastuse_update(&police->tcf_tm); bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); if (p->tcfp_ewma_rate) { struct gnet_stats_rate_est64 sample; if (!gen_estimator_read(&police->tcf_rate_est, &sample) || sample.bps >= p->tcfp_ewma_rate) goto inc_overlimits; } if (tcf_police_mtu_check(skb, p->tcfp_mtu)) { if (!p->rate_present && !p->pps_present) { ret = p->tcfp_result; goto end; } now = ktime_get_ns(); spin_lock_bh(&police->tcfp_lock); toks = min_t(s64, now - police->tcfp_t_c, p->tcfp_burst); if (p->peak_present) { ptoks = toks + police->tcfp_ptoks; if (ptoks > p->tcfp_mtu_ptoks) ptoks = p->tcfp_mtu_ptoks; ptoks -= (s64)psched_l2t_ns(&p->peak, qdisc_pkt_len(skb)); } if (p->rate_present) { toks += police->tcfp_toks; if (toks > p->tcfp_burst) toks = p->tcfp_burst; toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); } else if (p->pps_present) { ppstoks = min_t(s64, now - police->tcfp_t_c, p->tcfp_pkt_burst); ppstoks += police->tcfp_pkttoks; if (ppstoks > p->tcfp_pkt_burst) ppstoks = p->tcfp_pkt_burst; ppstoks -= (s64)psched_pkt2t_ns(&p->ppsrate, 1); } if ((toks | ptoks | ppstoks) >= 0) { police->tcfp_t_c = now; police->tcfp_toks = toks; police->tcfp_ptoks = ptoks; police->tcfp_pkttoks = ppstoks; spin_unlock_bh(&police->tcfp_lock); ret = p->tcfp_result; goto inc_drops; } spin_unlock_bh(&police->tcfp_lock); } inc_overlimits: qstats_overlimit_inc(this_cpu_ptr(police->common.cpu_qstats)); inc_drops: if (ret == TC_ACT_SHOT) qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats)); end: return ret; } static void tcf_police_cleanup(struct tc_action *a) { struct tcf_police *police = to_police(a); struct tcf_police_params *p; p = rcu_dereference_protected(police->params, 1); if (p) kfree_rcu(p, rcu); } static void tcf_police_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_police *police = to_police(a); struct tcf_t *tm = &police->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_police *police = to_police(a); struct tcf_police_params *p; struct tc_police opt = { .index = police->tcf_index, .refcnt = refcount_read(&police->tcf_refcnt) - ref, .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&police->tcf_lock); opt.action = police->tcf_action; p = rcu_dereference_protected(police->params, lockdep_is_held(&police->tcf_lock)); opt.mtu = p->tcfp_mtu; opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); if (p->rate_present) { psched_ratecfg_getrate(&opt.rate, &p->rate); if ((p->rate.rate_bytes_ps >= (1ULL << 32)) && nla_put_u64_64bit(skb, TCA_POLICE_RATE64, p->rate.rate_bytes_ps, TCA_POLICE_PAD)) goto nla_put_failure; } if (p->peak_present) { psched_ratecfg_getrate(&opt.peakrate, &p->peak); if ((p->peak.rate_bytes_ps >= (1ULL << 32)) && nla_put_u64_64bit(skb, TCA_POLICE_PEAKRATE64, p->peak.rate_bytes_ps, TCA_POLICE_PAD)) goto nla_put_failure; } if (p->pps_present) { if (nla_put_u64_64bit(skb, TCA_POLICE_PKTRATE64, p->ppsrate.rate_pkts_ps, TCA_POLICE_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_POLICE_PKTBURST64, PSCHED_NS2TICKS(p->tcfp_pkt_burst), TCA_POLICE_PAD)) goto nla_put_failure; } if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (p->tcfp_result && nla_put_u32(skb, TCA_POLICE_RESULT, p->tcfp_result)) goto nla_put_failure; if (p->tcfp_ewma_rate && nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate)) goto nla_put_failure; tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; spin_unlock_bh(&police->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&police->tcf_lock); nlmsg_trim(skb, b); return -1; } static int tcf_police_act_to_flow_act(int tc_act, u32 *extval, struct netlink_ext_ack *extack) { int act_id = -EOPNOTSUPP; if (!TC_ACT_EXT_OPCODE(tc_act)) { if (tc_act == TC_ACT_OK) act_id = FLOW_ACTION_ACCEPT; else if (tc_act == TC_ACT_SHOT) act_id = FLOW_ACTION_DROP; else if (tc_act == TC_ACT_PIPE) act_id = FLOW_ACTION_PIPE; else if (tc_act == TC_ACT_RECLASSIFY) NL_SET_ERR_MSG_MOD(extack, "Offload not supported when conform/exceed action is \"reclassify\""); else NL_SET_ERR_MSG_MOD(extack, "Unsupported conform/exceed action offload"); } else if (TC_ACT_EXT_CMP(tc_act, TC_ACT_GOTO_CHAIN)) { act_id = FLOW_ACTION_GOTO; *extval = tc_act & TC_ACT_EXT_VAL_MASK; } else if (TC_ACT_EXT_CMP(tc_act, TC_ACT_JUMP)) { act_id = FLOW_ACTION_JUMP; *extval = tc_act & TC_ACT_EXT_VAL_MASK; } else if (tc_act == TC_ACT_UNSPEC) { act_id = FLOW_ACTION_CONTINUE; } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported conform/exceed action offload"); } return act_id; } static int tcf_police_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; struct tcf_police *police = to_police(act); struct tcf_police_params *p; int act_id; p = rcu_dereference_protected(police->params, lockdep_is_held(&police->tcf_lock)); entry->id = FLOW_ACTION_POLICE; entry->police.burst = tcf_police_burst(act); entry->police.rate_bytes_ps = tcf_police_rate_bytes_ps(act); entry->police.peakrate_bytes_ps = tcf_police_peakrate_bytes_ps(act); entry->police.avrate = tcf_police_tcfp_ewma_rate(act); entry->police.overhead = tcf_police_rate_overhead(act); entry->police.burst_pkt = tcf_police_burst_pkt(act); entry->police.rate_pkt_ps = tcf_police_rate_pkt_ps(act); entry->police.mtu = tcf_police_tcfp_mtu(act); act_id = tcf_police_act_to_flow_act(police->tcf_action, &entry->police.exceed.extval, extack); if (act_id < 0) return act_id; entry->police.exceed.act_id = act_id; act_id = tcf_police_act_to_flow_act(p->tcfp_result, &entry->police.notexceed.extval, extack); if (act_id < 0) return act_id; entry->police.notexceed.act_id = act_id; *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_POLICE; } return 0; } MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); static struct tc_action_ops act_police_ops = { .kind = "police", .id = TCA_ID_POLICE, .owner = THIS_MODULE, .stats_update = tcf_police_stats_update, .act = tcf_police_act, .dump = tcf_police_dump, .init = tcf_police_init, .cleanup = tcf_police_cleanup, .offload_act_setup = tcf_police_offload_act_setup, .size = sizeof(struct tcf_police), }; MODULE_ALIAS_NET_ACT("police"); static __net_init int police_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_police_ops.net_id); return tc_action_net_init(net, tn, &act_police_ops); } static void __net_exit police_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_police_ops.net_id); } static struct pernet_operations police_net_ops = { .init = police_init_net, .exit_batch = police_exit_net, .id = &act_police_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init police_init_module(void) { return tcf_register_action(&act_police_ops, &police_net_ops); } static void __exit police_cleanup_module(void) { tcf_unregister_action(&act_police_ops, &police_net_ops); } module_init(police_init_module); module_exit(police_cleanup_module);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 /* * ipheth.c - Apple iPhone USB Ethernet driver * * Copyright (c) 2009 Diego Giagio <diego@giagio.com> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of GIAGIO.COM nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * * Attention: iPhone device must be paired, otherwise it won't respond to our * driver. For more info: http://giagio.com/wiki/moin.cgi/iPhoneEthernetDriver * */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/usb.h> #include <linux/workqueue.h> #include <linux/usb/cdc.h> #define USB_VENDOR_APPLE 0x05ac #define IPHETH_USBINTF_CLASS 255 #define IPHETH_USBINTF_SUBCLASS 253 #define IPHETH_USBINTF_PROTO 1 #define IPHETH_IP_ALIGN 2 /* padding at front of URB */ #define IPHETH_NCM_HEADER_SIZE (12 + 96) /* NCMH + NCM0 */ #define IPHETH_TX_BUF_SIZE ETH_FRAME_LEN #define IPHETH_RX_BUF_SIZE_LEGACY (IPHETH_IP_ALIGN + ETH_FRAME_LEN) #define IPHETH_RX_BUF_SIZE_NCM 65536 #define IPHETH_TX_TIMEOUT (5 * HZ) #define IPHETH_INTFNUM 2 #define IPHETH_ALT_INTFNUM 1 #define IPHETH_CTRL_ENDP 0x00 #define IPHETH_CTRL_BUF_SIZE 0x40 #define IPHETH_CTRL_TIMEOUT (5 * HZ) #define IPHETH_CMD_GET_MACADDR 0x00 #define IPHETH_CMD_ENABLE_NCM 0x04 #define IPHETH_CMD_CARRIER_CHECK 0x45 #define IPHETH_CARRIER_CHECK_TIMEOUT round_jiffies_relative(1 * HZ) #define IPHETH_CARRIER_ON 0x04 static const struct usb_device_id ipheth_table[] = { { USB_VENDOR_AND_INTERFACE_INFO(USB_VENDOR_APPLE, IPHETH_USBINTF_CLASS, IPHETH_USBINTF_SUBCLASS, IPHETH_USBINTF_PROTO) }, { } }; MODULE_DEVICE_TABLE(usb, ipheth_table); struct ipheth_device { struct usb_device *udev; struct usb_interface *intf; struct net_device *net; struct urb *tx_urb; struct urb *rx_urb; unsigned char *tx_buf; unsigned char *rx_buf; unsigned char *ctrl_buf; u8 bulk_in; u8 bulk_out; struct delayed_work carrier_work; bool confirmed_pairing; int (*rcvbulk_callback)(struct urb *urb); size_t rx_buf_len; }; static int ipheth_rx_submit(struct ipheth_device *dev, gfp_t mem_flags); static int ipheth_alloc_urbs(struct ipheth_device *iphone) { struct urb *tx_urb = NULL; struct urb *rx_urb = NULL; u8 *tx_buf = NULL; u8 *rx_buf = NULL; tx_urb = usb_alloc_urb(0, GFP_KERNEL); if (tx_urb == NULL) goto error_nomem; rx_urb = usb_alloc_urb(0, GFP_KERNEL); if (rx_urb == NULL) goto free_tx_urb; tx_buf = usb_alloc_coherent(iphone->udev, IPHETH_TX_BUF_SIZE, GFP_KERNEL, &tx_urb->transfer_dma); if (tx_buf == NULL) goto free_rx_urb; rx_buf = usb_alloc_coherent(iphone->udev, iphone->rx_buf_len, GFP_KERNEL, &rx_urb->transfer_dma); if (rx_buf == NULL) goto free_tx_buf; iphone->tx_urb = tx_urb; iphone->rx_urb = rx_urb; iphone->tx_buf = tx_buf; iphone->rx_buf = rx_buf; return 0; free_tx_buf: usb_free_coherent(iphone->udev, IPHETH_TX_BUF_SIZE, tx_buf, tx_urb->transfer_dma); free_rx_urb: usb_free_urb(rx_urb); free_tx_urb: usb_free_urb(tx_urb); error_nomem: return -ENOMEM; } static void ipheth_free_urbs(struct ipheth_device *iphone) { usb_free_coherent(iphone->udev, iphone->rx_buf_len, iphone->rx_buf, iphone->rx_urb->transfer_dma); usb_free_coherent(iphone->udev, IPHETH_TX_BUF_SIZE, iphone->tx_buf, iphone->tx_urb->transfer_dma); usb_free_urb(iphone->rx_urb); usb_free_urb(iphone->tx_urb); } static void ipheth_kill_urbs(struct ipheth_device *dev) { usb_kill_urb(dev->tx_urb); usb_kill_urb(dev->rx_urb); } static int ipheth_consume_skb(char *buf, int len, struct ipheth_device *dev) { struct sk_buff *skb; skb = dev_alloc_skb(len); if (!skb) { dev->net->stats.rx_dropped++; return -ENOMEM; } skb_put_data(skb, buf, len); skb->dev = dev->net; skb->protocol = eth_type_trans(skb, dev->net); dev->net->stats.rx_packets++; dev->net->stats.rx_bytes += len; netif_rx(skb); return 0; } static int ipheth_rcvbulk_callback_legacy(struct urb *urb) { struct ipheth_device *dev; char *buf; int len; dev = urb->context; if (urb->actual_length <= IPHETH_IP_ALIGN) { dev->net->stats.rx_length_errors++; return -EINVAL; } len = urb->actual_length - IPHETH_IP_ALIGN; buf = urb->transfer_buffer + IPHETH_IP_ALIGN; return ipheth_consume_skb(buf, len, dev); } static int ipheth_rcvbulk_callback_ncm(struct urb *urb) { struct usb_cdc_ncm_nth16 *ncmh; struct usb_cdc_ncm_ndp16 *ncm0; struct usb_cdc_ncm_dpe16 *dpe; struct ipheth_device *dev; int retval = -EINVAL; char *buf; int len; dev = urb->context; if (urb->actual_length < IPHETH_NCM_HEADER_SIZE) { dev->net->stats.rx_length_errors++; return retval; } ncmh = urb->transfer_buffer; if (ncmh->dwSignature != cpu_to_le32(USB_CDC_NCM_NTH16_SIGN) || le16_to_cpu(ncmh->wNdpIndex) >= urb->actual_length) { dev->net->stats.rx_errors++; return retval; } ncm0 = urb->transfer_buffer + le16_to_cpu(ncmh->wNdpIndex); if (ncm0->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN) || le16_to_cpu(ncmh->wHeaderLength) + le16_to_cpu(ncm0->wLength) >= urb->actual_length) { dev->net->stats.rx_errors++; return retval; } dpe = ncm0->dpe16; while (le16_to_cpu(dpe->wDatagramIndex) != 0 && le16_to_cpu(dpe->wDatagramLength) != 0) { if (le16_to_cpu(dpe->wDatagramIndex) >= urb->actual_length || le16_to_cpu(dpe->wDatagramIndex) + le16_to_cpu(dpe->wDatagramLength) > urb->actual_length) { dev->net->stats.rx_length_errors++; return retval; } buf = urb->transfer_buffer + le16_to_cpu(dpe->wDatagramIndex); len = le16_to_cpu(dpe->wDatagramLength); retval = ipheth_consume_skb(buf, len, dev); if (retval != 0) return retval; dpe++; } return 0; } static void ipheth_rcvbulk_callback(struct urb *urb) { struct ipheth_device *dev; int retval, status; dev = urb->context; if (dev == NULL) return; status = urb->status; switch (status) { case -ENOENT: case -ECONNRESET: case -ESHUTDOWN: case -EPROTO: return; case 0: break; default: dev_err(&dev->intf->dev, "%s: urb status: %d\n", __func__, status); return; } /* iPhone may periodically send URBs with no payload * on the "bulk in" endpoint. It is safe to ignore them. */ if (urb->actual_length == 0) goto rx_submit; /* RX URBs starting with 0x00 0x01 do not encapsulate Ethernet frames, * but rather are control frames. Their purpose is not documented, and * they don't affect driver functionality, okay to drop them. * There is usually just one 4-byte control frame as the very first * URB received from the bulk IN endpoint. */ if (unlikely (urb->actual_length == 4 && ((char *)urb->transfer_buffer)[0] == 0 && ((char *)urb->transfer_buffer)[1] == 1)) goto rx_submit; retval = dev->rcvbulk_callback(urb); if (retval != 0) { dev_err(&dev->intf->dev, "%s: callback retval: %d\n", __func__, retval); } rx_submit: dev->confirmed_pairing = true; ipheth_rx_submit(dev, GFP_ATOMIC); } static void ipheth_sndbulk_callback(struct urb *urb) { struct ipheth_device *dev; int status = urb->status; dev = urb->context; if (dev == NULL) return; if (status != 0 && status != -ENOENT && status != -ECONNRESET && status != -ESHUTDOWN) dev_err(&dev->intf->dev, "%s: urb status: %d\n", __func__, status); if (status == 0) netif_wake_queue(dev->net); else // on URB error, trigger immediate poll schedule_delayed_work(&dev->carrier_work, 0); } static int ipheth_carrier_set(struct ipheth_device *dev) { struct usb_device *udev; int retval; if (!dev->confirmed_pairing) return 0; udev = dev->udev; retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, IPHETH_CTRL_ENDP), IPHETH_CMD_CARRIER_CHECK, /* request */ 0xc0, /* request type */ 0x00, /* value */ 0x02, /* index */ dev->ctrl_buf, IPHETH_CTRL_BUF_SIZE, IPHETH_CTRL_TIMEOUT); if (retval <= 0) { dev_err(&dev->intf->dev, "%s: usb_control_msg: %d\n", __func__, retval); return retval; } if ((retval == 1 && dev->ctrl_buf[0] == IPHETH_CARRIER_ON) || (retval >= 2 && dev->ctrl_buf[1] == IPHETH_CARRIER_ON)) { netif_carrier_on(dev->net); if (dev->tx_urb->status != -EINPROGRESS) netif_wake_queue(dev->net); } else { netif_carrier_off(dev->net); netif_stop_queue(dev->net); } return 0; } static void ipheth_carrier_check_work(struct work_struct *work) { struct ipheth_device *dev = container_of(work, struct ipheth_device, carrier_work.work); ipheth_carrier_set(dev); schedule_delayed_work(&dev->carrier_work, IPHETH_CARRIER_CHECK_TIMEOUT); } static int ipheth_get_macaddr(struct ipheth_device *dev) { struct usb_device *udev = dev->udev; struct net_device *net = dev->net; int retval; retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, IPHETH_CTRL_ENDP), IPHETH_CMD_GET_MACADDR, /* request */ 0xc0, /* request type */ 0x00, /* value */ 0x02, /* index */ dev->ctrl_buf, IPHETH_CTRL_BUF_SIZE, IPHETH_CTRL_TIMEOUT); if (retval < 0) { dev_err(&dev->intf->dev, "%s: usb_control_msg: %d\n", __func__, retval); } else if (retval < ETH_ALEN) { dev_err(&dev->intf->dev, "%s: usb_control_msg: short packet: %d bytes\n", __func__, retval); retval = -EINVAL; } else { eth_hw_addr_set(net, dev->ctrl_buf); retval = 0; } return retval; } static int ipheth_enable_ncm(struct ipheth_device *dev) { struct usb_device *udev = dev->udev; int retval; retval = usb_control_msg(udev, usb_sndctrlpipe(udev, IPHETH_CTRL_ENDP), IPHETH_CMD_ENABLE_NCM, /* request */ 0x41, /* request type */ 0x00, /* value */ 0x02, /* index */ NULL, 0, IPHETH_CTRL_TIMEOUT); dev_info(&dev->intf->dev, "%s: usb_control_msg: %d\n", __func__, retval); return retval; } static int ipheth_rx_submit(struct ipheth_device *dev, gfp_t mem_flags) { struct usb_device *udev = dev->udev; int retval; usb_fill_bulk_urb(dev->rx_urb, udev, usb_rcvbulkpipe(udev, dev->bulk_in), dev->rx_buf, dev->rx_buf_len, ipheth_rcvbulk_callback, dev); dev->rx_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; retval = usb_submit_urb(dev->rx_urb, mem_flags); if (retval) dev_err(&dev->intf->dev, "%s: usb_submit_urb: %d\n", __func__, retval); return retval; } static int ipheth_open(struct net_device *net) { struct ipheth_device *dev = netdev_priv(net); struct usb_device *udev = dev->udev; int retval = 0; usb_set_interface(udev, IPHETH_INTFNUM, IPHETH_ALT_INTFNUM); retval = ipheth_carrier_set(dev); if (retval) return retval; retval = ipheth_rx_submit(dev, GFP_KERNEL); if (retval) return retval; schedule_delayed_work(&dev->carrier_work, IPHETH_CARRIER_CHECK_TIMEOUT); return retval; } static int ipheth_close(struct net_device *net) { struct ipheth_device *dev = netdev_priv(net); netif_stop_queue(net); cancel_delayed_work_sync(&dev->carrier_work); return 0; } static netdev_tx_t ipheth_tx(struct sk_buff *skb, struct net_device *net) { struct ipheth_device *dev = netdev_priv(net); struct usb_device *udev = dev->udev; int retval; /* Paranoid */ if (skb->len > IPHETH_TX_BUF_SIZE) { WARN(1, "%s: skb too large: %d bytes\n", __func__, skb->len); dev->net->stats.tx_dropped++; dev_kfree_skb_any(skb); return NETDEV_TX_OK; } memcpy(dev->tx_buf, skb->data, skb->len); usb_fill_bulk_urb(dev->tx_urb, udev, usb_sndbulkpipe(udev, dev->bulk_out), dev->tx_buf, skb->len, ipheth_sndbulk_callback, dev); dev->tx_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; netif_stop_queue(net); retval = usb_submit_urb(dev->tx_urb, GFP_ATOMIC); if (retval) { dev_err(&dev->intf->dev, "%s: usb_submit_urb: %d\n", __func__, retval); dev->net->stats.tx_errors++; dev_kfree_skb_any(skb); netif_wake_queue(net); } else { dev->net->stats.tx_packets++; dev->net->stats.tx_bytes += skb->len; dev_consume_skb_any(skb); } return NETDEV_TX_OK; } static void ipheth_tx_timeout(struct net_device *net, unsigned int txqueue) { struct ipheth_device *dev = netdev_priv(net); dev_err(&dev->intf->dev, "%s: TX timeout\n", __func__); dev->net->stats.tx_errors++; usb_unlink_urb(dev->tx_urb); } static u32 ipheth_ethtool_op_get_link(struct net_device *net) { struct ipheth_device *dev = netdev_priv(net); return netif_carrier_ok(dev->net); } static const struct ethtool_ops ops = { .get_link = ipheth_ethtool_op_get_link }; static const struct net_device_ops ipheth_netdev_ops = { .ndo_open = ipheth_open, .ndo_stop = ipheth_close, .ndo_start_xmit = ipheth_tx, .ndo_tx_timeout = ipheth_tx_timeout, }; static int ipheth_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct usb_host_interface *hintf; struct usb_endpoint_descriptor *endp; struct ipheth_device *dev; struct net_device *netdev; int i; int retval; netdev = alloc_etherdev(sizeof(struct ipheth_device)); if (!netdev) return -ENOMEM; netdev->netdev_ops = &ipheth_netdev_ops; netdev->watchdog_timeo = IPHETH_TX_TIMEOUT; strscpy(netdev->name, "eth%d", sizeof(netdev->name)); dev = netdev_priv(netdev); dev->udev = udev; dev->net = netdev; dev->intf = intf; dev->confirmed_pairing = false; dev->rx_buf_len = IPHETH_RX_BUF_SIZE_LEGACY; dev->rcvbulk_callback = ipheth_rcvbulk_callback_legacy; /* Set up endpoints */ hintf = usb_altnum_to_altsetting(intf, IPHETH_ALT_INTFNUM); if (hintf == NULL) { retval = -ENODEV; dev_err(&intf->dev, "Unable to find alternate settings interface\n"); goto err_endpoints; } for (i = 0; i < hintf->desc.bNumEndpoints; i++) { endp = &hintf->endpoint[i].desc; if (usb_endpoint_is_bulk_in(endp)) dev->bulk_in = endp->bEndpointAddress; else if (usb_endpoint_is_bulk_out(endp)) dev->bulk_out = endp->bEndpointAddress; } if (!(dev->bulk_in && dev->bulk_out)) { retval = -ENODEV; dev_err(&intf->dev, "Unable to find endpoints\n"); goto err_endpoints; } dev->ctrl_buf = kmalloc(IPHETH_CTRL_BUF_SIZE, GFP_KERNEL); if (dev->ctrl_buf == NULL) { retval = -ENOMEM; goto err_alloc_ctrl_buf; } retval = ipheth_get_macaddr(dev); if (retval) goto err_get_macaddr; retval = ipheth_enable_ncm(dev); if (!retval) { dev->rx_buf_len = IPHETH_RX_BUF_SIZE_NCM; dev->rcvbulk_callback = ipheth_rcvbulk_callback_ncm; } INIT_DELAYED_WORK(&dev->carrier_work, ipheth_carrier_check_work); retval = ipheth_alloc_urbs(dev); if (retval) { dev_err(&intf->dev, "error allocating urbs: %d\n", retval); goto err_alloc_urbs; } usb_set_intfdata(intf, dev); SET_NETDEV_DEV(netdev, &intf->dev); netdev->ethtool_ops = &ops; retval = register_netdev(netdev); if (retval) { dev_err(&intf->dev, "error registering netdev: %d\n", retval); retval = -EIO; goto err_register_netdev; } // carrier down and transmit queues stopped until packet from device netif_carrier_off(netdev); netif_tx_stop_all_queues(netdev); dev_info(&intf->dev, "Apple iPhone USB Ethernet device attached\n"); return 0; err_register_netdev: ipheth_free_urbs(dev); err_alloc_urbs: err_get_macaddr: kfree(dev->ctrl_buf); err_alloc_ctrl_buf: err_endpoints: free_netdev(netdev); return retval; } static void ipheth_disconnect(struct usb_interface *intf) { struct ipheth_device *dev; dev = usb_get_intfdata(intf); if (dev != NULL) { unregister_netdev(dev->net); ipheth_kill_urbs(dev); ipheth_free_urbs(dev); kfree(dev->ctrl_buf); free_netdev(dev->net); } usb_set_intfdata(intf, NULL); dev_info(&intf->dev, "Apple iPhone USB Ethernet now disconnected\n"); } static struct usb_driver ipheth_driver = { .name = "ipheth", .probe = ipheth_probe, .disconnect = ipheth_disconnect, .id_table = ipheth_table, .disable_hub_initiated_lpm = 1, }; module_usb_driver(ipheth_driver); MODULE_AUTHOR("Diego Giagio <diego@giagio.com>"); MODULE_DESCRIPTION("Apple iPhone USB Ethernet driver"); MODULE_LICENSE("Dual BSD/GPL");
1 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Network filesystem support services. * * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See: * * Documentation/filesystems/netfs_library.rst * * for a description of the network filesystem interface declared here. */ #ifndef _LINUX_NETFS_H #define _LINUX_NETFS_H #include <linux/workqueue.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/uio.h> enum netfs_sreq_ref_trace; typedef struct mempool_s mempool_t; /** * folio_start_private_2 - Start an fscache write on a folio. [DEPRECATED] * @folio: The folio. * * Call this function before writing a folio to a local cache. Starting a * second write before the first one finishes is not allowed. * * Note that this should no longer be used. */ static inline void folio_start_private_2(struct folio *folio) { VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio); folio_get(folio); folio_set_private_2(folio); } /* Marks used on xarray-based buffers */ #define NETFS_BUF_PUT_MARK XA_MARK_0 /* - Page needs putting */ #define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ enum netfs_io_source { NETFS_FILL_WITH_ZEROES, NETFS_DOWNLOAD_FROM_SERVER, NETFS_READ_FROM_CACHE, NETFS_INVALID_READ, NETFS_UPLOAD_TO_SERVER, NETFS_WRITE_TO_CACHE, NETFS_INVALID_WRITE, } __mode(byte); typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error, bool was_async); /* * Per-inode context. This wraps the VFS inode. */ struct netfs_inode { struct inode inode; /* The VFS inode */ const struct netfs_request_ops *ops; #if IS_ENABLED(CONFIG_FSCACHE) struct fscache_cookie *cache; #endif struct mutex wb_lock; /* Writeback serialisation */ loff_t remote_i_size; /* Size of the remote file */ loff_t zero_point; /* Size after which we assume there's no data * on the server */ atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ }; /* * A netfs group - for instance a ceph snap. This is marked on dirty pages and * pages marked with a group must be flushed before they can be written under * the domain of another group. */ struct netfs_group { refcount_t ref; void (*free)(struct netfs_group *netfs_group); }; /* * Information about a dirty page (attached only if necessary). * folio->private */ struct netfs_folio { struct netfs_group *netfs_group; /* Filesystem's grouping marker (or NULL). */ unsigned int dirty_offset; /* Write-streaming dirty data offset */ unsigned int dirty_len; /* Write-streaming dirty data length */ }; #define NETFS_FOLIO_INFO 0x1UL /* OR'd with folio->private. */ #define NETFS_FOLIO_COPY_TO_CACHE ((struct netfs_group *)0x356UL) /* Write to the cache only */ static inline bool netfs_is_folio_info(const void *priv) { return (unsigned long)priv & NETFS_FOLIO_INFO; } static inline struct netfs_folio *__netfs_folio_info(const void *priv) { if (netfs_is_folio_info(priv)) return (struct netfs_folio *)((unsigned long)priv & ~NETFS_FOLIO_INFO); return NULL; } static inline struct netfs_folio *netfs_folio_info(struct folio *folio) { return __netfs_folio_info(folio_get_private(folio)); } static inline struct netfs_group *netfs_folio_group(struct folio *folio) { struct netfs_folio *finfo; void *priv = folio_get_private(folio); finfo = netfs_folio_info(folio); if (finfo) return finfo->netfs_group; return priv; } /* * Stream of I/O subrequests going to a particular destination, such as the * server or the local cache. This is mainly intended for writing where we may * have to write to multiple destinations concurrently. */ struct netfs_io_stream { /* Submission tracking */ struct netfs_io_subrequest *construct; /* Op being constructed */ unsigned int submit_off; /* Folio offset we're submitting from */ unsigned int submit_len; /* Amount of data left to submit */ unsigned int submit_max_len; /* Amount I/O can be rounded up to */ void (*prepare_write)(struct netfs_io_subrequest *subreq); void (*issue_write)(struct netfs_io_subrequest *subreq); /* Collection tracking */ struct list_head subrequests; /* Contributory I/O operations */ struct netfs_io_subrequest *front; /* Op being collected */ unsigned long long collected_to; /* Position we've collected results to */ size_t transferred; /* The amount transferred from this stream */ enum netfs_io_source source; /* Where to read from/write to */ unsigned short error; /* Aggregate error for the stream */ unsigned char stream_nr; /* Index of stream in parent table */ bool avail; /* T if stream is available */ bool active; /* T if stream is active */ bool need_retry; /* T if this stream needs retrying */ bool failed; /* T if this stream failed */ }; /* * Resources required to do operations on a cache. */ struct netfs_cache_resources { const struct netfs_cache_ops *ops; void *cache_priv; void *cache_priv2; unsigned int debug_id; /* Cookie debug ID */ unsigned int inval_counter; /* object->inval_counter at begin_op */ }; /* * Descriptor for a single component subrequest. Each operation represents an * individual read/write from/to a server, a cache, a journal, etc.. * * The buffer iterator is persistent for the life of the subrequest struct and * the pages it points to can be relied on to exist for the duration. */ struct netfs_io_subrequest { struct netfs_io_request *rreq; /* Supervising I/O request */ struct work_struct work; struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ unsigned long long start; /* Where to start the I/O */ size_t max_len; /* Maximum size of the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ #define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ #define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ #define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_COPY_TO_CACHE, /* This write is to copy a read to the cache */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_DIO_WRITE, /* This is a direct I/O write */ nr__netfs_io_origin } __mode(byte); /* * Descriptor for an I/O helper request. This is used to make multiple I/O * operations to a variety of data stores and then stitch the result together. */ struct netfs_io_request { union { struct work_struct work; struct rcu_head rcu; }; struct inode *inode; /* The file being accessed */ struct address_space *mapping; /* The mapping being accessed */ struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ struct iov_iter iter; /* Unencrypted-side iterator */ struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; unsigned int rsize; /* Maximum read size (0 for none) */ unsigned int wsize; /* Maximum write size (0 for none) */ atomic_t subreq_counter; /* Next subreq->debug_index */ unsigned int nr_group_rel; /* Number of refs to release on ->group */ spinlock_t lock; /* Lock for queuing subreqs */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ size_t upper_len; /* Length can be extended to here */ unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ unsigned long long contiguity; /* Tracking for gaps in the writeback sequence */ unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ refcount_t ref; unsigned long flags; #define NETFS_RREQ_INCOMPLETE_IO 0 /* Some ioreqs terminated short or with error */ #define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; void (*cleanup)(struct netfs_io_request *req); }; /* * Operations the network filesystem can/must provide to the helpers. */ struct netfs_request_ops { mempool_t *request_pool; mempool_t *subrequest_pool; int (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); void (*free_subrequest)(struct netfs_io_subrequest *rreq); /* Read request handling */ void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); bool (*is_still_valid)(struct netfs_io_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, struct folio **foliop, void **_fsdata); void (*done)(struct netfs_io_request *rreq); /* Modification handling */ void (*update_i_size)(struct inode *inode, loff_t i_size); void (*post_modify)(struct inode *inode); /* Write request handling */ void (*begin_writeback)(struct netfs_io_request *wreq); void (*prepare_write)(struct netfs_io_subrequest *subreq); void (*issue_write)(struct netfs_io_subrequest *subreq); void (*retry_request)(struct netfs_io_request *wreq, struct netfs_io_stream *stream); void (*invalidate_cache)(struct netfs_io_request *wreq); }; /* * How to handle reading from a hole. */ enum netfs_read_from_hole { NETFS_READ_HOLE_IGNORE, NETFS_READ_HOLE_CLEAR, NETFS_READ_HOLE_FAIL, }; /* * Table of operations for access to a cache. */ struct netfs_cache_ops { /* End an operation */ void (*end_operation)(struct netfs_cache_resources *cres); /* Read data from the cache */ int (*read)(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, enum netfs_read_from_hole read_hole, netfs_io_terminated_t term_func, void *term_func_priv); /* Write data to the cache */ int (*write)(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, netfs_io_terminated_t term_func, void *term_func_priv); /* Write data to the cache from a netfs subrequest. */ void (*issue_write)(struct netfs_io_subrequest *subreq); /* Expand readahead request */ void (*expand_readahead)(struct netfs_cache_resources *cres, unsigned long long *_start, unsigned long long *_len, unsigned long long i_size); /* Prepare a read operation, shortening it to a cached/uncached * boundary as appropriate. */ enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq, unsigned long long i_size); /* Prepare a write subrequest, working out if we're allowed to do it * and finding out the maximum amount of data to gather before * attempting to submit. If we're not permitted to do it, the * subrequest should be marked failed. */ void (*prepare_write_subreq)(struct netfs_io_subrequest *subreq); /* Prepare a write operation, working out what part of the write we can * actually do. */ int (*prepare_write)(struct netfs_cache_resources *cres, loff_t *_start, size_t *_len, size_t upper_len, loff_t i_size, bool no_space_allocated_yet); /* Prepare an on-demand read operation, shortening it to a cached/uncached * boundary as appropriate. */ enum netfs_io_source (*prepare_ondemand_read)(struct netfs_cache_resources *cres, loff_t start, size_t *_len, loff_t i_size, unsigned long *_flags, ino_t ino); /* Query the occupancy of the cache in a region, returning where the * next chunk of data starts and how long it is. */ int (*query_occupancy)(struct netfs_cache_resources *cres, loff_t start, size_t len, size_t granularity, loff_t *_data_start, size_t *_data_len); }; /* High-level read API. */ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter); ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, struct netfs_group *netfs_group); ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from); ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from); /* Address operations API */ struct readahead_control; void netfs_readahead(struct readahead_control *); int netfs_read_folio(struct file *, struct folio *); int netfs_write_begin(struct netfs_inode *, struct file *, struct address_space *, loff_t pos, unsigned int len, struct folio **, void **fsdata); int netfs_writepages(struct address_space *mapping, struct writeback_control *wbc); bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool netfs_release_folio(struct folio *folio, gfp_t gfp); /* VMA operations API. */ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. */ void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, enum netfs_sreq_ref_trace what); ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, iov_iter_extraction_t extraction_flags); size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs); void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq); void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, bool was_async); void netfs_queue_write_request(struct netfs_io_subrequest *subreq); int netfs_start_io_read(struct inode *inode); void netfs_end_io_read(struct inode *inode); int netfs_start_io_write(struct inode *inode); void netfs_end_io_write(struct inode *inode); int netfs_start_io_direct(struct inode *inode); void netfs_end_io_direct(struct inode *inode); /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query * * Get the netfs lib inode context from the network filesystem's inode. The * context struct is expected to directly follow on from the VFS inode struct. */ static inline struct netfs_inode *netfs_inode(struct inode *inode) { return container_of(inode, struct netfs_inode, inode); } /** * netfs_inode_init - Initialise a netfslib inode context * @ctx: The netfs inode to initialise * @ops: The netfs's operations list * @use_zero_point: True to use the zero_point read optimisation * * Initialise the netfs library context struct. This is expected to follow on * directly from the VFS inode struct. */ static inline void netfs_inode_init(struct netfs_inode *ctx, const struct netfs_request_ops *ops, bool use_zero_point) { ctx->ops = ops; ctx->remote_i_size = i_size_read(&ctx->inode); ctx->zero_point = LLONG_MAX; ctx->flags = 0; atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif mutex_init(&ctx->wb_lock); /* ->releasepage() drives zero_point */ if (use_zero_point) { ctx->zero_point = ctx->remote_i_size; mapping_set_release_always(ctx->inode.i_mapping); } } /** * netfs_resize_file - Note that a file got resized * @ctx: The netfs inode being resized * @new_i_size: The new file size * @changed_on_server: The change was applied to the server * * Inform the netfs lib that a file got resized so that it can adjust its state. */ static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size, bool changed_on_server) { if (changed_on_server) ctx->remote_i_size = new_i_size; if (new_i_size < ctx->zero_point) ctx->zero_point = new_i_size; } /** * netfs_i_cookie - Get the cache cookie from the inode * @ctx: The netfs inode to query * * Get the caching cookie (if enabled) from the network filesystem's inode. */ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) { #if IS_ENABLED(CONFIG_FSCACHE) return ctx->cache; #else return NULL; #endif } /** * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete * @inode: The netfs inode to wait on * * Wait for outstanding I/O requests of any type to complete. This is intended * to be called from inode eviction routines. This makes sure that any * resources held by those requests are cleaned up before we let the inode get * cleaned up. */ static inline void netfs_wait_for_outstanding_io(struct inode *inode) { struct netfs_inode *ictx = netfs_inode(inode); wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0); } #endif /* _LINUX_NETFS_H */
80 80 43 43 1 48 6 6 6 6 6 5 5 3 9 48 48 11 48